Follow

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use
Contact

Performance gap: tiled vs. naive matrix multiplication in CUDA C++

I’m trying out the difference between a tiled and a naive matrix-multiplication implementation in CUDA C++. I expected a large performance gap between these variants because the tiled version repeatedly reuses data through shared memory. However, the tiled kernel was only about twice as fast (naive ~12 ms, tiled ~6 ms). Here are the code snippets:

#include <iostream>
#include <assert.h>
using namespace std;

// Problem size and tiling parameters.
#define N 1024       // matrix dimension (N x N)
#define THREADS 16   // threads per block edge (16 x 16 = 256 threads/block)

// Row-major 2D -> 1D index. Arguments and the result are fully
// parenthesized so expression arguments expand correctly:
// IDX(blk + tx, y, n) must yield ((blk + tx) * n + y), not
// (blk + tx * n + y) as the unparenthesized form did.
#define IDX(x, y, s) (((x) * (s)) + (y))

// Wrap every CUDA runtime call; on failure report file/line and (by default) abort.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   // Success is the common case: bail out early and keep the hot path cheap.
   if (code == cudaSuccess) return;
   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort) exit(code);
}

// Fill both n-element arrays with pseudo-random integers in [-256, 256].
void init_values(int *a, int *b, int sz) {
    for (int idx = 0; idx < sz; ++idx) {
        // Same draw order as before: a's value first, then b's.
        int ra = rand() % 513;
        int rb = rand() % 513;
        a[idx] = ra - 256;
        b[idx] = rb - 256;
    }
}

// Naive matrix multiply: one thread computes one element c[x][y] of the
// n x n row-major product c = a * b (all three are device pointers).
// Launch layout: 2D grid of 2D blocks covering at least n x n threads.
// NOTE(review): adjacent lanes of a warp differ in threadIdx.x, i.e. in x,
// so the a[x*n + i] loads are strided by n (uncoalesced) — presumably the
// main reason this kernel is slow; confirm with a profiler.
__global__
void matmul(int *a, int *b, int *c, int n) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Guard so the kernel stays correct even when the grid overshoots n
    // (e.g. n not a multiple of the block edge).
    if (x >= n || y >= n) return;
    int t = 0;
    for (int i = 0; i < n; i++) {
        t += a[IDX(x, i, n)] * b[IDX(i, y, n)];
    }
    c[IDX(x, y, n)] = t;
}

// CPU reference check: recompute every element of a*b on the host and
// assert it matches the device result in c (n x n row-major, exact
// integer equality).
void matmul_verify(int *a, int *b, int *c, int n) {
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++) {
            int expected = 0;
            for (int k = 0; k < n; k++) {
                expected += a[IDX(row, k, n)] * b[IDX(k, col, n)];
            }
            assert(c[IDX(row, col, n)] == expected);
        }
    }
}

// Host driver: allocate host/device buffers, run the kernel once, time it
// with CUDA events, then verify the result against a CPU reference.
int main()
{
    int *a, *b, *c;     // host buffers
    int *da, *db, *dc;  // device buffers
    size_t sz = N * N * sizeof(int);

    a = (int*)malloc(sz);
    b = (int*)malloc(sz);
    c = (int*)malloc(sz);
    // malloc can fail for large N; don't dereference null below.
    if (!a || !b || !c) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    init_values(a, b, N*N);

    gpuErrchk(cudaMalloc((void**)&da, sz));
    gpuErrchk(cudaMalloc((void**)&db, sz));
    gpuErrchk(cudaMalloc((void**)&dc, sz));

    gpuErrchk(cudaMemcpy(da, a, sz, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(db, b, sz, cudaMemcpyHostToDevice));

    // N is a multiple of THREADS, so this grid tiles the matrix exactly.
    dim3 grids(N/THREADS, N/THREADS);
    dim3 blocks(THREADS, THREADS);

    // Time the kernel with events. Every CUDA API call is error-checked;
    // launch-configuration errors are caught immediately after the launch.
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    gpuErrchk(cudaEventRecord(start));
    matmul<<<grids, blocks>>>(da, db, dc, N);
    gpuErrchk(cudaGetLastError());          // bad launch config surfaces here
    gpuErrchk(cudaEventRecord(stop));
    gpuErrchk(cudaEventSynchronize(stop));  // blocks until the kernel finishes
    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));
    cout << "Took " << milliseconds << " milliseconds.\n";

    // Async execution errors (e.g. illegal address) surface at this sync.
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(c, dc, sz, cudaMemcpyDeviceToHost));
    matmul_verify(a, b, c, N);

    gpuErrchk(cudaFree(da));
    gpuErrchk(cudaFree(db));
    gpuErrchk(cudaFree(dc));
    free(a);
    free(b);
    free(c);

    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));

    return 0;
}

and for the tiled implementation, I change the kernel as

// Tiled matrix multiply using shared memory: each block computes a
// THREADS x THREADS tile of c = a * b (n x n, row-major, device pointers).
// Launch layout: blockDim must be (THREADS, THREADS) and n a multiple of
// THREADS so every tile is full (there is no partial-tile guard).
// SHMEM_SIZE is defined elsewhere in this file; it must be at least
// THREADS * THREADS ints — TODO confirm.
__global__
void matmul(int *a, int *b, int *c, int n) {
    int ty = threadIdx.y, by = blockIdx.y;
    int tx = threadIdx.x, bx = blockIdx.x;
    int x = bx * blockDim.x + tx;  // row of c owned by this thread
    int y = by * blockDim.y + ty;  // column of c owned by this thread

    // Per-block staging tiles of a and b.
    __shared__ int A[SHMEM_SIZE];
    __shared__ int B[SHMEM_SIZE];

    const int tile_size = THREADS;
    int res = 0;

    // March the pair of tiles along the shared dimension.
    for(int blk=0; blk < n; blk+=tile_size)  {
        // BUG FIX: the expression arguments are parenthesized so that
        // IDX(blk + tx, y, n) cannot expand to (blk + tx*n + y) under an
        // unparenthesized IDX macro; the intended index is ((blk+tx)*n + y).
        A[IDX(tx, ty, tile_size)] = a[IDX(x, (blk + ty), n)];
        B[IDX(tx, ty, tile_size)] = b[IDX((blk + tx), y, n)];
        __syncthreads();  // tiles fully written before any thread reads them

        for(int k=0; k<tile_size; k++) {
            res += (A[IDX(tx, k, tile_size)] * B[IDX(k, ty, tile_size)]);
        }
        __syncthreads();  // done reading before the next iteration overwrites
    }

    c[IDX(x, y, n)] = res;
}

nothing else really changes. However, in the tiled implementation, if I simply change

    int ty = threadIdx.x, by = blockIdx.x;
    int tx = threadIdx.y, bx = blockIdx.y;

for the initialization of thread and block indices, I get about a ~1ms runtime (12x speedup). How is this happening? I read from the book "CUDA By Example" that the thread and block indices in 2 dimensions are just for programmer convenience and do not reflect any difference in performance. This seems to be false. Any clarification is really appreciated.

>Solution :

CUDA thread blocks are partitioned into warps of 32 threads. Ideally the neighboring lanes of a warp should always load neighboring elements from global memory. This is called coalescing and allows for maximum memory bandwidth (at least when the loads are sized right; one can try to use the builtin vector types to get bigger loads for optimization, e.g. int2, int4, float2, etc.). In hardware all the coalesced loads from a warp will be bundled into a minimal number of memory transactions.

The mapping from 3D threadIdx to warp lanes always takes the first dimension .x as the continuous dimension, i.e. a block of dimensions (32, 2, 1) will have one warp with threadIdx.y == 0 and one warp with threadIdx.y == 1 where the lanes of each warp correspond to threadIdx.x.

Therefore to allow for coalescing, you have to access memory as

A[ty * s + tx] // coalesced access

instead of

A[tx * s + ty] // strided access

to achieve optimal performance.

What is probably meant in the book you mentioned is that there shouldn’t be a performance difference between launching a grid of (32, 2, 1) blocks and a grid of (64, 1, 1) blocks while manually getting ty = threadIdx.x / 32 and tx = threadIdx.x % 32. This is probably what happens internally when having a block that is not flat in the first place.

Add a comment

Leave a Reply

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use

Discover more from Dev solutions

Subscribe now to keep reading and get access to the full archive.

Continue reading