Follow

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use
Contact

Performance gap: tiled vs. naive matrix multiplication in CUDA C++

I’m trying out the difference between a tiled and a naive matrix-multiplication implementation in CUDA C++. I expected a large performance gap between these variants because the tiled version repeatedly reuses data through shared memory. However, the tiled kernel was only about twice as fast (naive ~12 ms, tiled ~6 ms). Here are the code snippets:

#include <iostream>
#include <assert.h>
using namespace std;

// Problem size and tiling parameters.
#define N 1024       // matrix dimension (N x N)
#define THREADS 16   // threads per block edge (16 x 16 = 256 threads/block)

// Row-major 2D -> 1D index. Arguments and the result are fully
// parenthesized so expression arguments expand correctly:
// IDX(blk + tx, y, n) must yield ((blk + tx) * n + y), not
// (blk + tx * n + y) as the unparenthesized form did.
#define IDX(x, y, s) (((x) * (s)) + (y))

// Wrap every CUDA runtime call; on failure report file/line and (by default) abort.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   // Success is the common case: bail out early and keep the hot path cheap.
   if (code == cudaSuccess) return;
   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort) exit(code);
}

// Fill both n-element arrays with pseudo-random integers in [-256, 256].
void init_values(int *a, int *b, int sz) {
    for (int idx = 0; idx < sz; ++idx) {
        // Same draw order as before: a's value first, then b's.
        int ra = rand() % 513;
        int rb = rand() % 513;
        a[idx] = ra - 256;
        b[idx] = rb - 256;
    }
}

// Naive matrix multiply: one thread computes one element c[x][y] of the
// n x n row-major product c = a * b (all three are device pointers).
// Launch layout: 2D grid of 2D blocks covering at least n x n threads.
// NOTE(review): adjacent lanes of a warp differ in threadIdx.x, i.e. in x,
// so the a[x*n + i] loads are strided by n (uncoalesced) — presumably the
// main reason this kernel is slow; confirm with a profiler.
__global__
void matmul(int *a, int *b, int *c, int n) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Guard so the kernel stays correct even when the grid overshoots n
    // (e.g. n not a multiple of the block edge).
    if (x >= n || y >= n) return;
    int t = 0;
    for (int i = 0; i < n; i++) {
        t += a[IDX(x, i, n)] * b[IDX(i, y, n)];
    }
    c[IDX(x, y, n)] = t;
}

// CPU reference check: recompute every element of a*b on the host and
// assert it matches the device result in c (n x n row-major, exact
// integer equality).
void matmul_verify(int *a, int *b, int *c, int n) {
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++) {
            int expected = 0;
            for (int k = 0; k < n; k++) {
                expected += a[IDX(row, k, n)] * b[IDX(k, col, n)];
            }
            assert(c[IDX(row, col, n)] == expected);
        }
    }
}

// Host driver: allocate host/device buffers, run the kernel once, time it
// with CUDA events, then verify the result against a CPU reference.
int main()
{
    int *a, *b, *c;     // host buffers
    int *da, *db, *dc;  // device buffers
    size_t sz = N * N * sizeof(int);

    a = (int*)malloc(sz);
    b = (int*)malloc(sz);
    c = (int*)malloc(sz);
    // malloc can fail for large N; don't dereference null below.
    if (!a || !b || !c) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    init_values(a, b, N*N);

    gpuErrchk(cudaMalloc((void**)&da, sz));
    gpuErrchk(cudaMalloc((void**)&db, sz));
    gpuErrchk(cudaMalloc((void**)&dc, sz));

    gpuErrchk(cudaMemcpy(da, a, sz, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(db, b, sz, cudaMemcpyHostToDevice));

    // N is a multiple of THREADS, so this grid tiles the matrix exactly.
    dim3 grids(N/THREADS, N/THREADS);
    dim3 blocks(THREADS, THREADS);

    // Time the kernel with events. Every CUDA API call is error-checked;
    // launch-configuration errors are caught immediately after the launch.
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    gpuErrchk(cudaEventRecord(start));
    matmul<<<grids, blocks>>>(da, db, dc, N);
    gpuErrchk(cudaGetLastError());          // bad launch config surfaces here
    gpuErrchk(cudaEventRecord(stop));
    gpuErrchk(cudaEventSynchronize(stop));  // blocks until the kernel finishes
    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));
    cout << "Took " << milliseconds << " milliseconds.\n";

    // Async execution errors (e.g. illegal address) surface at this sync.
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(c, dc, sz, cudaMemcpyDeviceToHost));
    matmul_verify(a, b, c, N);

    gpuErrchk(cudaFree(da));
    gpuErrchk(cudaFree(db));
    gpuErrchk(cudaFree(dc));
    free(a);
    free(b);
    free(c);

    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));

    return 0;
}

and for the tiled implementation, I change the kernel as

// Tiled matrix multiply using shared memory: each block computes a
// THREADS x THREADS tile of c = a * b (n x n, row-major, device pointers).
// Launch layout: blockDim must be (THREADS, THREADS) and n a multiple of
// THREADS so every tile is full (there is no partial-tile guard).
// SHMEM_SIZE is defined elsewhere in this file; it must be at least
// THREADS * THREADS ints — TODO confirm.
__global__
void matmul(int *a, int *b, int *c, int n) {
    int ty = threadIdx.y, by = blockIdx.y;
    int tx = threadIdx.x, bx = blockIdx.x;
    int x = bx * blockDim.x + tx;  // row of c owned by this thread
    int y = by * blockDim.y + ty;  // column of c owned by this thread

    // Per-block staging tiles of a and b.
    __shared__ int A[SHMEM_SIZE];
    __shared__ int B[SHMEM_SIZE];

    const int tile_size = THREADS;
    int res = 0;

    // March the pair of tiles along the shared dimension.
    for(int blk=0; blk < n; blk+=tile_size)  {
        // BUG FIX: the expression arguments are parenthesized so that
        // IDX(blk + tx, y, n) cannot expand to (blk + tx*n + y) under an
        // unparenthesized IDX macro; the intended index is ((blk+tx)*n + y).
        A[IDX(tx, ty, tile_size)] = a[IDX(x, (blk + ty), n)];
        B[IDX(tx, ty, tile_size)] = b[IDX((blk + tx), y, n)];
        __syncthreads();  // tiles fully written before any thread reads them

        for(int k=0; k<tile_size; k++) {
            res += (A[IDX(tx, k, tile_size)] * B[IDX(k, ty, tile_size)]);
        }
        __syncthreads();  // done reading before the next iteration overwrites
    }

    c[IDX(x, y, n)] = res;
}

nothing else really changes. However, in the tiled implementation, if I simply change

    int ty = threadIdx.x, by = blockIdx.x;
    int tx = threadIdx.y, bx = blockIdx.y;

for the initialization of thread and block indices, I get about a ~1ms runtime (12x speedup). How is this happening? I read from the book "CUDA By Example" that the thread and block indices in 2 dimensions are just for programmer convenience and do not reflect any difference in performance. This seems to be false. Any clarification is really appreciated.

>Solution :

CUDA thread blocks are partitioned into warps of 32 threads. Ideally the neighboring lanes of a warp should always load neighboring elements from global memory. This is called coalescing and allows for maximum memory bandwidth (at least when the loads are sized right; one can try to use the builtin vector types to get bigger loads for optimization, e.g. int2, int4, float2, etc.). In hardware all the coalesced loads from a warp will be bundled into a minimal number of memory transactions.

The mapping from 3D threadIdx to warp lanes always takes the first dimension .x as the continuous dimension, i.e. a block of dimensions (32, 2, 1) will have one warp with threadIdx.y == 0 and one warp with threadIdx.y == 1 where the lanes of each warp correspond to threadIdx.x.

Therefore to allow for coalescing, you have to access memory as

A[ty * s + tx] // coalesced access

instead of

A[tx * s + ty] // strided access

to achieve optimal performance.

What is probably meant in the book you mentioned is that there shouldn’t be a performance difference between launching a grid of (32, 2, 1) blocks and a grid of (64, 1, 1) blocks while manually getting ty = threadIdx.x / 32 and tx = threadIdx.x % 32. This is probably what happens internally when having a block that is not flat in the first place.

Add a comment

Leave a Reply

Keep Up to Date with the Most Important News

By pressing the Subscribe button, you confirm that you have read and are agreeing to our Privacy Policy and Terms of Use

Discover more from Dev solutions

Subscribe now to keep reading and get access to the full archive.

Continue reading