CUB block reduce a custom vector type

I am trying to sum a few floats across a thread block using the CUB library in CUDA. I know how to reduce a single float (using a 2D thread block in this case):

float sum = globalInput[threadIdx.x + threadIdx.y * blockDim.x];

typedef cub::BlockReduce<float, THREAD_BLOCK_X, cub::BLOCK_REDUCE_RAKING, THREAD_BLOCK_Y> BlockReduce;

// Allocate shared memory for BlockReduce
__shared__ typename BlockReduce::TempStorage temp_storage;
  
// Compute the block-wide sum for thread0
float aggregate = BlockReduce(temp_storage).Sum(sum);

Is it possible to generalize this algorithm to work with vector types such as float2, float4, etc.? I suppose I could reduce one float at a time, but that would involve multiple underlying __syncthreads() that wouldn’t be necessary in a composite reduction.


Solution:

CUDA doesn’t natively provide arithmetic operators for its "built-in" vector types, so essentially all that is needed to extend the basic example to e.g. float2 is an appropriate addition operator:

$ cat t2259.cu
#include <cub/cub.cuh>
// Component-wise addition for float2; cub::Sum (used by BlockReduce::Sum)
// picks this up via the generic operator+
__host__ __device__
float2 operator+(const float2 a, const float2 b){
  float2 result;
  result.x = a.x+b.x;
  result.y = a.y+b.y;
  return result;
}
__global__ void ExampleKernel()
{
    // Specialize BlockReduce for a 1D block of 128 threads of type float2
    typedef cub::BlockReduce<float2, 128> BlockReduce;
    // Allocate shared memory for BlockReduce
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // Obtain a segment of consecutive items that are blocked across threads
    float2 thread_data[4];
    thread_data[0].x = 1.0f;
    thread_data[0].y = 2.0f;
    thread_data[1].x = 2.0f;
    thread_data[1].y = 3.0f;
    thread_data[2].x = 1.0f;
    thread_data[2].y = 2.0f;
    thread_data[3].x = 2.0f;
    thread_data[3].y = 3.0f;

    // Compute the block-wide sum for thread0
    float2 aggregate = BlockReduce(temp_storage).Sum(thread_data);
    if (!threadIdx.x) printf("aggregate.x: %f .y: %f\n", aggregate.x, aggregate.y);
}


int main(){

  ExampleKernel<<<1,128>>>();
  cudaDeviceSynchronize();
}
$ nvcc -o t2259 t2259.cu
$ compute-sanitizer ./t2259
========= COMPUTE-SANITIZER
aggregate.x: 768.000000 .y: 1280.000000
========= ERROR SUMMARY: 0 errors
$
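The same pattern should extend to float4 (or a user-defined struct): define a matching component-wise operator+ and specialize cub::BlockReduce on that type. The following is an untested sketch along the same lines as the example above; the kernel name and input values are illustrative, not part of the original answer:

```cuda
#include <cub/cub.cuh>
#include <cstdio>

// Component-wise addition for float4, analogous to the float2 version
__host__ __device__
float4 operator+(const float4 a, const float4 b){
    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}

__global__ void ExampleKernelF4()
{
    // Specialize BlockReduce for a 1D block of 128 threads of type float4
    typedef cub::BlockReduce<float4, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // One item per thread this time
    float4 thread_data = make_float4(1.0f, 2.0f, 3.0f, 4.0f);

    // Block-wide sum; the result is only valid in thread 0
    float4 aggregate = BlockReduce(temp_storage).Sum(thread_data);
    if (threadIdx.x == 0)
        printf("x: %f y: %f z: %f w: %f\n",
               aggregate.x, aggregate.y, aggregate.z, aggregate.w);
}

int main(){
    ExampleKernelF4<<<1,128>>>();
    cudaDeviceSynchronize();
}
```

With 128 threads each contributing (1, 2, 3, 4), the block sum should come out to (128, 256, 384, 512). Note that a composite reduction like this costs one BlockReduce invocation (one set of barriers) rather than one per component.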