I am trying to sum a few floats across a thread block using the CUB library in CUDA. I know how to reduce a single float (using a 2D thread block in this case):
float sum = globalInput[threadIdx.x + threadIdx.y * blockDim.x];
typedef cub::BlockReduce<float, THREAD_BLOCK_X, cub::BLOCK_REDUCE_RAKING, THREAD_BLOCK_Y> BlockReduce;
// Allocate shared memory for BlockReduce
__shared__ typename BlockReduce::TempStorage temp_storage;
// Compute the block-wide sum for thread0
float aggregate = BlockReduce(temp_storage).Sum(sum);
Is it possible to generalize this algorithm to work with vector types such as float2, float4, etc.? I suppose I could reduce one float at a time, but that would involve multiple underlying __syncthreads() that wouldn’t be necessary in a composite reduction.
>Solution :
CUDA doesn’t natively provide operators for the "built-in" vector types. So pretty much the only thing needed to extend the basic example for e.g. float2 is to provide an appropriate reduction operator:
$ cat t2259.cu
#include <cub/cub.cuh>
__host__ __device__
float2 operator+(const float2 a, const float2 b){
float2 result;
result.x = a.x+b.x;
result.y = a.y+b.y;
return result;
}
__global__ void ExampleKernel()
{
// Specialize BlockReduce for a 1D block of 128 threads of type float2
typedef cub::BlockReduce<float2, 128> BlockReduce;
// Allocate shared memory for BlockReduce
__shared__ typename BlockReduce::TempStorage temp_storage;
// Obtain a segment of consecutive items that are blocked across threads
float2 thread_data[4];
thread_data[0].x = 1.0f;
thread_data[0].y = 2.0f;
thread_data[1].x = 2.0f;
thread_data[1].y = 3.0f;
thread_data[2].x = 1.0f;
thread_data[2].y = 2.0f;
thread_data[3].x = 2.0f;
thread_data[3].y = 3.0f;
// Compute the block-wide sum for thread0
float2 aggregate = BlockReduce(temp_storage).Sum(thread_data);
if (!threadIdx.x) printf("aggregate.x: %f .y: %f\n", aggregate.x, aggregate.y);
}
int main(){
ExampleKernel<<<1,128>>>();
cudaDeviceSynchronize();
}
$ nvcc -o t2259 t2259.cu
$ compute-sanitizer ./t2259
========= COMPUTE-SANITIZER
aggregate.x: 768.000000 .y: 1280.000000
========= ERROR SUMMARY: 0 errors
$