I implemented a kernel that performs a single-pass scan, as proposed in the book Programming Massively Parallel Processors:
__global__
void SinglePassKoggeStoneScan(const unsigned int *input, unsigned int *output,
                              const unsigned int length, unsigned int *flags,
                              unsigned int *scanValue, unsigned int *blockCounter) {
    __shared__ unsigned int bid_s;
    __shared__ unsigned int XY[SECTION_SIZE];
    // Dynamic block indexing: blocks receive their index in scheduling order,
    // which keeps the cross-block wait loop below deadlock-free.
    if (threadIdx.x == 0) {
        bid_s = atomicAdd(blockCounter, 1);
    }
    __syncthreads();
    const unsigned int bid = bid_s;
    const unsigned int idx = bid * blockDim.x + threadIdx.x;
    XY[threadIdx.x] = (idx < length) ? input[idx] : 0;
    __syncthreads();
    // Kogge-Stone inclusive scan over the block's section.
    for (unsigned int stride = 1; stride < SECTION_SIZE; stride *= 2) {
        __syncthreads();
        unsigned int tmp = 0; // was float, which silently converted the unsigned sums
        if (threadIdx.x >= stride) {
            tmp = XY[threadIdx.x] + XY[threadIdx.x - stride];
        }
        __syncthreads();
        if (threadIdx.x >= stride) {
            XY[threadIdx.x] = tmp;
        }
    }
    __syncthreads();
    __shared__ unsigned int previousSum;
    if (threadIdx.x == 0) {
        while (bid >= 1 && atomicAdd(&flags[bid], 0) == 0) {} // Wait for the predecessor block's sum
        previousSum = scanValue[bid]; // for bid == 0 this reads scanValue[0], which the host must zero
        scanValue[bid + 1] = XY[blockDim.x - 1] + previousSum;
        __threadfence();
        atomicAdd(&flags[bid + 1], 1);
    }
    __syncthreads();
    if (idx < length) {
        output[idx] = XY[threadIdx.x] + previousSum;
    }
}
I would like to extend this kernel so that it performs a scan on each row of a matrix independently.
For now, I have a naive solution that simply launches the kernel once per row:
#define SECTION_SIZE 1024

unsigned int input[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
const unsigned int width = 3;
const unsigned int height = 3;
unsigned int *output = new unsigned int[width * height];
unsigned int *deviceInput, *deviceOutput, *flags, *scanValue, *blockCounter;
const size_t imageSize = width * height * sizeof(unsigned int);
const unsigned int scanValueNum = (width + SECTION_SIZE - 1) / SECTION_SIZE;
// +1 because the kernel writes flags[bid + 1] and scanValue[bid + 1] for the last block.
const size_t scanValueSize = (scanValueNum + 1) * sizeof(unsigned int);
cudaMalloc(reinterpret_cast<void**>(&deviceInput), imageSize);
cudaMalloc(reinterpret_cast<void**>(&deviceOutput), imageSize);
cudaMalloc(reinterpret_cast<void**>(&flags), scanValueSize);
cudaMalloc(reinterpret_cast<void**>(&scanValue), scanValueSize);
cudaMalloc(reinterpret_cast<void**>(&blockCounter), sizeof(unsigned int));
cudaMemcpy(deviceInput, input, imageSize, cudaMemcpyHostToDevice);
dim3 blockDim(SECTION_SIZE);
dim3 gridDim(scanValueNum); // was scanValueSize: the grid takes a block count, not a byte count
for (unsigned int i = 0; i < height; i++) {
    cudaMemset(flags, 0, scanValueSize);
    cudaMemset(scanValue, 0, scanValueSize); // block 0 reads scanValue[0], so it must start at 0
    cudaMemset(blockCounter, 0, sizeof(unsigned int));
    SinglePassKoggeStoneScan<<<gridDim, blockDim>>>(deviceInput + i * width, deviceOutput + i * width,
                                                    width, flags, scanValue, blockCounter);
}
cudaMemcpy(output, deviceOutput, imageSize, cudaMemcpyDeviceToHost);
cudaFree(deviceInput);
cudaFree(deviceOutput);
cudaFree(flags);
cudaFree(scanValue);
cudaFree(blockCounter);
delete[] output;
However, I am having trouble adapting the kernel so that a single launch handles all rows directly. Any help or guidance would be greatly appreciated. Thanks in advance!
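One way to adapt it in a single launch is the following sketch (untested at scale; SinglePassKoggeStoneScanRows is a name I made up): keep the single-pass scheme along x, launch a 2D grid with gridDim.y == height, and give every row its own slice of flags/scanValue plus its own blockCounter entry, so the domino chains of different rows never interact. The dynamic block index is only needed along the scan direction; blockIdx.y can address the row directly because rows never wait on each other.

__global__
void SinglePassKoggeStoneScanRows(const unsigned int *input, unsigned int *output,
                                  const unsigned int width, unsigned int *flags,
                                  unsigned int *scanValue, unsigned int *blockCounter) {
    const unsigned int row = blockIdx.y; // rows are independent, so blockIdx.y is safe here
    // Each row owns gridDim.x + 1 bookkeeping slots and one block counter.
    unsigned int *rowFlags = flags + row * (gridDim.x + 1);
    unsigned int *rowScanValue = scanValue + row * (gridDim.x + 1);

    __shared__ unsigned int bid_s;
    __shared__ unsigned int XY[SECTION_SIZE];
    if (threadIdx.x == 0) {
        bid_s = atomicAdd(&blockCounter[row], 1); // dynamic block index within this row only
    }
    __syncthreads();
    const unsigned int bid = bid_s;
    const unsigned int col = bid * blockDim.x + threadIdx.x;
    XY[threadIdx.x] = (col < width) ? input[row * width + col] : 0;
    __syncthreads();
    // Kogge-Stone inclusive scan over this block's section of the row.
    for (unsigned int stride = 1; stride < SECTION_SIZE; stride *= 2) {
        unsigned int tmp = 0;
        if (threadIdx.x >= stride) {
            tmp = XY[threadIdx.x] + XY[threadIdx.x - stride];
        }
        __syncthreads();
        if (threadIdx.x >= stride) {
            XY[threadIdx.x] = tmp;
        }
        __syncthreads();
    }
    __shared__ unsigned int previousSum;
    if (threadIdx.x == 0) {
        while (bid >= 1 && atomicAdd(&rowFlags[bid], 0) == 0) {} // wait for this row's predecessor
        previousSum = (bid > 0) ? rowScanValue[bid] : 0; // avoids having to zero scanValue
        rowScanValue[bid + 1] = XY[blockDim.x - 1] + previousSum;
        __threadfence();
        atomicAdd(&rowFlags[bid + 1], 1);
    }
    __syncthreads();
    if (col < width) {
        output[row * width + col] = XY[threadIdx.x] + previousSum;
    }
}

It would be launched along these lines, with flags and scanValue enlarged to height * (blocksPerRow + 1) entries, blockCounter enlarged to height entries, and flags/blockCounter memset to 0 beforehand:

const unsigned int blocksPerRow = (width + SECTION_SIZE - 1) / SECTION_SIZE;
dim3 grid(blocksPerRow, height);
SinglePassKoggeStoneScanRows<<<grid, SECTION_SIZE>>>(deviceInput, deviceOutput,
                                                     width, flags, scanValue, blockCounter);

The per-row atomicAdd preserves the deadlock-freedom argument from the book: within one row, a block that received index k can only be waiting on the block that received index k - 1 earlier, which is therefore already resident or finished.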
cub::DeviceScan::ExclusiveSumByKey() with a fancy iterator for the keys?

XxxByKey/xxx_by_key operations are generally much slower than just summing the groups manually (especially compared to a CUB block scan), though they are certainly simpler; tracking the keys is what makes the operation so expensive. Segmented operations are faster, but I still expect them to be slower than doing this manually (the last time I tried, a few years ago, segmented operations such as segmented sort were significantly more expensive than their non-segmented counterparts or than manual segmentation using one block per segment, assuming the segments are large enough).
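For reference, a minimal sketch of that by-key approach, assuming a CUB version recent enough to provide the DeviceScan ...SumByKey overloads (the kernel above computes an inclusive scan, so InclusiveSumByKey is the matching variant; ExclusiveSumByKey takes the same arguments). The row key i / width is generated on the fly with a transform iterator, so no key array is ever materialized; RowOf is just a helper name for illustration:

#include <cub/cub.cuh>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

struct RowOf {
    unsigned int width;
    __host__ __device__ unsigned int operator()(unsigned int i) const { return i / width; }
};

// deviceInput, deviceOutput, width, height as in the snippet above.
auto keys = thrust::make_transform_iterator(thrust::make_counting_iterator(0u), RowOf{width});
void *d_temp = nullptr;
size_t tempBytes = 0;
// First call only sizes the temporary storage; the second call does the scan.
cub::DeviceScan::InclusiveSumByKey(d_temp, tempBytes, keys, deviceInput, deviceOutput, width * height);
cudaMalloc(&d_temp, tempBytes);
cub::DeviceScan::InclusiveSumByKey(d_temp, tempBytes, keys, deviceInput, deviceOutput, width * height);
cudaFree(d_temp);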