I implemented a reduce kernel in OpenCL to sum up all entries in the input vector of size N. For a easier testing I initialize the input vector with 1.0f. So the result should be N. But it is not!
Here is my reduce-kernel:
kernel void reduce(global float* input, global float* output, const unsigned int N, local float* cache)
{
const uint local_id = get_local_id(0);
const uint global_id = get_global_id(0);
const uint local_size = get_local_size(0);
cache[local_id] = (global_id < N) ? input[global_id] : 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int s = local_size >> 1; s > 0; s >>= 1) {
if (local_id < s) {
cache[local_id] += cache[local_id + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (local_id == 0) output[local_size] = cache[0];
}
And here is the setting for OpenCL:
const uint N = 8196;
cl_float a[N];
cl_float b[N];
for (uint i=0; i<N; i++) {
a[i] = 1.0f;
b[i] = 0.0f;
}
cl::Buffer inputBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float)*N);
cl::Buffer resultBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float)*N);
queue.enqueueWriteBuffer(inputBuffer, CL_TRUE, 0, sizeof(cl_float)*N, a);
queue.enqueueWriteBuffer(resultBuffer, CL_TRUE, 0, sizeof(cl_float)*N, b);
cl::Kernel addVectorKernel = cl::Kernel(program, "reduce");
size_t localSize = addVectorKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device); // e.g. => 512
size_t globalSize = roundUp(localSize, N); // rounds up to a multiple of localSize
addVectorKernel.setArg(0, inputBuffer);
addVectorKernel.setArg(1, resultBuffer);
addVectorKernel.setArg(2, N);
addVectorKernel.setArg(3, (sizeof(cl_float) * localSize), NULL);
queue.enqueueNDRangeKernel(
addVectorKernel,
cl::NullRange,
cl::NDRange(globalSize),
cl::NDRange(localSize)
);
queue.finish(); // wait for ending
queue.enqueueReadBuffer(resultBuffer, CL_TRUE, 0, sizeof(cl_float)*N, b); // e.g. => 1024
The result depends on the workgroup size. What am I doing wrong? Is it the kernel itself or is it the settings for OpenCL?