I've implemented a two-step convolution in OpenCL running on GPUs. The convolution is applied to a series of 1480x1552 matrices. All matrices are pre-loaded and are stored in the input_image array. With my current implementation I'm able to achieve a processing rate of ~80 images per second.
How, if possible, can I improve my code in order to increase the throughput?
Here is my code to enqueue the kernels for each matrix:
cl_device_id device;
cl_context context;
cl_int err;
cl_program program;
cl_kernel noise_kernel, sobel_kernel;
cl_command_queue queue;
cl_mem image_buffer;
cl_mem filter_buffer;
cl_mem output_buffer;
cl_mem width_buffer;
cl_mem brightness_buffer;
device = create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
program = build_program(context, device, "my_convolution.cl");
noise_kernel = clCreateKernel(program, "convolute_unrolled", &err);
sobel_kernel = clCreateKernel(program, "sobel", &err);
filter_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(filter), filter, &err);
output_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, IMAGE_SIZE, NULL, &err);
width_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(width), &width, &err);
err = clSetKernelArg(noise_kernel, 1, sizeof(cl_mem), &filter_buffer);
err = clSetKernelArg(noise_kernel, 2, sizeof(cl_mem), &output_buffer);
err = clSetKernelArg(noise_kernel, 3, sizeof(cl_mem), &width_buffer);
err = clSetKernelArg(sobel_kernel, 1, sizeof(cl_mem), &output_buffer);
err = clSetKernelArg(sobel_kernel, 2, sizeof(cl_mem), &width_buffer);
begin = clock();
for (int i = 0; i < line_count; ++i){
// remove single pixel noise
image_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, IMAGE_SIZE, input_data[i], &err);
err = clSetKernelArg(noise_kernel, 0, sizeof(cl_mem), &image_buffer);
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
size_t work_items[2] = {DIM_Y - 2, DIM_X - 2};
err = clEnqueueNDRangeKernel(queue, noise_kernel, 2, NULL, &work_items, NULL, 0, NULL, NULL);
err = clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, IMAGE_SIZE, input_data[i], 0, NULL, NULL);
// apply sobel operator
err = clSetKernelArg(sobel_kernel, 0, sizeof(cl_mem), &image_buffer);
err = clEnqueueNDRangeKernel(queue, sobel_kernel, 2, NULL, &work_items, NULL, 0, NULL, NULL);
err = clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, IMAGE_SIZE, input_data[i], 0, NULL, NULL);
}
clFinish(queue);
end = clock();
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time spent on GPU: %f\n", time_spent);
And here are the kernels I'd like to apply to the matrices:
__kernel void convolute_unrolled(const __global short * image, __constant float * filter, __global short * output, __global int * width) {
int row = get_global_id(0) + 1, counter;
float accumulator;
int pixel = row * get_global_id(1);
accumulator = 0.0;
accumulator += image[pixel - 1 - 1 * *width] * filter[counter];
accumulator += image[pixel - 1 * *width] * filter[counter];
accumulator += image[pixel + 1 - 1 * *width] * filter[counter];
accumulator += image[pixel - 1 ] * filter[counter];
accumulator += image[pixel] * filter[counter];
accumulator += image[pixel + 1 ] * filter[counter];
accumulator += image[pixel - 1 + 1 * *width] * filter[counter];
accumulator += image[pixel + 1 * *width] * filter[counter];
accumulator += image[pixel + 1 + 1 * *width] * filter[counter];
output[pixel] = (short) accumulator / 9.0;
}
__kernel void sobel(const __global short * image, __global short * output, __global int * width) {
short sobel_x[9] = {-1, -2, -1, 0, 0, 0, 1, 2, 1};
short sobel_y[9] = {-1, 0, 1, -2, 0, 2, -1, 0, 1};
float aX, aY;
int row = get_global_id(0) + 1, counter;
int pixel = row * get_global_id(1);
counter = 0;
aX = 0.0;
aY = 0.0;
aX += image[pixel - 1 - *width] * sobel_x[counter];
aY += image[pixel - 1 - *width] * sobel_y[counter];
aX += image[pixel - *width] * sobel_x[counter];
aY += image[pixel - *width] * sobel_y[counter];
aX += image[pixel + 1 - *width] * sobel_x[counter];
aY += image[pixel + 1 - *width] * sobel_y[counter];
aX += image[pixel - 1 ] * sobel_x[counter];
aY += image[pixel - 1 ] * sobel_y[counter];
aX += image[pixel] * sobel_x[counter];
aY += image[pixel] * sobel_y[counter];
aX += image[pixel + 1] * sobel_x[counter];
aY += image[pixel + 1] * sobel_y[counter];
aX += image[pixel - 1 + *width] * sobel_x[counter];
aY += image[pixel - 1 + *width] * sobel_y[counter];
aX += image[pixel + *width] * sobel_x[counter];
aY += image[pixel + *width] * sobel_y[counter];
aX += image[pixel + 1 + *width] * sobel_x[counter];
aY += image[pixel + 1 + *width] * sobel_y[counter];
++counter;
output[pixel] = (short) sqrt(pow(aX, 2) + pow(aY, 2));
}