Two-step OpenCL convolution for series of matrices

Question

I've implemented a two-step convolution in OpenCL running on GPUs. The convolution is applied to a series of 1480x1552 matrices. All matrices are pre-loaded and are stored in the input_image array. With my current implementation I'm able to achieve a processing rate of ~80 images per second.

How, if possible, can I improve my code in order to increase the throughput?

Here is my code to enqueue the kernels for each matrix:

cl_device_id device;
cl_context context;
cl_int err;
cl_program program;
cl_kernel noise_kernel, sobel_kernel;
cl_command_queue queue;
cl_mem image_buffer;
cl_mem filter_buffer;
cl_mem output_buffer;
cl_mem width_buffer;
cl_mem brightness_buffer;

device = create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);

program = build_program(context, device, "my_convolution.cl");
noise_kernel = clCreateKernel(program, "convolute_unrolled", &err);
sobel_kernel = clCreateKernel(program, "sobel", &err);

filter_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(filter), filter, &err);
output_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, IMAGE_SIZE, NULL, &err);
width_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(width), &width, &err);

err = clSetKernelArg(noise_kernel, 1, sizeof(cl_mem), &filter_buffer);
err = clSetKernelArg(noise_kernel, 2, sizeof(cl_mem), &output_buffer);
err = clSetKernelArg(noise_kernel, 3, sizeof(cl_mem), &width_buffer);

err = clSetKernelArg(sobel_kernel, 1, sizeof(cl_mem), &output_buffer);
err = clSetKernelArg(sobel_kernel, 2, sizeof(cl_mem), &width_buffer);

begin = clock();
for (int i = 0; i < line_count; ++i){
    // remove single pixel noise
    image_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, IMAGE_SIZE, input_data[i], &err);
    err = clSetKernelArg(noise_kernel, 0, sizeof(cl_mem), &image_buffer);

    queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
    size_t work_items[2] = {DIM_Y - 2, DIM_X - 2};

    err = clEnqueueNDRangeKernel(queue, noise_kernel, 2, NULL, &work_items, NULL, 0, NULL, NULL);

    err = clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, IMAGE_SIZE, input_data[i], 0, NULL, NULL);

    // apply sobel operator
    err = clSetKernelArg(sobel_kernel, 0, sizeof(cl_mem), &image_buffer);
    err = clEnqueueNDRangeKernel(queue, sobel_kernel, 2, NULL, &work_items, NULL, 0, NULL, NULL);

    err = clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, IMAGE_SIZE, input_data[i], 0, NULL, NULL);
}
clFinish(queue);
end = clock();
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time spent on GPU: %f\n", time_spent);

And here are the kernels I'd like to apply to the matrices:

__kernel void convolute_unrolled(const __global short * image, __constant float * filter, __global short * output, __global int * width) {
  int row = get_global_id(0) + 1, counter;
  float accumulator;

    int pixel = row * get_global_id(1);
    accumulator = 0.0;

    accumulator += image[pixel - 1 - 1 * *width] * filter[counter];
    accumulator += image[pixel - 1 * *width] * filter[counter];
    accumulator += image[pixel + 1 - 1 * *width] * filter[counter];
    accumulator += image[pixel - 1 ] * filter[counter];
    accumulator += image[pixel] * filter[counter];
    accumulator += image[pixel + 1 ] * filter[counter];
    accumulator += image[pixel - 1 + 1 * *width] * filter[counter];
    accumulator += image[pixel + 1 * *width] * filter[counter];
    accumulator += image[pixel + 1 + 1 * *width] * filter[counter];

    output[pixel] = (short) accumulator / 9.0;
}

__kernel void sobel(const __global short * image, __global short * output, __global int * width) {
  short sobel_x[9] = {-1, -2, -1, 0, 0, 0, 1, 2, 1};
  short sobel_y[9] = {-1, 0, 1, -2, 0, 2, -1, 0, 1};
  float aX, aY;
  int row = get_global_id(0) + 1, counter;

    int pixel = row * get_global_id(1);
    counter = 0;
    aX = 0.0;
    aY = 0.0;
        aX += image[pixel - 1 - *width] * sobel_x[counter];
        aY += image[pixel - 1 - *width] * sobel_y[counter];

        aX += image[pixel - *width] * sobel_x[counter];
        aY += image[pixel - *width] * sobel_y[counter];

        aX += image[pixel + 1 - *width] * sobel_x[counter];
        aY += image[pixel + 1 - *width] * sobel_y[counter];

        aX += image[pixel - 1 ] * sobel_x[counter];
        aY += image[pixel - 1 ] * sobel_y[counter];

        aX += image[pixel] * sobel_x[counter];
        aY += image[pixel] * sobel_y[counter];

        aX += image[pixel + 1] * sobel_x[counter];
        aY += image[pixel + 1] * sobel_y[counter];

        aX += image[pixel - 1 + *width] * sobel_x[counter];
        aY += image[pixel - 1 + *width] * sobel_y[counter];

        aX += image[pixel + *width] * sobel_x[counter];
        aY += image[pixel + *width] * sobel_y[counter];

        aX += image[pixel + 1 + *width] * sobel_x[counter];
        aY += image[pixel + 1 + *width] * sobel_y[counter];

        ++counter;
    output[pixel] = (short) sqrt(pow(aX, 2) + pow(aY, 2));
  }

Questions on this site should only state the code's purpose, not what is requested from a review. — Jamal
– Jamal, Commented Dec 19, 2015 at 3:37

Gilles · Accepted Answer · 2015-10-02 07:22:56Z

Ok, first one remark: you use clock() for timing your code, and unless you have a very compelling reason for doing so, I'd say this is probably a bad idea, since it counts the CPU time of the current process and all sub-processes launched by it. This might lead to very unexpected timing reports whereas you have plenty of other possibilities to get reliable timing informations... You can consider for example clGetEventProfilingInfo() for the device side and gettimeofday() for the host side.

Regarding your kernels, yes you have plenty of room for performance improvement, but your strategy may vary depending on what hardware you target.

One obvious path for optimisation would be to tile your processing to cache the input data and improve its re-use and thereafter the computational intensity of your kernels. The typical way of achieving this is by reserving a segment of __local memory and to copy a tile of your input image there using the work-items of each work-groups. The key here is to adjust the dimensions of your tile / work-group so that once the __local segment is populated, all data needed by every individual work-item is already present in the local memory, and doesn't need to be retrieved from the global memory. And since the local memory segment is shared by all the work-items of the work-group, you can adjust the copy method as to maximise its efficiency irrespective of which work-item will use which datum. For a NVIDIA GPU for example, this means that you would make sure that memory accesses are coalesced so that the memory bandwidth usage is maximised. This technique is very common and often used as first optimisation for matrix-matrix multiplication kernels such as here.

Stack Exchange Network

Two-step OpenCL convolution for series of matrices

1 Answer 1

You must log in to answer this question.

Hot Network Questions

Two-step OpenCL convolution for series of matrices

1 Answer 1

You must log in to answer this question.

Related

Hot Network Questions