OpenCL implemented algorithms slower than normal loop

Question

I am new to parallel computing and OpenCL. I am following the book *OpenCL Programming Guide*, specifically the part that implements convolution.

My main.cpp:

#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
#include <OpenCL/OpenCL.h>

using namespace std;

// Dimensions of the input signal, in elements.
const unsigned int inputSignalWidth = 8;
const unsigned int inputSignalHeight = 8;

// Input signal. Stored row-major and indexed as inputSignal[row][column],
// so the outer dimension is the height and the inner one the width.
cl_uint inputSignal[inputSignalHeight][inputSignalWidth] =
{
    {3, 1, 1, 4, 8, 2, 1, 3},
    {4, 2, 1, 1, 2, 1, 2, 3},
    {4, 4, 4, 4, 3, 2, 2, 2},
    {9, 8, 3, 8, 9, 0, 0, 0},
    {9, 3, 3, 9, 0, 0, 0, 0},
    {0, 9, 0, 8, 0, 0, 0, 0},
    {3, 0, 8, 8, 9, 4, 4, 4},
    {5, 9, 8, 1, 8, 1, 1, 1}
};

// Dimensions of the convolution result: input size - mask size + 1 in each
// direction, i.e. only positions where the mask fits entirely in the input.
const unsigned int outputSignalWidth = 6;
const unsigned int outputSignalHeight = 6;

// Convolution result, indexed as outputSignal[row][column].
cl_uint outputSignal[outputSignalHeight][outputSignalWidth];

// Dimensions of the convolution mask, in elements.
const unsigned int maskWidth = 3;
const unsigned int maskHeight = 3;

// Convolution mask, indexed as mask[row][column]: sums the eight neighbours
// of the centre element and ignores the centre itself.
cl_uint mask[maskHeight][maskWidth] =
{
    {1, 1, 1}, {1, 0, 1}, {1, 1, 1}
};

/**
 * Terminates the program when an OpenCL call did not succeed.
 *
 * @param err  status code to check; anything other than CL_SUCCESS is fatal
 * @param name label identifying the failed call, printed in the error message
 *
 * On failure, prints the label together with the numeric status code to the
 * standard error stream and exits with EXIT_FAILURE. Returns normally when
 * err is CL_SUCCESS.
 */
inline void checkErr(cl_int err, const char* name)
{
    if (err == CL_SUCCESS)
    {
        return;
    }

    // Include the numeric code: the label alone does not say why the call failed.
    cerr << "Error: " << name << " (" << err << ")" << endl;
    exit(EXIT_FAILURE);
}

/**
 * Error callback registered with clCreateContext().
 *
 * Prints the error description to the standard error stream and terminates
 * the process with EXIT_FAILURE.
 *
 * @param errInfo      error description string supplied by the caller
 * @param private_info unused
 * @param cb           unused
 * @param user_data    unused
 */
void CL_CALLBACK contextCallback(const char * errInfo,
                                 const void * private_info,
                                 size_t cb,
                                 void * user_data)
{
    (void) private_info;
    (void) cb;
    (void) user_data;

    cerr << "Error occurred during context use: " << errInfo << endl;
    exit(EXIT_FAILURE);
}

int main(int argc, const char * argv[])
{
    cl_int errNum;
    cl_uint numPlatforms;
    cl_uint numDevices;
    cl_platform_id * platformIDs;
    cl_device_id * deviceIDs;
    cl_context context = NULL;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem inputSignalBuffer;
    cl_mem outputSignalBuffer;
    cl_mem maskBuffer;

    errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");

    platformIDs = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numPlatforms);
    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatFormIDs");

    deviceIDs = NULL;
    cl_uint i;
    for (i = 0; i < numPlatforms; i++)
    {
        errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
        if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
        {
            checkErr(errNum, "clGetDeviceIDs");
        } else if (numDevices > 0)
        {
            deviceIDs = (cl_device_id *) alloca(sizeof(cl_device_id) * numDevices);
            errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, numDevices, &deviceIDs[0], NULL);
            checkErr(errNum, "clGetDeviceIDs");
            break;
        }
    }

    if (deviceIDs == NULL)
    {
        cout << "No CPU devices found." << endl;
        exit(-1);
    }

    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM, (cl_context_properties) platformIDs[i], 0
    };

    context = clCreateContext(contextProperties, numDevices, deviceIDs, &contextCallback, NULL, &errNum);
    checkErr(errNum, "clCreateContext");


    ifstream srcFile("Convolution.cl");
    checkErr(srcFile.is_open()?CL_SUCCESS:-1, "reading Convolution.cl");

    string srcProg(istreambuf_iterator<char>(srcFile),
                   (istreambuf_iterator<char>()));

    const char* src = srcProg.c_str();
    size_t length = srcProg.length();

    program = clCreateProgramWithSource(context, 1, &src, &length, &errNum);
    checkErr(errNum, "clCreateProgramWithSource");

    cout << "Device count: " << sizeof(deviceIDs)/sizeof(cl_device_id) << endl;

    errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);
    checkErr(errNum, "clBuildProgram");

    kernel = clCreateKernel(program, "convolve", &errNum);
    checkErr(errNum, "clCreateKernel");

    inputSignalBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * inputSignalHeight*inputSignalWidth, static_cast<void*>(inputSignal), &errNum);
    checkErr(errNum, "clCreateBuffer(inputSignal)");

    maskBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * maskHeight * maskWidth, static_cast<void*>(mask), &errNum);
    checkErr(errNum, "clCreateBuffer(mask)");

    outputSignalBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, NULL, &errNum);
    checkErr(errNum, "clCreateBuffer(outputSignal)");

    queue = clCreateCommandQueue(context, deviceIDs[0], 0, &errNum);
    checkErr(errNum, "clCreateCommandQueue");

    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
    errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth);
    errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth);
    checkErr(errNum, "clSetKernelArg");

    const size_t globalWorkSize[1] =
    {
        outputSignalWidth * outputSignalWidth
    };

    const size_t localWorkSize[1] =
    {
        1
    };

    clock_t start, end;

    clFinish(queue);
    start = clock();
    errNum = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
    clFinish(queue);
    end = clock();
    cout << "time for calculation: " << (float)(end - start) << endl;
    checkErr(errNum, "clEnequeueNDRangeKernel");

    errNum = clEnqueueReadBuffer(queue, outputSignalBuffer, CL_TRUE, 0, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, outputSignal, 0, NULL, NULL);
    checkErr(errNum, "clEnqueueReadBuffer");

    clFinish(queue);
    start = clock();
    for (int y = 0; y < outputSignalHeight; y++)
    {
        for (int x = 0; x < outputSignalHeight; x++)
        {
            uint sum = 0;
            for (int r = 0; r < maskWidth; r++)
            {
                for (int c =0; c < maskWidth; c++)
                {
                    sum += inputSignal[y+r][x+c]*mask[r][c];
                }
            }
            outputSignal[y][x] = sum;

        }
    }
    end = clock();
    cout << "Loop version time: " << (float)(end - start) << endl;


    return 0;
}

and Convolution.cl:

/*
 * Computes one element of a 2-D convolution per work item.
 *
 * The work item at global id (x, y) multiplies the maskWidth x maskWidth
 * window of input whose top-left corner is (x, y) element-wise with mask,
 * sums the products and stores the sum at (x, y) of output.
 *
 * input      row-major input signal with inputWidth elements per row
 * mask       row-major square mask with maskWidth elements per row
 * output     row-major result; its row length is taken to be the global
 *            size of dimension 0
 * inputWidth number of elements in one row of input
 * maskWidth  number of elements in one row (and column) of mask
 *
 * Expects to be enqueued over a 2-D range (output width x output height).
 * No bounds checks are performed: the caller must size input so that the
 * window starting at every (x, y) in the range lies inside it.
 */
__kernel void convolve(const __global uint * const input,
                       __constant uint * const mask,
                       __global uint * const output,
                       const int inputWidth,
                       const int maskWidth)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    uint sum = 0;
    for (int r = 0; r < maskWidth; r++)
    {
        // Index of the first input element of window row r.
        const int idxIntmp = (y + r) * inputWidth + x;
        for (int c = 0; c < maskWidth; c++)
        {
            sum += mask[r * maskWidth + c] * input[idxIntmp + c];
        }
    }

    // The row stride of output is the number of work items in dimension 0
    // (get_global_size), not this work item's own x coordinate.
    output[y * get_global_size(0) + x] = sum;
}

The platform is Mac OS X 10.9 with an AMD 6750M. It doesn't make sense to me that the OpenCL version is much slower than the for-loop version (around 10x slower). Could you help me find out what's wrong with the code?

OpenCL is not always worthwhile: communication between CPU & GPU has some costs, and if that cost is non-negligible you are sure to lose. Perhaps try with some much bigger data. — Basile Starynkevitch
– Basile Starynkevitch, Commented Sep 27, 2014 at 15:28
Shouldn't this be: output[y * get_global_size(0) + x] = sum; instead of output[y * get_global_id(0) + x] = sum; ? — DarkZeros
– DarkZeros, Commented Sep 29, 2014 at 9:43

user703016 · Accepted Answer · 2014-09-27 16:02:22Z

5

There are two main problems:

const size_t globalWorkSize[1] = { outputSignalWidth * outputSignalWidth };

First, as pointed out in the comments by Basile Starynkevitch, your data set is extremely small. Way too small to benefit from any GPU acceleration. Here, you are running only 36 work items: this is so ridiculously small that it could fit in barely half of a wavefront on one compute unit.

You should be running thousands of work items to correctly harness the power of your GPU. With so few, the fixed overhead of OpenCL (kernel launch, synchronization, data transfer) dominates and makes the GPU version slower than the CPU one. Try with a much larger dataset, and you should notice a significant gain in performance.

Additionally:

const size_t localWorkSize[1] = { 1 };

You are running outputSignalWidth * outputSignalWidth work groups of 1 work item each. This is hugely problematic.

On AMD GPUs, the wavefront size is 64. This means that you should be scheduling work groups of at least 64 work items (ideally a multiple of 64) if you want to fully utilize your hardware. You are currently wasting 63 of your 64 hardware threads, that is 98.4% of the GPU doing nothing!

Either adapt your code to use bigger work groups (and change the global work size accordingly), or let the OpenCL driver choose the best size for you by passing NULL instead of localWorkSize.

In short, you are using a massive backhoe just to move one tiny piece of gravel.

edited Sep 27, 2014 at 16:02

answered Sep 27, 2014 at 14:25

user703016

38.2k10 gold badges91 silver badges113 bronze badges

Sign up to request clarification or add additional context in comments.

3 Comments

user3916770 Over a year ago

Thanks a lot for the answer! The original code is intended to work with CPU devices, and the result is still unreasonably slow. How could I fix it?

user3916770 Over a year ago

Besides, I changed localWorkSize[1] = {64}; and to NULL, but the result is still not correct "time for calculation: 320 Loop version time: 12".

user703016 Over a year ago

I'm very distracted today and missed something obvious. I have edited my answer.

Collectives™ on Stack Overflow

OpenCL implemented algorithms slower than normal loop

1 Answer 1

3 Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

3 Comments

Your Answer

Sign up or log in

Post as a guest

Related