I am new to parallel computing and OpenCL. I am following the book "OpenCL Programming Guide", and my question is about its convolution implementation.
My main.cpp:
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
#include <OpenCL/OpenCL.h>
using namespace std;
// Input signal: an 8x8 grid of unsigned values. The arrays below are indexed
// [row][column], so the first dimension is the height and the second the width.
const unsigned int inputSignalWidth = 8;
const unsigned int inputSignalHeight = 8;
cl_uint inputSignal[inputSignalHeight][inputSignalWidth] =
{
    {3, 1, 1, 4, 8, 2, 1, 3},
    {4, 2, 1, 1, 2, 1, 2, 3},
    {4, 4, 4, 4, 3, 2, 2, 2},
    {9, 8, 3, 8, 9, 0, 0, 0},
    {9, 3, 3, 9, 0, 0, 0, 0},
    {0, 9, 0, 8, 0, 0, 0, 0},
    {3, 0, 8, 8, 9, 4, 4, 4},
    {5, 9, 8, 1, 8, 1, 1, 1}
};

// Output signal: 6x6, i.e. (input size - mask size + 1) in each dimension, so
// the 3x3 mask never reads outside the input.
const unsigned int outputSignalWidth = 6;
const unsigned int outputSignalHeight = 6;
cl_uint outputSignal[outputSignalHeight][outputSignalWidth];

// Convolution mask: 3x3, every neighbour weighted 1 and the centre weighted 0.
const unsigned int maskWidth = 3;
const unsigned int maskHeight = 3;
cl_uint mask[maskHeight][maskWidth] =
{
    {1, 1, 1},
    {1, 0, 1},
    {1, 1, 1}
};
// Terminates the program with EXIT_FAILURE when `err` is not CL_SUCCESS.
//   err  - status to check (an OpenCL return code, or any value the caller
//          mapped onto one, e.g. -1 for a failed file open).
//   name - label of the operation that produced `err`; printed to stderr.
// The numeric code is printed as well so the failure can be looked up in the
// OpenCL headers.
inline void checkErr(cl_int err, const char* name)
{
    if (err == CL_SUCCESS)
    {
        return;
    }
    cerr << "Error: " << name << " (" << err << ")" << endl;
    exit(EXIT_FAILURE);
}
// Error-notification callback passed to clCreateContext.
//   errInfo      - error description supplied by the caller; printed as-is.
//   private_info - unused.
//   cb           - unused.
//   user_data    - unused.
// Reports the error on stderr and terminates the program with EXIT_FAILURE.
void CL_CALLBACK contextCallback(const char * errInfo,
                                 const void * private_info,
                                 size_t cb,
                                 void * user_data)
{
    // Silence unused-parameter warnings; the signature is fixed by OpenCL.
    (void) private_info;
    (void) cb;
    (void) user_data;

    cerr << "Error occurred during context use: " << errInfo << endl;
    exit(EXIT_FAILURE);
}
int main(int argc, const char * argv[])
{
cl_int errNum;
cl_uint numPlatforms;
cl_uint numDevices;
cl_platform_id * platformIDs;
cl_device_id * deviceIDs;
cl_context context = NULL;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
cl_mem inputSignalBuffer;
cl_mem outputSignalBuffer;
cl_mem maskBuffer;
errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs");
platformIDs = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numPlatforms);
errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
checkErr((errNum != CL_SUCCESS)? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatFormIDs");
deviceIDs = NULL;
cl_uint i;
for (i = 0; i < numPlatforms; i++)
{
errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
{
checkErr(errNum, "clGetDeviceIDs");
} else if (numDevices > 0)
{
deviceIDs = (cl_device_id *) alloca(sizeof(cl_device_id) * numDevices);
errNum = clGetDeviceIDs(platformIDs[i], CL_DEVICE_TYPE_GPU, numDevices, &deviceIDs[0], NULL);
checkErr(errNum, "clGetDeviceIDs");
break;
}
}
if (deviceIDs == NULL)
{
cout << "No CPU devices found." << endl;
exit(-1);
}
cl_context_properties contextProperties[] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties) platformIDs[i], 0
};
context = clCreateContext(contextProperties, numDevices, deviceIDs, &contextCallback, NULL, &errNum);
checkErr(errNum, "clCreateContext");
ifstream srcFile("Convolution.cl");
checkErr(srcFile.is_open()?CL_SUCCESS:-1, "reading Convolution.cl");
string srcProg(istreambuf_iterator<char>(srcFile),
(istreambuf_iterator<char>()));
const char* src = srcProg.c_str();
size_t length = srcProg.length();
program = clCreateProgramWithSource(context, 1, &src, &length, &errNum);
checkErr(errNum, "clCreateProgramWithSource");
cout << "Device count: " << sizeof(deviceIDs)/sizeof(cl_device_id) << endl;
errNum = clBuildProgram(program, numDevices, deviceIDs, NULL, NULL, NULL);
checkErr(errNum, "clBuildProgram");
kernel = clCreateKernel(program, "convolve", &errNum);
checkErr(errNum, "clCreateKernel");
inputSignalBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * inputSignalHeight*inputSignalWidth, static_cast<void*>(inputSignal), &errNum);
checkErr(errNum, "clCreateBuffer(inputSignal)");
maskBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * maskHeight * maskWidth, static_cast<void*>(mask), &errNum);
checkErr(errNum, "clCreateBuffer(mask)");
outputSignalBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, NULL, &errNum);
checkErr(errNum, "clCreateBuffer(outputSignal)");
queue = clCreateCommandQueue(context, deviceIDs[0], 0, &errNum);
checkErr(errNum, "clCreateCommandQueue");
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer);
errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer);
errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth);
errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth);
checkErr(errNum, "clSetKernelArg");
const size_t globalWorkSize[1] =
{
outputSignalWidth * outputSignalWidth
};
const size_t localWorkSize[1] =
{
1
};
clock_t start, end;
clFinish(queue);
start = clock();
errNum = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
clFinish(queue);
end = clock();
cout << "time for calculation: " << (float)(end - start) << endl;
checkErr(errNum, "clEnequeueNDRangeKernel");
errNum = clEnqueueReadBuffer(queue, outputSignalBuffer, CL_TRUE, 0, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, outputSignal, 0, NULL, NULL);
checkErr(errNum, "clEnqueueReadBuffer");
clFinish(queue);
start = clock();
for (int y = 0; y < outputSignalHeight; y++)
{
for (int x = 0; x < outputSignalHeight; x++)
{
uint sum = 0;
for (int r = 0; r < maskWidth; r++)
{
for (int c =0; c < maskWidth; c++)
{
sum += inputSignal[y+r][x+c]*mask[r][c];
}
}
outputSignal[y][x] = sum;
}
}
end = clock();
cout << "Loop version time: " << (float)(end - start) << endl;
return 0;
}
and Convolution.cl:
/*
 * Convolution kernel: each work-item computes one output element.
 *
 *   input      - input signal, row-major, inputWidth elements per row.
 *   mask       - square mask, row-major, maskWidth x maskWidth elements.
 *   output     - output signal, row-major; one row holds get_global_size(0)
 *                elements, so the kernel must be enqueued over a 2-D range
 *                whose dimension-0 size equals the output width.
 *   inputWidth - number of elements in one input row.
 *   maskWidth  - side length of the mask.
 *
 * Work-item (x, y) sums mask[r][c] * input[y + r][x + c] over the mask and
 * stores the result at output[y][x]. No bounds checks are performed.
 */
__kernel void convolve(const __global uint * const input,
                       __constant uint * const mask,
                       __global uint * const output,
                       const int inputWidth,
                       const int maskWidth)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    uint sum = 0;
    for (int r = 0; r < maskWidth; r++)
    {
        /* Index of input[y + r][x], the first element of this mask row. */
        const int idxIntmp = (y + r) * inputWidth + x;
        for (int c = 0; c < maskWidth; c++)
        {
            sum += mask[r * maskWidth + c] * input[idxIntmp + c];
        }
    }

    /* The row stride is the global SIZE in dimension 0; get_global_id(0) is
     * just x and would give the index y * x + x. */
    output[y * get_global_size(0) + x] = sum;
}
The platform is Mac OS X 10.9 with an AMD 6750M GPU. It doesn't make sense to me that the OpenCL version is much slower than the for-loop version (around 10x slower). Could you help me find out what is wrong with the code?
Shouldn't it be `output[y * get_global_size(0) + x] = sum;` instead of `output[y * get_global_id(0) + x] = sum;`?