0

I am trying to learn OpenCL by writing a simple program that, for each point, sums the absolute differences between the corresponding dimensions of two point sets (a sum of absolute differences, SAD). When I finished writing the code, the output seemed wrong, so I added some printf calls to the host code and the kernel to verify that all the variables are passed correctly to the kernel. By doing this, I learned that the input variables were NOT correctly sent to the kernel: printing them inside the kernel returned incorrect data (all zeros, to be precise). I have tried changing the data type from uint8 to int, but that did not seem to have any effect. How can I correctly send uint8 variables to the memory buffer in OpenCL? I cannot identify what I am doing wrong when writing and sending the memory buffers that makes their contents show up incorrectly, and I would appreciate any opinion, advice or help.

Thank you in advance.

EDIT: Question is now solved. I have updated the code below according to the kind feedback provided in the comment and answer sections. Many thanks!

Code below:

#include <iostream>
#include <chrono>
#include <CL/cl.hpp>
#include <stdio.h>
#include <stdlib.h>

using namespace std;
// Index of the OpenCL platform to use.
#define USE_PLATFORM_NR  0

// Parenthesized so the macro behaves as a single value inside larger
// expressions (e.g. `x / SIZE` or `SIZE % n`).
#define SIZE (100*1024*1024UL)

//SAD DEFINES
#define NUM_DIM_SAD         5      // values (dimensions) per point
#define NUM_POINTS_SAD      10     // number of points
//#define NUM_LOOPS_SAD       20 
#define SAD_SEED            2014   // srand() seed for the SAD input data
//NUM_LOOPS * NUM_POINTS should be 75M

//SSD DEFINES
#define NUM_DIM_SSD         128
#define NUM_POINTS_SSD      150000
//#define NUM_LOOPS_SSD       1000
#define SSD_SEED            2048
//NUM_LOOPS * NUM_POINTS should be 150M


// Threadblock sizes (e.g. for kernels )
#define TS 5

// =================================================================================================

// OpenCL C source of the "SAD" kernel, kept as a string for run-time compilation.
//
// Each work-item handles one point: Point = get_global_id(0). It reads
// num_dim_sad consecutive bytes starting at index Point*num_dim_sad from both
// m1_set and m2_set, accumulates the absolute differences, and stores the
// total in sad_gpu[Point].
//
// Notes:
//  - The accumulator is an unsigned char, so the stored total is truncated to
//    the width of that type.
//  - num_points_sad is accepted as an argument but not referenced in the
//    kernel body.
//  - The printf calls are debugging output (one line per element plus the
//    per-point total).
//  - The adjacent string literals below are concatenated without newline
//    characters, so the kernel source is a single line; a `//` comment placed
//    inside the kernel text would comment out everything after it.
const char* kernelstring =
"__kernel void SAD(const int num_points_sad, const int num_dim_sad,"
"                      const global unsigned char* m1_set,"
"                      const global unsigned char* m2_set,"
"                      global unsigned char* sad_gpu) {"
"    const int Point = get_global_id(0);"
"    unsigned char acc = 0;"
"    printf(\" POINT: %d \\n \", Point); "
"    for (int s=0; s<num_dim_sad ; s++) {"
"        printf(\"GPU: i = %d | m1_set = %d| m2_set = %d \\n \",Point*num_dim_sad + s,m1_set[Point*num_dim_sad+s],m2_set[Point*num_dim_sad+s]);}"
"    for (int k=0; k<num_dim_sad; k++) {"
"        acc += abs( m1_set[Point*num_dim_sad + k] - m2_set[Point*num_dim_sad + k] );"
"    }"
"    printf(\"ACC: %d \\n \",acc);"
"    sad_gpu[Point] = acc;"
"}";


// =================================================================================================

// Matrix-multiplication using a custom OpenCL SGEMM kernel.
int main() {

    cout << "Computing naive SAD & SSD for result checking" << endl;
    //naive implementation on CPU for result checking
    uint8_t* m1_set;// [NUM_POINTS][NUM_DIM];
    uint8_t* m2_set;// [NUM_POINTS][NUM_DIM];

    m1_set = (uint8_t*)malloc(sizeof(uint8_t*) * NUM_POINTS_SAD * NUM_DIM_SAD);
    m2_set = (uint8_t*)malloc(sizeof(uint8_t*) * NUM_POINTS_SAD * NUM_DIM_SAD);

    uint8_t* sad;    //   [NUM_POINTS];
    uint8_t* sad_gpu;//   [NUM_POINTS];
    sad =     (uint8_t*)malloc(sizeof(uint8_t) * NUM_POINTS_SAD);
    sad_gpu = (uint8_t*)malloc(sizeof(uint8_t) * NUM_POINTS_SAD);

    srand(SAD_SEED);
    for (int i = 0; i < NUM_POINTS_SAD * NUM_DIM_SAD; i++)
    {
        sad[i/NUM_DIM_SAD] = 0;
        m1_set[i] = rand() / (uint8_t)RAND_MAX;
        m2_set[i] = rand() / (uint8_t)RAND_MAX;
        cout << "CPU: i = " << i << "| m1_set = " << (unsigned int)m1_set[i] << "| m2_set = " << (unsigned int)m2_set[i] << endl;
    }

    for (int i = 0; i < NUM_POINTS_SAD * NUM_DIM_SAD; i++)
         sad[i/NUM_DIM_SAD] += abs(m1_set[i] - m2_set[i]);

    cl_int err;

    // Configure the OpenCL environment
    printf(">>> Initializing OpenCL...\n");
    cl_platform_id platform = USE_PLATFORM_NR;
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) { cout << err << "clGetPlatformId"; return -1;}
    cl_device_id device = 0;
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) { cout << err << "clGetDeviceIDs"; return -1; }
    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateContext"; return -1; }
    cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateCommandQueue"; return -1; }
    char deviceName[1024];
    err = clGetDeviceInfo(device, CL_DEVICE_NAME, 1024, deviceName, NULL);
    if (err != CL_SUCCESS) { cout << err << "clGetDeviceInfo"; return -1; }
    cl_event event = NULL;

    // Compile the kernel
    cl_program program = clCreateProgramWithSource(context, 1, &kernelstring_sad, NULL, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateProgramWithSource"; return -1; }
    err = clBuildProgram(program, 0, NULL, "", NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clBuildProgram"; return -1; }


    // Check for compilation errors
    size_t logSize;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    if (err != CL_SUCCESS) { cout << err << "clGetProgramBuildInfo"; return -1; }
    char* messages = (char*)malloc((1 + logSize) * sizeof(char));
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, messages, NULL);
    if (err != CL_SUCCESS) { cout << err << "clGetProgramBuildInfo2"; return -1; }
    messages[logSize] = '\0';
    if (logSize > 10) { printf(">>> Compiler message: %s\n", messages); }
    free(messages);


    // Prepare OpenCL memory objects
    cl_mem buf_m1 = clCreateBuffer(context, CL_MEM_READ_ONLY, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), NULL, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_m1"; return -1; }
    cl_mem buf_m2 = clCreateBuffer(context, CL_MEM_READ_ONLY, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), NULL, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_m2"; return -1; }
    cl_mem buf_sad = clCreateBuffer(context, CL_MEM_READ_WRITE, NUM_POINTS_SAD * sizeof(uint8_t), NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_sad"; return -1; }

    // Copy matrices to the GPU
    err = clEnqueueWriteBuffer(queue, buf_m1, CL_TRUE, 0, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), m1_set, 0, NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_m1"; return -1; }
    err = clEnqueueWriteBuffer(queue, buf_m2, CL_TRUE, 0, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), m2_set, 0, NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_m2"; return -1; }
    err = clEnqueueWriteBuffer(queue, buf_sad, CL_TRUE, 0, NUM_POINTS_SAD * sizeof(uint8_t), sad_gpu, 0, NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_sad"; return -1; }

    // Configure the kernel and set its arguments
    int num_points_sad = NUM_POINTS_SAD;
    int num_dim_sad =    NUM_DIM_SAD;
    cl_kernel kernel = clCreateKernel(program, "SAD", &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel"; return -1; }
    err = clSetKernelArg(kernel, 0, sizeof(int), (void*)&num_points_sad);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg0"; return -1; }
    err = clSetKernelArg(kernel, 1, sizeof(int), (void*)&num_dim_sad);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg1"; return -1; }
    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buf_m1);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg2"; return -1; }
    err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&buf_m2);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg3"; return -1; }
    err = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&buf_sad);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel4"; return -1; }

    // Start the timed loop
    printf(">>> Starting SAD GPU run...\n");
    std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

 //   const size_t local[1] = { TS };
    const size_t global[1] = { NUM_POINTS_SAD };
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, &event); //local
    if (err != CL_SUCCESS) { cout << err << "clEnqueueNDRangeKernel"; return -1; }
    // Wait for calculations to be finished
    clWaitForEvents(1, &event);

    // End the timed loop
    std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();

    // Copy the output matrix C back to the CPU memory
    clEnqueueReadBuffer(queue, buf_sad, CL_TRUE, 0, NUM_POINTS_SAD * sizeof(uint8_t), sad_gpu, 0, NULL, NULL);
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count();
    std::cout << "Time difference = " << us << " us " << std::endl;
    // Free the OpenCL memory objects
    clReleaseMemObject(buf_m1);
    clReleaseMemObject(buf_m2);
    clReleaseMemObject(buf_sad);

    // Clean-up OpenCL 
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    clReleaseProgram(program);
    clReleaseKernel(kernel);

    for (int i = 0; i < NUM_POINTS_SAD; i++)
    {
        cout << "i: " << i;
        cout << " | CPU: " << (unsigned int)sad[i];
        cout << " | GPU: " << (unsigned int)sad_gpu[i];
        cout << endl;
    }
    // Free the host memory objects
    free(m1_set);
    free(m2_set);
    free(sad);
    free(sad_gpu);

    // Exit
    return 0;
}
7
  • Check return codes of cl* functions, the answer is very likely there. Commented May 1, 2020 at 15:18
  • @doqtor Thank you for your response. I have checked the return codes of all cl* functions used before clEnqueueNDRangeKernel and they all return 0. Commented May 1, 2020 at 18:31
  • Can you show us how you do it? You can update your question with the fixed code. Commented May 2, 2020 at 5:46
  • @doqtor I have updated the code above. Checking the output stream, no "ERROR!" has been printed. Commented May 2, 2020 at 9:10
  • Why do you use malloc in C++ code? Why not use std::vector for arrays? Commented May 2, 2020 at 9:30

1 Answer 1

1

There is an error in the call that creates the context: one of the arguments is passed in the wrong position.

Instead of:

cl_context context = clCreateContext(NULL, 1, &device, NULL, &err, NULL);

Should be:

cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
                                                             ^^^^^^^^^^

Also, the way the errors are reported is still not very helpful. It should be something like this:

cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err != CL_SUCCESS)
{
    cout << err << "clCreateContext";
    return -1;
}

This way, execution stops as soon as an error occurs, and we know which function it came from.

======= UPDATE ========================================================

The wrong type is being used in the kernel: in OpenCL C, uint8 is a vector type holding 8 values of type uint (unsigned int), not an 8-bit integer.

To fix the problem, use the uchar (unsigned char) type in the OpenCL kernel, which is the equivalent of uint8_t/unsigned char in C++.

See OpenCL data types and Scalar data types.

Sign up to request clarification or add additional context in comments.

2 Comments

Once again, thank you for your suggestion. I changed the way the errors are output as per your suggestion and fixed the position of the err parameter, and I have updated the code above accordingly. Unfortunately, I still do not receive any OpenCL errors and, as a result, the code continues executing until the end, displaying wrong results. (If the console output helps, it is the same as before; I shall leave it here: link)
Yes, you were right! The updated answer solved my problem! I have marked the question as solved (if I have done it incorrectly, please let me know so I can fix it). Once again, thank you very much for taking the time to answer my question and for providing continuous feedback, and I wish you all the best!

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.