OpenCL Memory Buffer not passing the right values to kernel

Question

I am trying to learn OpenCL by writing a simple program that computes, for each point, the sum of the absolute differences between two data sets across the point's dimensions (a sum of absolute differences, SAD). When I finished writing the code, the output seemed wrong, so I added some printf calls to the host code and to the kernel to verify that all the variables are passed correctly to the kernel. By doing this, I learned that the input variables were NOT correctly sent to the kernel: printing them returned incorrect data (all zeros, to be precise). I have tried changing the data type from uint8 to int, but that did not seem to have any effect. How can I correctly send uint8 variables to the memory buffer in OpenCL? I cannot identify what I am doing wrong when writing and sending the memory buffers that makes their contents show up incorrectly, and I would appreciate any opinion, advice or help.

Thank you in advance.

EDIT: Question is now solved. I have updated the code below according to the kind feedback provided in the comment and answer sections. Many thanks!

Code below:

#include <iostream>
#include <chrono>
#include <CL/cl.hpp>
#include <stdio.h>
#include <stdlib.h>

using namespace std;
// OpenCL platform number (read by main() when it picks the platform).
#define USE_PLATFORM_NR  0

// 100 MiB. Parenthesized so the macro behaves as a single value when it is
// used inside a larger expression (e.g. `x / SIZE` or `SIZE / SIZE`).
#define SIZE (100*1024*1024UL)

//SAD DEFINES
// Number of dimensions per point and number of points for the SAD run,
// plus the seed passed to srand() before the SAD inputs are generated.
#define NUM_DIM_SAD         5
#define NUM_POINTS_SAD      10
//#define NUM_LOOPS_SAD       20 
#define SAD_SEED            2014
//NUM_LOOPS * NUM_POINTS should be 75M

//SSD DEFINES
// Same set of parameters for an SSD run.
#define NUM_DIM_SSD         128
#define NUM_POINTS_SSD      150000
//#define NUM_LOOPS_SSD       1000
#define SSD_SEED            2048
//NUM_LOOPS * NUM_POINTS should be 150M


// Threadblock sizes (e.g. for kernels )
#define TS 5

// =================================================================================================

// OpenCL C source of the SAD kernel, kept as a string so it can be handed to
// clCreateProgramWithSource at run time.
//
// Each work-item handles one point: it prints the inputs it sees, accumulates
// |m1_set[i] - m2_set[i]| over the point's num_dim_sad values into an
// unsigned char, and stores the result in sad_gpu[Point].
//
// Work-items whose global id is >= num_points_sad return immediately, so the
// kernel stays in bounds even if the host launches more work-items than there
// are points (e.g. a global size rounded up to a work-group multiple).
//
// Every source line ends in a real newline so that line numbers reported in
// the OpenCL build log point at the right statement.
const char* kernelstring =
"__kernel void SAD(const int num_points_sad, const int num_dim_sad,\n"
"                      const global unsigned char* m1_set,\n"
"                      const global unsigned char* m2_set,\n"
"                      global unsigned char* sad_gpu) {\n"
"    const int Point = get_global_id(0);\n"
"    if (Point >= num_points_sad) { return; }\n"
"    unsigned char acc = 0;\n"
"    printf(\" POINT: %d \\n \", Point);\n"
"    for (int s=0; s<num_dim_sad ; s++) {\n"
"        printf(\"GPU: i = %d | m1_set = %d| m2_set = %d \\n \",Point*num_dim_sad + s,m1_set[Point*num_dim_sad+s],m2_set[Point*num_dim_sad+s]);}\n"
"    for (int k=0; k<num_dim_sad; k++) {\n"
"        acc += abs( m1_set[Point*num_dim_sad + k] - m2_set[Point*num_dim_sad + k] );\n"
"    }\n"
"    printf(\"ACC: %d \\n \",acc);\n"
"    sad_gpu[Point] = acc;\n"
"}\n";


// =================================================================================================

// Matrix-multiplication using a custom OpenCL SGEMM kernel.
int main() {

    cout << "Computing naive SAD & SSD for result checking" << endl;
    //naive implementation on CPU for result checking
    uint8_t* m1_set;// [NUM_POINTS][NUM_DIM];
    uint8_t* m2_set;// [NUM_POINTS][NUM_DIM];

    m1_set = (uint8_t*)malloc(sizeof(uint8_t*) * NUM_POINTS_SAD * NUM_DIM_SAD);
    m2_set = (uint8_t*)malloc(sizeof(uint8_t*) * NUM_POINTS_SAD * NUM_DIM_SAD);

    uint8_t* sad;    //   [NUM_POINTS];
    uint8_t* sad_gpu;//   [NUM_POINTS];
    sad =     (uint8_t*)malloc(sizeof(uint8_t) * NUM_POINTS_SAD);
    sad_gpu = (uint8_t*)malloc(sizeof(uint8_t) * NUM_POINTS_SAD);

    srand(SAD_SEED);
    for (int i = 0; i < NUM_POINTS_SAD * NUM_DIM_SAD; i++)
    {
        sad[i/NUM_DIM_SAD] = 0;
        m1_set[i] = rand() / (uint8_t)RAND_MAX;
        m2_set[i] = rand() / (uint8_t)RAND_MAX;
        cout << "CPU: i = " << i << "| m1_set = " << (unsigned int)m1_set[i] << "| m2_set = " << (unsigned int)m2_set[i] << endl;
    }

    for (int i = 0; i < NUM_POINTS_SAD * NUM_DIM_SAD; i++)
         sad[i/NUM_DIM_SAD] += abs(m1_set[i] - m2_set[i]);

    cl_int err;

    // Configure the OpenCL environment
    printf(">>> Initializing OpenCL...\n");
    cl_platform_id platform = USE_PLATFORM_NR;
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) { cout << err << "clGetPlatformId"; return -1;}
    cl_device_id device = 0;
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) { cout << err << "clGetDeviceIDs"; return -1; }
    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateContext"; return -1; }
    cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateCommandQueue"; return -1; }
    char deviceName[1024];
    err = clGetDeviceInfo(device, CL_DEVICE_NAME, 1024, deviceName, NULL);
    if (err != CL_SUCCESS) { cout << err << "clGetDeviceInfo"; return -1; }
    cl_event event = NULL;

    // Compile the kernel
    cl_program program = clCreateProgramWithSource(context, 1, &kernelstring_sad, NULL, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateProgramWithSource"; return -1; }
    err = clBuildProgram(program, 0, NULL, "", NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clBuildProgram"; return -1; }


    // Check for compilation errors
    size_t logSize;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    if (err != CL_SUCCESS) { cout << err << "clGetProgramBuildInfo"; return -1; }
    char* messages = (char*)malloc((1 + logSize) * sizeof(char));
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, messages, NULL);
    if (err != CL_SUCCESS) { cout << err << "clGetProgramBuildInfo2"; return -1; }
    messages[logSize] = '\0';
    if (logSize > 10) { printf(">>> Compiler message: %s\n", messages); }
    free(messages);


    // Prepare OpenCL memory objects
    cl_mem buf_m1 = clCreateBuffer(context, CL_MEM_READ_ONLY, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), NULL, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_m1"; return -1; }
    cl_mem buf_m2 = clCreateBuffer(context, CL_MEM_READ_ONLY, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), NULL, &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_m2"; return -1; }
    cl_mem buf_sad = clCreateBuffer(context, CL_MEM_READ_WRITE, NUM_POINTS_SAD * sizeof(uint8_t), NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_sad"; return -1; }

    // Copy matrices to the GPU
    err = clEnqueueWriteBuffer(queue, buf_m1, CL_TRUE, 0, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), m1_set, 0, NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_m1"; return -1; }
    err = clEnqueueWriteBuffer(queue, buf_m2, CL_TRUE, 0, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), m2_set, 0, NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_m2"; return -1; }
    err = clEnqueueWriteBuffer(queue, buf_sad, CL_TRUE, 0, NUM_POINTS_SAD * sizeof(uint8_t), sad_gpu, 0, NULL, NULL);
    if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_sad"; return -1; }

    // Configure the kernel and set its arguments
    int num_points_sad = NUM_POINTS_SAD;
    int num_dim_sad =    NUM_DIM_SAD;
    cl_kernel kernel = clCreateKernel(program, "SAD", &err);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel"; return -1; }
    err = clSetKernelArg(kernel, 0, sizeof(int), (void*)&num_points_sad);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg0"; return -1; }
    err = clSetKernelArg(kernel, 1, sizeof(int), (void*)&num_dim_sad);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg1"; return -1; }
    err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buf_m1);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg2"; return -1; }
    err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&buf_m2);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg3"; return -1; }
    err = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&buf_sad);
    if (err != CL_SUCCESS) { cout << err << "clCreateKernel4"; return -1; }

    // Start the timed loop
    printf(">>> Starting SAD GPU run...\n");
    std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

 //   const size_t local[1] = { TS };
    const size_t global[1] = { NUM_POINTS_SAD };
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, &event); //local
    if (err != CL_SUCCESS) { cout << err << "clEnqueueNDRangeKernel"; return -1; }
    // Wait for calculations to be finished
    clWaitForEvents(1, &event);

    // End the timed loop
    std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();

    // Copy the output matrix C back to the CPU memory
    clEnqueueReadBuffer(queue, buf_sad, CL_TRUE, 0, NUM_POINTS_SAD * sizeof(uint8_t), sad_gpu, 0, NULL, NULL);
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count();
    std::cout << "Time difference = " << us << " us " << std::endl;
    // Free the OpenCL memory objects
    clReleaseMemObject(buf_m1);
    clReleaseMemObject(buf_m2);
    clReleaseMemObject(buf_sad);

    // Clean-up OpenCL 
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    clReleaseProgram(program);
    clReleaseKernel(kernel);

    for (int i = 0; i < NUM_POINTS_SAD; i++)
    {
        cout << "i: " << i;
        cout << " | CPU: " << (unsigned int)sad[i];
        cout << " | GPU: " << (unsigned int)sad_gpu[i];
        cout << endl;
    }
    // Free the host memory objects
    free(m1_set);
    free(m2_set);
    free(sad);
    free(sad_gpu);

    // Exit
    return 0;
}

Check return codes of cl* functions, the answer is very likely there. — doqtor
– doqtor, Commented May 1, 2020 at 15:18
@doqtor Thank you for your response. I have checked the return codes of all cl* functions used before clEnqueueNDRangeKernel and they all return 0. — George-Cristian Costache
– George-Cristian Costache, Commented May 1, 2020 at 18:31
Can you show us how you do it? You can update your question with the fixed code. — doqtor
– doqtor, Commented May 2, 2020 at 5:46
@doqtor I have updated the code above. Checking the output stream, no "ERROR!" has been printed. — George-Cristian Costache
– George-Cristian Costache, Commented May 2, 2020 at 9:10
Why do you use malloc in C++ code? Why not use std::vector for arrays? — doqtor
– doqtor, Commented May 2, 2020 at 9:30

doqtor · Accepted Answer · 2020-05-02 11:36:32Z

1

There is an error in the call that creates the context: one of the arguments is passed in the wrong position.

Instead:

cl_context context = clCreateContext(NULL, 1, &device, NULL, &err, NULL);

Should be:

cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
                                                             ^^^^^^^^^^

Also, the way the errors are output is still not very helpful. It should be something like this:

cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err != CL_SUCCESS)
{
    cout << err << "clCreateContext";
    return -1;
}

This way we stop the code execution when the error occurred and we know for which function it happened.

======= UPDATE ========================================================

The wrong type is being used in the kernel: in OpenCL, uint8 is a vector type (a vector of 8 values of type uint, i.e. unsigned int), not an 8-bit unsigned integer.

To fix the problem, use the uchar/unsigned char type in the OpenCL kernel, which is the equivalent of uint8_t/unsigned char in C++.

See OpenCL data types and Scalar data types.

edited May 2, 2020 at 11:36

answered May 2, 2020 at 10:42

doqtor

8,4942 gold badges22 silver badges36 bronze badges

Sign up to request clarification or add additional context in comments.

2 Comments

George-Cristian Costache Over a year ago

Once again, thank you for your suggestion. I changed the way the errors are output as per your suggestion and I have fixed the position of the err parameter, and I have updated the code above accordingly. Unfortunately, I still do not receive any OpenCL errors and, as a result, the code continues executing until the end, displaying wrong results. (If the console output helps, it is the same as before; I shall leave it here: link.)

George-Cristian Costache Over a year ago

Yes, you were right! The updated answer solved my problem! I have marked the question as solved (if I have done it incorrectly, please let me know so I can fix it). Once again, thank you very much for taking the time to answer my question and for providing continuous feedback, and I wish you all the best!

Collectives™ on Stack Overflow

OpenCL Memory Buffer not passing the right values to kernel

1 Answer 1

2 Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

2 Comments

Your Answer

Sign up or log in

Post as a guest

Related