I have a loop within which I am launching multiple kernels onto a GPU. Below is the snippet:
/*
 * For every idx in [start, end], enqueue two rounds of work on command_queue:
 *   round 1: memset_kernel -> cholesky_kernel -> ckf_kernel1, then clFinish
 *   round 2: memset_kernel -> cholesky_kernel -> ckf_kernel2 (argument 4 = idx)
 *
 * Every OpenCL return code is checked with ASSERT_CL, including the ones from
 * clFinish and clSetKernelArg, so a failed argument update or a failed
 * synchronisation is reported instead of being silently overwritten by the
 * next call's status.
 */
for (int idx = start; idx <= end; idx++) {
    /* Round 1 */
    ret = clEnqueueNDRangeKernel(command_queue, memset_kernel, 1, NULL,
                                 &global_item_size_memset, &local_item_size,
                                 0, NULL, NULL);
    ASSERT_CL(ret, "Error after launching 1st memset_kernel !");

    ret = clEnqueueNDRangeKernel(command_queue, cholesky_kernel, 1, NULL,
                                 &global_item_size_cholesky, &local_item_size,
                                 0, NULL, NULL);
    ASSERT_CL(ret, "Error after launching 1st cholesky_kernel !");

    ret = clEnqueueNDRangeKernel(command_queue, ckf_kernel1, 1, NULL,
                                 &global_item_size_kernel1, &local_item_size,
                                 0, NULL, NULL);
    ASSERT_CL(ret, "Error after launching ckf_kernel1[i] !");

    /* Block the host until everything queued so far has completed. */
    ret = clFinish(command_queue);
    ASSERT_CL(ret, "Error in clFinish after ckf_kernel1 !");

    /* Round 2 */
    ret = clEnqueueNDRangeKernel(command_queue, memset_kernel, 1, NULL,
                                 &global_item_size_memset, &local_item_size,
                                 0, NULL, NULL);
    ASSERT_CL(ret, "Error after launching 2nd memset_kernel !");

    ret = clEnqueueNDRangeKernel(command_queue, cholesky_kernel, 1, NULL,
                                 &global_item_size_cholesky, &local_item_size,
                                 0, NULL, NULL);
    ASSERT_CL(ret, "Error after launching 2nd cholesky_kernel !");

    /* ckf_kernel2 receives the current loop index as its argument 4. */
    ret = clSetKernelArg(ckf_kernel2, 4, sizeof(idx), &idx);
    ASSERT_CL(ret, "Error setting idx argument of ckf_kernel2 !");

    ret = clEnqueueNDRangeKernel(command_queue, ckf_kernel2, 1, NULL,
                                 &global_item_size_kernel2, &local_item_size,
                                 0, NULL, NULL);
    ASSERT_CL(ret, "Error after launching ckf_kernel2 !");
    /* NOTE(review): the excerpt ends here; the loop's closing brace is not shown. */
Now I want to use this code on a system that has multiple GPUs. So far I have completed the following steps:
- created a single context for all the GPUs.
- created one command queue per device.
- created separate kernels for each device (code snippet below assuming two gpus)
- allocated separate device buffers for each device
cl_kernel ckf_kernel1[2]; cl_kernel ckf_kernel2[2]; cl_kernel cholesky_kernel[2]; cl_kernel memset_kernel[2]; // read get kernel. ckf_kernel1[0] = clCreateKernel(program, "ckf_kernel1", &ret); ASSERT_CL(ret, "Cannot load ckf_kernel1[i]!"); ckf_kernel2[0] = clCreateKernel(program, "ckf_kernel2", &ret); ASSERT_CL(ret, "Cannot load ckf_kernel2!"); memset_kernel[0] = clCreateKernel(program, "memset_zero", &ret); ASSERT_CL(ret, "Cannot load memset_kernel!"); cholesky_kernel[0] = clCreateKernel(program, "cholesky_kernel", &ret); ASSERT_CL(ret, "Cannot load cholesky_kernel!"); ckf_kernel1[1] = clCreateKernel(program, "ckf_kernel1", &ret); ASSERT_CL(ret, "Cannot load ckf_kernel1[i]!"); ckf_kernel2[1] = clCreateKernel(program, "ckf_kernel2", &ret); ASSERT_CL(ret, "Cannot load ckf_kernel2!"); memset_kernel[1] = clCreateKernel(program, "memset_zero", &ret); ASSERT_CL(ret, "Cannot load memset_kernel!"); cholesky_kernel[1] = clCreateKernel(program, "cholesky_kernel", &ret); ASSERT_CL(ret, "Cannot load cholesky_kernel!");
Now, I am not sure how to launch the kernels onto the different devices within the loop. How can I get them to execute in parallel? Please note that there is a clFinish call within the loop above.
Another question: is it standard practice to use multiple threads/processes on the host where each thread/process is responsible for launching kernels on a single GPU?