2

I want to implement CDP (CUDA Dynamic Parallelism) for a basic forward function. I will call the forward function many times concurrently — including from within a CUDA kernel — which is why I want to use CDP.

Here's the code that I'm trying to run;

// Child kernel: accumulates the weighted sum result[o] += values[i] * W[o][i]
// for every (output, input) pair, one pair per thread.
// Must be __global__ (not __device__): only __global__ functions can be
// launched with <<<...>>> — this was the source of the compile error.
// Launch with at least inputsize * outputsize threads.
// Assumes weigths is row-major [outputsize][inputsize] — TODO confirm against
// the host-side allocation.
// NOTE: atomicAdd on double requires compute capability 6.0+ (sm_75 is fine).
__global__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // Decompose the flat index into (output, input). The original divided and
    // took the modulo both by outputsize, which is only consistent when
    // inputsize == outputsize; divide by inputsize so the pair covers the
    // full inputsize * outputsize range exactly once.
    int outputidx = idx / inputsize;
    int inputidx = idx % inputsize;

    // Guard the grid tail: inputidx < inputsize holds by construction, only
    // outputidx can run past the end.
    if (outputidx >= outputsize) {
        return;
    }

    atomicAdd(&result[outputidx], values[inputidx] * weigths[inputsize*outputidx + inputidx]);
}

// Child kernel: adds the bias and applies the logistic sigmoid in place.
// Must be __global__ (not __device__): only __global__ functions can be
// launched with <<<...>>> — this was the source of the compile error.
// Launch with at least `size` threads, one per output element.
__global__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Guard the grid tail.
    if (idx >= size) {
        return;
    }

    // Sigmoid: 1 / (1 + e^-(x + b)).
    result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}

// Parent kernel: one feed-forward step (weighted accumulation, then
// activation) via CUDA Dynamic Parallelism. Launch with <<<1, 1>>>.
// NOTE(review): the two child functions must themselves be declared
// __global__ for the device-side <<<...>>> launches to compile.
__global__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    // Ceil-divide so every (output, input) pair gets a thread.
    int numblocks = (inputsize * outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
    NNFeedForwardNormalMultiple<<<numblocks, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
    // Do NOT call cudaDeviceSynchronize() here: device-side synchronization
    // was legacy CDP1 and is unsupported under CDP2 (CUDA 12+). It is also
    // unnecessary — child kernels launched into the same stream are
    // serialized, so the activation below runs after the accumulation
    // kernel has completed.
    NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}

I also tried to run the function from a device function like this but still gave me the same error;

// Accumulates result[outputidx] += values[inputidx] * weight, one
// (output, input) pair per thread, via atomicAdd (double atomicAdd needs
// compute capability 6.0+).
// NOTE(review): declared __device__, yet launched with <<<...>>> by the
// caller below — only __global__ functions can be launched as kernels, so
// this cannot compile as written.
// NOTE(review): both the division and the modulo use outputsize; that maps a
// flat index of inputsize * outputsize threads consistently only when
// inputsize == outputsize — verify the intended (output, input) decomposition
// and the weigths layout implied by the stride outputsize.
__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int outputidx = idx / outputsize;
    int inputidx = idx % outputsize;

    // Bounds guard for threads past the end of the index space.
    if (outputidx >= outputsize || inputidx >= inputsize) {
        return;
    }

    atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}

// Adds the bias and applies the logistic sigmoid to result[idx] in place,
// one thread per element.
// NOTE(review): declared __device__, yet launched with <<<...>>> by the
// caller below — only __global__ functions can be launched as kernels, so
// this cannot compile as written.
__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Bounds guard for threads past the end of the array.
    if (idx >= size) {
        return;
    }

    // Sigmoid: 1 / (1 + e^-(x + b)).
    result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}

// Launches the accumulation kernel followed by the activation kernel
// (CUDA Dynamic Parallelism).
// NOTE(review): the compile error comes from the callees, not from this
// function being __device__ — device-side <<<...>>> launches are legal under
// CDP, but the launched functions themselves must be __global__, and both
// callees above are declared __device__.
// NOTE(review): no ordering problem here under CDP2 — child kernels launched
// into the same stream are serialized, so no device-side synchronize is
// needed between the two launches.
__device__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    // Ceil-divide so every (output, input) pair gets a thread.
    int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    
    NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
    NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}

// Host-launchable entry point: forwards all arguments to the __device__
// feed-forward routine above. Launch with <<<1, 1>>> — every thread of this
// kernel would otherwise perform the child launches redundantly.
__global__ void NNFeedForwardNormalWrapper(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    NNFeedForwardNormal(values, weigths, result, biases, inputsize, outputsize);
}

And also tried cudaLaunchKernel function and using __global__ instead of __device__ but they didn't work either. I'm using -rdc=true flag too and also my arch is sm_75 which should support CDP.

6
  • 1
    "And also tried cudaLaunchKernel function and using global instead of device but they didn't work either." A device function isn't a kernel function and therefore can not be launched. So the error when using __global__ would be more interesting. Commented Mar 6 at 12:02
  • 1
    This does not look like a good use-case for CDP. CDP is useful when the amount of parallelism is data-dependent. Here you already know the amount of parallelism on the host and can just launch kernels from the host. Commented Mar 6 at 13:46
  • Thank you for your reply @paleonix! I’m trying to build a NEAT algorithm that requires numerous "forward and get result" steps. Since the forward function needs to be called repeatedly, I believe I have to use CDP. The code works fine when I switch from device to global, but now I’m facing an issue with thread synchronization. The process relies on the first function completing before moving on, as it’s essential for obtaining accurate results with the activation function. Is there a way to ensure that a parallelly called function finishes before proceeding with the rest of the code? Commented Mar 6 at 14:38
  • 1
    Sounds like you should read up on CUDA streams. I still don't see how CDP is warranted here. Also keep in mind that legacy CDP1 where you can call cudaDeviceSynchronize() in device code will not run on recent GPUs. It was replaced with CDP2 which does not allow using results from child kernels in the parent kernel. Commented Mar 6 at 14:41
  • "Since the forward function needs to be called repeatedly" Is the number of invocations/repetitions data-dependent? While that could be a reason to use CDP, alternatively one could use the fancy new conditional/while nodes for CUDA Graphs. Commented Mar 6 at 15:02

1 Answer 1

1

The error is exactly what it says on the tin.
If you want to configure a call to a function with <<<x,y,z>>> parameters, it needs to be a __global__ function.

It is perfectly valid to launch a __global__ function from another __global__ function — that is precisely what CUDA Dynamic Parallelism allows.

if you want your code to compile, recode it like so:

#define restrict __restrict__
__global__ void NNFeedForwardNormalMultiple(double* restrict values, double* restrict weights, double* result, int inputsize, int outputsize) {
    ...
}

__global__ void NNFeedForwardNormalActivate(double* restrict biases, double* result, int size) {
   ...
}

__global__ void NNFeedForwardNormal(double* restrict values, double* restrict weights, double* result, double* restrict biases, int inputsize, int outputsize) {
    ...
}

__global__ void NNFeedForwardNormalWrapper(double* restrict values, double* restrict weights, double* result, double* restrict biases, int inputsize, int outputsize) {
    const auto blocks = some calculation;
    const auto threads_per_block = some calculation;
    NNFeedForwardNormal<<<blocks, threads_per_block>>>(values, weights, result, biases, inputsize, outputsize);
}

If instead you want to limit the number of threads in a device function, you can do it like below. Note that you can only limit the number of blocks/threads, you cannot expand it.
As long as you make sure to keep the bounds at warp edges, this will incur no slowdown.

__device__ void first_ten_blocks() { ... }

__device__ void other_blocks() { ... }

// Dispatch on the block index: the first ten blocks of the grid run one
// device routine, every remaining block runs the other.
__global__ void start() {
    if (blockIdx.x >= 10) {
        other_blocks();
    } else {
        first_ten_blocks();
    }
}

__device__ void first_warp() { ... }
__device__ void other_warps() { ... }
// Dispatch on the thread index: the first warp (threads 0-31) of each block
// runs one device routine, all remaining warps run the other.
__global__ void warp_split() {
    if (threadIdx.x >= 32) {
        other_warps();
    } else {
        first_warp();
    }
}

// Demonstrates that the block/warp split depends only on the launch
// configuration. (Fixed: the last three annotations were missing their `//`
// comment markers, which made the example fail to compile.)
int main() {
    start<<<100, 32>>>(); // 10 blocks run first_ten_blocks, 90 blocks run other_blocks
    start<<<9, 32>>>();   // 9 blocks run first_ten_blocks, no other blocks

    warp_split<<<1, 64>>>();   // 1 warp in first_warp, 1 in other_warps
    warp_split<<<10, 32>>>();  // 10x warp in first_warp, no other warps
    warp_split<<<48, 512>>>(); // 48x first_warp, 48x other_warps with 512-32 = 480 threads each

    cudaDeviceSynchronize(); // wait for the example kernels before exiting
    return 0;
}
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.