0

I'm trying to write a simple array sum between two dynamic arrays using double pointers, both for host and device arrays. The following code can be compiled normally, but at the end the array h_c, containing the result of the sum, is full of zeros. I think that the device arrays are bad allocated and the host arrays aren't transferred properly. Can someone fix the problem? Thanks.

#include <stdio.h>
#include <cuda_runtime.h>
#define N 16
#define BLOCK_DIM 4

__global__ void matrixAdd (int **a, int **b, int **c) {

int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;

if (col < N && row < N) 
c[row][col] = a[row][col] + b[row][col];}

int** create(int row, int col){
  int i,j;
  int** temp;

  temp=(int**) malloc(row*sizeof(int*));
  for(i=0;i<row;i++)
    temp[i]=(int*) malloc(row*sizeof(int));

  for(i=0;i<row;i++)
    for(j=0;j<col;j++)
    temp[i][j]=0;

 return(temp);}

void destroy(int **temp,int rows){
  int i;

  for(i=0;i<rows;i++)
    free(temp[i]);
  free(temp);
 }

int main() {
int i,j;
int** h_a=create(N,N);
int** h_b=create(N,N);
int** h_c=create(N,N);
int **dev_a, **dev_b, **dev_c;
int size = N * N * sizeof(int);

cudaMalloc((void**)&dev_a, size);
cudaMalloc((void**)&dev_b, size);
cudaMalloc((void**)&dev_c, size);

for(i=0;i<N;i++)  
   for(j=0;j<N;j++)
    h_a[i][j]=5;

for(i=0;i<N;i++)  
   for(j=0;j<N;j++)
    h_b[i][j]=15;  


cudaMemcpy(dev_a, h_a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, h_b, size, cudaMemcpyHostToDevice);

dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
dim3 dimGrid((int)ceil(N/dimBlock.x),(int)ceil(N/dimBlock.y));

matrixAdd<<<dimGrid,dimBlock>>>(dev_a,dev_b,dev_c);
cudaMemcpy(h_c,dev_c, size, cudaMemcpyDeviceToHost);

for(i=0;i<N;i++)
{  
   for(j=0;j<N;j++)
    printf("%d ",h_c[i][j]);
   printf("\n");
}

cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);

destroy(h_a,N); destroy(h_b,N); destroy(h_c,N);

return 0;}
3
  • 1
    hover your mouse over the cuda tag. When the pop up appears, click on "info". In the page that opens up, click on "Using Arrays of pointers in CUDA". Start reading. Using dynamically allocated 2D arrays is non-trivial, and you have two errors. 1. You must properly allocate the device array so that the pointers can be chased. 2. Your host array cannot be conveniently allocated with a bunch of malloc operations like you are doing. You need a guaranteed contiguous array on the host, if you dont want to have to use a loop to transfer data from host to device and back. Commented Mar 9, 2017 at 14:52
  • Thank you talonmies. Yes, my question is similar to the one you cited. The following code works! But I'm not able to understand why it is necessary to define h_a and a casting to (void***) for d&a. Can someone explain me it? Thanks. Commented Mar 15, 2017 at 14:06
  • int** aa=create(2,2); aa[0][0]=1; aa[0][1]=2;aa[1][0]=3;aa[1][1]=4; int** h_a = (int**)malloc(2 * sizeof(int*)); for(int i=0; i<2;i++){ cudaMalloc((void**)&h_a[i], 2*sizeof(int)); cudaMemcpy(h_a[i], &aa[i][0], 2*sizeof(int), cudaMemcpyHostToDevice); } int** d_a; cudaMalloc((void ***)&d_a, 2 * sizeof(int*)); cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice); Commented Mar 15, 2017 at 14:12

0

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.