Create 2D Array with CUDA

Question

In the CUDA C Programming Guide there is a sample that shows a 2D array:

// Kernel definition
// Element-wise sum of two N x N matrices: C[i][j] = A[i][j] + B[i][j].
// Indices come from a 2D launch: the x dimension selects the first index,
// the y dimension selects the second. Each thread writes at most one element.
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that land outside the matrix have nothing to do.
    if (row >= N || col >= N)
        return;

    C[row][col] = A[row][col] + B[row][col];
}

int main()
{
...
// Kernel invocation
// 256 threads per block, arranged 16 x 16.
dim3 threadsPerBlock(16, 16);
// Integer division truncates: the grid spans all N x N elements only when
// N is a multiple of the block dimension (16).
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
// NOTE(review): the setup of A, B and C is elided above; they need to refer
// to memory the device can access — confirm against the full sample.
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
...
}

I use a 2D array in the form below, and it works correctly:

// NOTE(review): this declares a COLUMNS x ROWS array of dim3 objects, not a
// single dim3 whose x/y members hold the grid dimensions.
dim3 grid[COLUMNS][ROWS];
// The second launch argument (1) requests one thread per block.
kernel_Matrix<<<grid,1>>>(dev_strA, dev_strB, dev_Matrix);

// Element-wise sum of two flattened 2D integer arrays: c[i] = a[i] + b[i].
// The flat index is built from blockIdx only (blockIdx.y picks the row,
// blockIdx.x the column, rows are COLUMNS elements wide), so every thread
// of a block addresses the same element.
__global__ void add(int *a, int *b, int *c)
{
    const int idx = blockIdx.y * COLUMNS + blockIdx.x;
    c[idx] = a[idx] + b[idx];
}

Is there a way to implement a 2D array using the [ ][ ] syntax? I tried this approach, but it does not work.

Maybe I misunderstood your question. What are you trying to do? Pass 2D array to your kernel function or define 2D grid? — stuhlo
– stuhlo, Commented Apr 6, 2013 at 19:54
@stuhlo: I want to pass a 2D array to the kernel and access it in the __global__ void function using [ ][ ] indexing. — mahdimb
– mahdimb, Commented Apr 6, 2013 at 20:11
You need to fix your dim3 grid definition as stuhlo indicated. Then you may want to look at this question for some ideas. Usually flattening a 2D array is easiest, but if you have a fixed size array, you can use the approach in the first example I gave in my answer to that question. — Robert Crovella
– Robert Crovella, Commented Apr 6, 2013 at 20:35
I added host code that allocates memory on device, copies data from host to device memory, launches kernel and finally copies data from device to host memory. Don't forget to provide CUDA calls with error checking. — stuhlo
– stuhlo, Commented Apr 7, 2013 at 11:03

stuhlo · Accepted Answer · 2013-04-07 02:54:30Z

dim3 is not an array but a structure defined in a CUDA header file (vector_types.h). This structure is used to specify the dimensions of the grid in the execution configuration of __global__ functions, i.e. inside <<< >>>. It doesn't hold the 'real' blocks; it just configures the number of blocks that will be executed.

The only two ways (to my knowledge) to initialize this structure are:
1. dim3 grid(x, y, z);
2. dim3 grid = {x, y, z};

EDIT: Host code with dim3 initialization that passes the arrays to the kernel function in such a way that you will be able to access their elements via [][]:

// Host-side driver for MatAdd: fills A and B on the host, copies them to the
// device, launches the kernel and copies the result C back.
// Error checking is omitted for brevity: every CUDA runtime call returns a
// cudaError_t that real code should test, and a launch should be followed by
// cudaGetLastError().
float A[N][N];
float B[N][N];
float C[N][N];

// Pointers to rows of N floats, so the kernel can index them as [i][j].
float (*d_A)[N];
float (*d_B)[N];
float (*d_C)[N];

const size_t bytes = (N * N) * sizeof(float);

for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
        A[i][j] = i;
        B[i][j] = j;
    }
}

// Allocation: one contiguous N*N buffer per matrix.
cudaMalloc((void**)&d_A, bytes);
cudaMalloc((void**)&d_B, bytes);
cudaMalloc((void**)&d_C, bytes);

// Copy the inputs from host to device. C is output-only, so its
// (uninitialised) host contents are not uploaded.
cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);

// Kernel invocation.
// Round the block count up so the grid still covers the whole matrix when N
// is not a multiple of 16; the kernel's (i < N && j < N) guard discards the
// surplus threads.
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
               (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C);

// Copy the result from device to host. This blocking copy also waits for the
// kernel to finish. A and B are not modified by the kernel, so they are not
// copied back.
cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost);

// Release the device buffers.
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);

Collectives™ on Stack Overflow

Create 2D Array with CUDA

1 Answer 1

Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related