I am trying to pass a char array containing 10000 words, read from a text file in the main function, to a CUDA kernel function.
The words are transferred from the host to device like this:
(main function code:)
//.....
// Host side: read the words into fixed-width slots, upload them to the GPU,
// and launch one thread per slot to print them.
const int text_length = 20;   // bytes per word slot, including the terminating '\0'
const int max_words = 10000;  // number of slots in the host buffer

// The host buffer has a fixed capacity and a zero-block grid is an invalid
// launch configuration, so reject counts outside [1, max_words] up front.
if (number_of_words <= 0 || number_of_words > max_words)
{
    cerr << "word count out of range: " << number_of_words << endl;
    return 1;
}

// The trailing () value-initializes the buffer, so the bytes after each
// word's terminator are zeros instead of indeterminate heap contents.
char (*wordList)[text_length] = new char[max_words][text_length]();
char *dev_wordList = 0;
for (int i = 0; i < number_of_words; i++)
{
    // Limit the extraction to text_length - 1 characters plus '\0' so an
    // over-long word cannot overrun its slot. operator>> resets the width
    // after every extraction, so it has to be set again on each iteration.
    file.width(text_length);
    file >> wordList[i];
    cout << wordList[i] << endl;
}

// Setup execution parameters
int threads_per_block = 256;
int n_blocks = (number_of_words + threads_per_block - 1) / threads_per_block;
dim3 grid(n_blocks, 1, 1);
dim3 threads(threads_per_block, 1, 1);

// The grid is rounded up to whole blocks and testKernel reads one slot per
// thread without a bounds check, so the device buffer is sized (and zeroed)
// for every launched thread rather than only for number_of_words slots.
const size_t word_bytes = (size_t)number_of_words * text_length * sizeof(char);
const size_t padded_bytes = (size_t)n_blocks * threads_per_block * text_length * sizeof(char);

cudaError_t err = cudaMalloc((void**)&dev_wordList, padded_bytes);
if (err == cudaSuccess)
    err = cudaMemset(dev_wordList, 0, padded_bytes);
if (err == cudaSuccess)
    err = cudaMemcpy(dev_wordList, &(wordList[0][0]), word_bytes, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
    cerr << "CUDA upload failed: " << cudaGetErrorString(err) << endl;
    cudaFree(dev_wordList);
    delete[] wordList;
    return 1;
}

cudaPrintfInit();
testKernel<<<grid, threads>>>(dev_wordList);
err = cudaGetLastError();           // launch-configuration errors
if (err == cudaSuccess)
    err = cudaDeviceSynchronize();  // errors raised while the kernel ran
if (err != cudaSuccess)
    cerr << "testKernel failed: " << cudaGetErrorString(err) << endl;
else
    cudaPrintfDisplay(stdout,true);
cudaPrintfEnd();
// NOTE: release the buffers with cudaFree(dev_wordList) and delete[] wordList
// once nothing else needs them.
(kernel function code:)
/*
 * Debug kernel: every thread prints the first 10 characters of one word slot.
 *
 * d_wordList is treated as a flat buffer of 20-byte slots; the thread with
 * global index i reads bytes [i * 20, i * 20 + 9].
 *
 * Only the .x components of blockIdx/blockDim/threadIdx are used, so a 1D
 * launch is expected. There is no bounds check: the buffer must contain a
 * slot for every launched thread, not just for every word.
 */
__global__ void testKernel(char* d_wordList)
{
    // Flat global thread index.
    const unsigned int slot = blockIdx.x * blockDim.x + threadIdx.x;

    // Start of this thread's 20-byte slot.
    const char* word = d_wordList + slot * 20;

    cuPrintf("!! %c%c%c%c%c%c%c%c%c%c \n",
             word[0], word[1], word[2], word[3], word[4],
             word[5], word[6], word[7], word[8], word[9]);
}
Is there an easier way to manipulate them? (I would like to have one word per element/position.) I tried using std::string from <string>, but I can't use it in CUDA device code.