I'm having a bit of an issue understanding how I could implement CUDA shared memory here, since I'm not using the thread index for anything other than deciding which portion of the work each thread should do.
/*
 * Apply a precomputed histogram-equalization lookup table to an image.
 *
 * img_out      - device buffer receiving the equalized image (img_size bytes)
 * img_in       - device buffer holding the source image (img_size bytes)
 * hist_in      - unused here; kept so the launch signature stays compatible
 * img_size     - number of pixels to process
 * nbr_bin      - unused here; kept for signature compatibility
 * numOfThreads - unused here; kept for signature compatibility (the grid-stride
 *                loop below works correctly for ANY launch configuration)
 * lut          - lookup table indexed by input pixel value; entries may exceed
 *                255 and are clamped on output
 *
 * Uses a grid-stride loop so that:
 *  - every pixel is processed even when img_size is not divisible by the
 *    thread count (the old chunking dropped the tail pixels), and
 *  - no thread ever indexes past img_size (the old guard checked
 *    x >= img_size, but start = (img_size/numOfThreads)*x could still run
 *    out of bounds for x >= numOfThreads), and
 *  - adjacent threads touch adjacent addresses, so global loads/stores
 *    coalesce (the old per-thread contiguous chunks did not).
 */
__global__ void gpu_histogram_equalization(unsigned char * img_out, unsigned char * img_in,
int * hist_in, int img_size, int nbr_bin, int numOfThreads, int * lut){
    int stride = blockDim.x * gridDim.x;
    for(int i = threadIdx.x + blockDim.x * blockIdx.x; i < img_size; i += stride){
        int v = lut[img_in[i]];
        /* Clamp values above 255; other values are truncated to a byte,
           matching the original cast behavior. */
        img_out[i] = (v > 255) ? 255 : (unsigned char)v;
    }
}
Can anyone confirm whether my speculation is correct — that it is not possible to make use of shared memory in this kernel?