I'm trying to launch a kernel in the attached code. I'm getting the massage "kernel launched failed:invalid argument".
// System includes
#include <stdio.h>
#include <assert.h>
// CUDA runtime
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors(cudaError err, const char *file, const int line )
{
if(cudaSuccess != err)
{
fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
exit(-1);
}
}
static const int MAX_FILTER_WIDTH = 7;
char *image_filename = "lena_bw_big.pgm";
char *out_filename = "lena_bw.out.pgm";
char *results_filename = "results.log";
// Loads filter configuration parameters from the command line
void load_filter(int argc, char** argv, int* filt_width, float* factor, float* bias, float* coefs, bool* use_shared)
{
//forward declaration of a function that is being used here
void parse_coefs(const char* coefs_txt, int radius, float* coefs);
char* coefs_txt;
if (argv==NULL || filt_width==NULL || factor==NULL || bias==NULL || coefs==NULL)
{
printf("Error: Bad params to load_coefs\n");
exit(-1);
}
if (checkCmdLineFlag(argc, (const char **)argv, "filter_width"))
{
*filt_width = getCmdLineArgumentInt(argc, (const char **)argv, "filter_width");
if (*filt_width < 1 || *filt_width > MAX_FILTER_WIDTH || (*filt_width % 2) != 1)
{
printf("Error: Invalid filter width (%d)\n",*filt_width);
exit(-1);
}
}
else
{
printf("Error: Filter width is not specified\n");
exit(-1);
}
if (checkCmdLineFlag(argc, (const char **)argv, "bias"))
*bias = getCmdLineArgumentFloat(argc, (const char **)argv, "bias");
else
{
printf("Error: Bias is not specified\n");
exit(-1);
}
if (checkCmdLineFlag(argc, (const char **)argv, "factor"))
*factor = getCmdLineArgumentFloat(argc, (const char **)argv, "factor");
else
{
printf("Error: Factor is not specified\n");
exit(-1);
}
if (checkCmdLineFlag(argc, (const char **)argv, "coefs"))
getCmdLineArgumentString(argc, (const char **)argv, "coefs",&coefs_txt);
parse_coefs(coefs_txt,*filt_width,coefs);
if (checkCmdLineFlag(argc, (const char **)argv, "shared"))
*use_shared = true;
else
*use_shared = false;
}
// Parse filter coefficients from string. The number of coefficients should be radius*radius.
void parse_coefs(const char* coefs_txt, int filt_width, float* coefs)
{
const char* ptxt = coefs_txt;
int skip_chars;
memset(coefs,0,MAX_FILTER_WIDTH*MAX_FILTER_WIDTH*sizeof(float));
for (int i = filt_width - 1; i >= 0; i--)
{
for (int j = filt_width - 1; j >= 0; j--)
{
if (sscanf(ptxt,"%f%n", &coefs[i*MAX_FILTER_WIDTH+j], &skip_chars) != 1)
{
printf("Error: Not enough coefficients. Read %d/%d coefficients.\n",i*filt_width+j,filt_width*filt_width);
exit(-1);
}
ptxt += skip_chars+1;
}
}
}
__global__ void convolution2D_kernel(
unsigned char* inputImage,
unsigned char* outputImage,
float* filter,
int imageWidth,
int imageHeight,
int imagePitch,
int filterWidth,
float hfactor,
float hbias
)
{/*
int idx=blockDim.x*blockIdx.x+threadIdx.x;
int idy=blockDim.y*blockIdx.y+threadIdx.y;
if(0<idx<imageWidth && 0<idy<imageHeight){
float sum = 0.f;
//multiply every value of the filter with corresponding image pixel
for(int filterX = 0; filterX < filterWidth; filterX++)
for(int filterY = 0; filterY < filterWidth; filterY++)
{
int imageX = idx - filterWidth / 2 + filterX;
int imageY = idy - filterWidth / 2 + filterY;
if (imageX >=0 && imageX < imageWidth && imageY >=0 && imageY < imageHeight) {
sum += inputImage[imageX+imageWidth*imageY] * filter[filterX + filterY*filterWidth];
}
//sum*=hfactor;
//sum+=hbias;
//sum=
//truncate values smaller than zero and larger than 255
outputImage[idx+imageWidth*idy] = fminf(fmaxf(int(hfactor * sum + hbias), 0), 255);
}
}*/
}
__global__ void convolution2DShared_kernel(
unsigned char* inputImage,
unsigned char* outputImage,
int imageWidth,
int imageHeight,
int imagePitch,
int filterWidth
)
{
}
void convolution2D(unsigned char* input_img, unsigned char* output_img, float* hfilter, int width, int height,
int hfilt_width, float hfactor, float hbias, float* hcoefs, bool use_shared)
{
// Allocate device memory
unsigned char *d_in=NULL, *d_out=NULL;
float *d_filter=NULL;
int imgSize=sizeof(float)*width*height;
int filterSize=sizeof(float)*hfilt_width*hfilt_width;
int blockWidth=32;
int gridx=width/blockWidth;
if(width%blockWidth!=0)
gridx++;
printf("gridx size is %d\n",gridx);
int gridy=height/blockWidth;
if(height%blockWidth!=0)
gridy++;
printf("gridy size is %d\n",gridy);
printf("blockWidth size is %d\n",blockWidth);
// measure execution time
cudaEvent_t start,stop;
const int iters = 10;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
cudaEventRecord(start, NULL);
printf("allocating mem\n");
cudaMalloc((void **) d_in, imgSize);
cudaMalloc((void **) d_out, imgSize);
cudaMalloc((void **) &d_filter, filterSize);
cudaMemcpy(d_in,input_img,imgSize,cudaMemcpyHostToDevice);
cudaMemcpy(d_filter,hfilter,filterSize,cudaMemcpyHostToDevice);
// Setup execution parameters
dim3 threads(blockWidth, blockWidth);
dim3 grid(gridx,gridy);
printf("kernel starts\n");
// calculate execution time average over iters iterations
for (int i=0; i<iters; i++)
{
if (!use_shared)
convolution2D_kernel<<<grid,threads>>>(d_in, d_out, d_filter, width, height, width, hfilt_width, hfactor, hbias);
else
convolution2DShared_kernel<<<grid,threads>>>(d_in, d_out, width, height, width, hfilt_width);
}
checkCudaErrors(cudaEventRecord(stop, NULL));
checkCudaErrors(cudaEventSynchronize(stop));
// check for errors during kernel launch
cudaError_t err;
if ((err = cudaGetLastError()) != cudaSuccess)
{
printf("Kernel launch failed: %s",cudaGetErrorString(err));
exit(1);
}
float msec = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&msec, start, stop));
printf("Applying %dx%d filter on image of size %dx%d %s using shared memory took %f ms\n",
hfilt_width,hfilt_width,width,height,(use_shared?"with":"without"),msec/iters);
// write results to results file
unsigned long long result_values[] = {hfilt_width,hfilt_width,width,height,use_shared,msec/iters*1000};
if (true != sdkWriteFile(results_filename,result_values,6,0,false,true))
{
printf("Error: Writing results file failed.");
exit(1);
}
cudaFree(d_in);
cudaFree(d_out);
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
void convolution_cpu(unsigned char* input_img, unsigned char* output_img, int width, int height,
int hfilt_width, float hfactor, float hbias, float* hcoefs)
{
for(int x = 0; x < width; x++)
for(int y = 0; y < height; y++)
{
float sum = 0.f;
//multiply every value of the filter with corresponding image pixel
for(int filterX = 0; filterX < hfilt_width; filterX++)
for(int filterY = 0; filterY < hfilt_width; filterY++)
{
int imageX = x - hfilt_width / 2 + filterX;
int imageY = y - hfilt_width / 2 + filterY;
if (imageX >=0 && imageX < width && imageY >=0 && imageY < height) {
sum += input_img[imageX+width*imageY] * hcoefs[filterX + filterY*MAX_FILTER_WIDTH];
}
}
//truncate values smaller than zero and larger than 255
output_img[x+width*y] = std::min(std::max(int(hfactor * sum + hbias), 0), 255);
}
}
/**
* Program main
*/
int main(int argc, char **argv)
{
unsigned char* h_inimg = NULL;
unsigned char* h_outimg = NULL;
unsigned char* h_refimg = NULL;
unsigned int width, height;
int hfilt_width = -1;
float hfactor = 1.f, hbias = 0.f;
float hcoefs[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
bool use_shared = false;
// load parameters of filter
if (argc > 1)
load_filter(argc,argv,&hfilt_width,&hfactor,&hbias,hcoefs,&use_shared);
else {
hfilt_width = 5;
hfactor = 1.0f / 13.0f;
hbias = 0.0f;
parse_coefs(
"0,0,1,0,0,"
"0,1,1,1,0,"
"1,1,1,1,1,"
"0,1,1,1,0,"
"0,0,1,0,0,",
hfilt_width,hcoefs);
}
char* image_path = sdkFindFilePath(image_filename, argv[0]);
if (image_path == NULL) {
printf("Unable to source image file: %s\n", image_filename);
exit(-1);
}
// Load image from disk
sdkLoadPGM(image_path, &h_inimg, &width, &height);
h_outimg = (unsigned char*)malloc(width * height);
printf("Starting convolution\n");
convolution2D(h_inimg,h_outimg,hcoefs,width,height,hfilt_width,hfactor,hbias,hcoefs,use_shared);
printf("Validating...\n");
h_refimg = (unsigned char*)malloc(width * height);
convolution_cpu(h_inimg,h_refimg,width,height,hfilt_width,hfactor,hbias,hcoefs);
int err_cnt = 0;
for (int r=0; r<height; r++)
for (int c=0; c<width; c++)
if (h_outimg[c+r*width]!=h_refimg[c+r*width])
{
++err_cnt;
printf("Err %2d: [%d,%d] GPU %d | CPU %d\n",err_cnt,r,c,h_outimg[c+r*width],h_refimg[c+r*width]);
if(err_cnt > 4)
{
printf("Terminating...\n");
exit(1);
}
}
if (0 == err_cnt)
printf("OK\n");
// Save image
sdkSavePGM(out_filename,h_outimg,width,height);
free(h_inimg);
free(h_outimg);
}
if i put line 191 into comments everything runs fine and dandy (with no data in kernel).
can anyone please point the proper way to deliver the data into the kernel?
dim3 threads(blockw, blockw);, butblockwisn't defined anywhere. Furthermore, your posted code is messed up and has a bunch of\tin it, it won't compile.