4

I'm trying to figure out how I should create a struct/class destined to be sent to the device, but I keep getting this "invalid argument" CUDA error. I did a small example that shows the error:

#include <iostream>
#include <cstdio>
using namespace std;

#define CUDA_WARN(XXX) \
    do { if (XXX != cudaSuccess) cerr << "CUDA Error: " << \
        cudaGetErrorString(XXX) << ", at line " << __LINE__ \
        << endl; cudaDeviceSynchronize(); } while (0)

struct P {
    double x,y;
    __host__ __device__ void init(const double &a, const double &b) {
        x = a; y = b; }
};

int main(int argc, char **argv)
{
    P hP, hQ, dP;
    cout << "Size of P: " << sizeof(P) << endl;
    CUDA_WARN(cudaMalloc((void**) &dP, sizeof(P)));
    printf("dP: %p\n", &dP); // print dP's address on the device
    hP.init(1.2,-2.1);
    hQ.init(0.,0.);
    CUDA_WARN(cudaMemcpy(&dP, &hP, sizeof(P), cudaMemcpyHostToDevice));
    CUDA_WARN(cudaMemcpy(&hQ, &dP, sizeof(P), cudaMemcpyDeviceToHost));
    cout << "Copy back: " << hQ.x << "\t" << hQ.y << endl;
    dP.init(3.,3.);
    CUDA_WARN(cudaMemcpy(&hP, &dP, sizeof(P), cudaMemcpyDeviceToHost));
    cout << "Copy new:  " << hP.x << "\t" << hP.y << endl;
    return 0;
}

I'm compiling with (my card is a Tesla C2050):

nvcc -arch sm_20 -o exec file.cu

The result I'm getting is:

Size of P: 16
dP: 0x7fff82d4b7b0
CUDA Error: invalid argument, at line 24
CUDA Error: invalid argument, at line 25
Copy back: 0    0
CUDA Error: invalid argument, at line 28
Copy new:  1.2  -2.1


------------------
(program exited with code: 0)
Press return to continue

Thanks guys if you could help me on this!

====== After comments of @talonmies, @JackOLantern, @Robert Crovella =======

Thanks, guys! You really helped! Based on comments, I could correct my code and now it is working. Just to register the final solution:

#include <iostream>
#include <cstdio>
using namespace std;

#define CUDA_WARN(XXX) \
    do { if (XXX != cudaSuccess) cerr << "CUDA Error: " << \
        cudaGetErrorString(XXX) << ", at line " << __LINE__ \
        << endl; cudaDeviceSynchronize(); } while (0)

struct P {
    double x,y;
    __host__ __device__ void init(const double &a, const double &b) {
        x = a; y = b; }
};

/* INCLUDED KERNEL FUNCTION */
__global__ void dev_P_init(P *p, double a, double b) {
    p->init(a,b);
}

int main(int argc, char **argv)
{
    P hP, hQ, *dP; //*changed*
    cout << "Size of P: " << sizeof(P) << endl;
    CUDA_WARN(cudaMalloc((void**) &dP, sizeof(P)));
    printf("dP: %p\n", &dP); // print dP's address on the device
    hP.init(1.2,-2.1);
    hQ.init(0.,0.);
    CUDA_WARN(cudaMemcpy(dP, &hP, sizeof(P), cudaMemcpyHostToDevice)); //*changed*
    CUDA_WARN(cudaMemcpy(&hQ, dP, sizeof(P), cudaMemcpyDeviceToHost)); //*changed*
    cout << "Copy back: " << hQ.x << "\t" << hQ.y << endl;
    dev_P_init<<< 1, 1 >>>(dP,3., 3.); //*call to kernel*
    CUDA_WARN(cudaMemcpy(&hP, dP, sizeof(P), cudaMemcpyDeviceToHost)); //*changed*
    cout << "Copy new:  " << hP.x << "\t" << hP.y << endl;
    return 0;
}

And corrected output:

Size of P: 16
dP: 0x7fff6fa2e498
Copy back: 1.2  -2.1
Copy new:  3    3


------------------
(program exited with code: 0)
Press return to continue
3
  • I believe you need to allocate memory on the gpu and then copy the structure there, you can't just use host stack memory like that Commented Jun 27, 2014 at 20:49
  • 1
    &dp isn't a valid device pointer. That is the source of the error. Commented Jun 27, 2014 at 20:52
  • 1
    The answer given by @JackOLantern is correct. Once you fix the issues pointed out there, your dP.init() (or, dP->init()) will not do what you expect it to do in host code either. I expect it to seg fault. If your desire/intent is to run device code, you must launch a kernel. Commented Jun 27, 2014 at 21:21

1 Answer 1

4

As already noticed by @talonmies, &dP is not a valid device pointer. Indeed, dP is a variable that resides on the host, so its address points to the host memory space. Opposite to that, when dP is a pointer, cudaMalloc will receive its value as a parameter and its value will point to a device memory space.

This is the correct version of your code:

#include <iostream>
#include <cstdio>
using namespace std;

#define CUDA_WARN(XXX) \
    do { if (XXX != cudaSuccess) cerr << "CUDA Error: " << \
    cudaGetErrorString(XXX) << ", at line " << __LINE__ \
    << endl; cudaDeviceSynchronize(); } while (0)

struct P {
    double x,y;
    __host__ __device__ void init(const double &a, const double &b) {
    x = a; y = b; }
};

int main(int argc, char **argv)
{
    P *dP;
    P hP, hQ;
    CUDA_WARN(cudaMalloc((void**) &dP, sizeof(P)));
    CUDA_WARN(cudaMemcpy(dP, &hP, sizeof(P), cudaMemcpyHostToDevice));
    CUDA_WARN(cudaMemcpy(&hQ, dP, sizeof(P), cudaMemcpyDeviceToHost));
    CUDA_WARN(cudaMemcpy(&hP, dP, sizeof(P), cudaMemcpyDeviceToHost));

    return 0;
}
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.