CUDA invalid argument when trying to copy struct to device's memory (cudaMemcpy)

Question

I'm trying to figure out how to create a struct/class that is meant to be sent to the device, but I keep getting an "invalid argument" CUDA error. I wrote a small example that reproduces the error:

#include <iostream>
#include <cstdio>
using namespace std;

#define CUDA_WARN(XXX) \
    do { if (XXX != cudaSuccess) cerr << "CUDA Error: " << \
        cudaGetErrorString(XXX) << ", at line " << __LINE__ \
        << endl; cudaDeviceSynchronize(); } while (0) // warn-only check: prints error string + line; XXX is expanded twice; the sync runs on every use

struct P { // two doubles, x and y, with an init() callable from host and device code
    double x,y;
    __host__ __device__ void init(const double &a, const double &b) { // stores a in x and b in y
        x = a; y = b; }
};

int main(int argc, char **argv) // reproduces the "invalid argument" errors listed in the output below
{
    P hP, hQ, dP; // note: dP is an ordinary host object here, not a pointer
    cout << "Size of P: " << sizeof(P) << endl;
    CUDA_WARN(cudaMalloc((void**) &dP, sizeof(P)));
    printf("dP: %p\n", &dP); // prints the host address of dP, not a device address
    hP.init(1.2,-2.1);
    hQ.init(0.,0.);
    CUDA_WARN(cudaMemcpy(&dP, &hP, sizeof(P), cudaMemcpyHostToDevice)); // reported as "invalid argument": &dP is not a device pointer
    CUDA_WARN(cudaMemcpy(&hQ, &dP, sizeof(P), cudaMemcpyDeviceToHost)); // reported as "invalid argument" as well
    cout << "Copy back: " << hQ.x << "\t" << hQ.y << endl;
    dP.init(3.,3.); // ordinary host-side call; no kernel is launched
    CUDA_WARN(cudaMemcpy(&hP, &dP, sizeof(P), cudaMemcpyDeviceToHost)); // reported as "invalid argument" as well
    cout << "Copy new:  " << hP.x << "\t" << hP.y << endl;
    return 0;
}

I'm compiling with (my card is a Tesla C2050):

nvcc -arch sm_20 -o exec file.cu

The result I'm getting is:

Size of P: 16
dP: 0x7fff82d4b7b0
CUDA Error: invalid argument, at line 24
CUDA Error: invalid argument, at line 25
Copy back: 0    0
CUDA Error: invalid argument, at line 28
Copy new:  1.2  -2.1


------------------
(program exited with code: 0)
Press return to continue

Thanks in advance to anyone who can help me with this!

====== After comments of @talonmies, @JackOLantern, @Robert Crovella =======

Thanks, guys! You really helped! Based on your comments, I was able to correct my code, and now it is working. For the record, here is the final solution:

#include <iostream>
#include <cstdio>
using namespace std;

// Warn-only check for a CUDA runtime call.
//   XXX: an expression yielding a cudaError_t; it is evaluated exactly once.
// If the status is not cudaSuccess, the error string and the source line are
// written to cerr. The device is then synchronized on every use, whether or
// not the call failed. Execution always continues afterwards.
// Evaluating XXX once matters: expanding it a second time would re-run a
// failed call and print the status of that second attempt.
#define CUDA_WARN(XXX) \
    do { const cudaError_t cudaWarnStatus_ = (XXX); \
        if (cudaWarnStatus_ != cudaSuccess) cerr << "CUDA Error: " << \
        cudaGetErrorString(cudaWarnStatus_) << ", at line " << __LINE__ \
        << endl; cudaDeviceSynchronize(); } while (0)

// Two doubles, x and y, usable from both host and device code.
struct P {
    double x;
    double y;

    // Stores a in x and b in y. Callable on the host and on the device.
    __host__ __device__ void init(const double &a, const double &b)
    {
        this->x = a;
        this->y = b;
    }
};

/* INCLUDED KERNEL FUNCTION */
// Kernel: calls init(a, b) on the P object that p points to.
//   p:    expected to be a device pointer (main passes the cudaMalloc result).
//   a, b: values forwarded to P::init.
// Every launched thread performs the same write; main launches one thread.
__global__ void dev_P_init(P *p, double a, double b) {
    P &target = *p;
    target.init(a, b);
}

// Copies a P to device memory and back, then lets a kernel overwrite the
// device copy and reads the new values back. Failures are reported through
// CUDA_WARN but do not stop the program. Always returns 0.
int main(int argc, char **argv)
{
    P hP, hQ, *dP = NULL; //*changed* dP is a pointer; NULL until cudaMalloc succeeds
    cout << "Size of P: " << sizeof(P) << endl;
    CUDA_WARN(cudaMalloc((void**) &dP, sizeof(P)));
    // &dP is the host address of the pointer variable itself; the device
    // address produced by cudaMalloc is the value stored in dP.
    printf("dP: %p\n", (void*) &dP);
    hP.init(1.2,-2.1);
    hQ.init(0.,0.);
    CUDA_WARN(cudaMemcpy(dP, &hP, sizeof(P), cudaMemcpyHostToDevice)); //*changed*
    CUDA_WARN(cudaMemcpy(&hQ, dP, sizeof(P), cudaMemcpyDeviceToHost)); //*changed*
    cout << "Copy back: " << hQ.x << "\t" << hQ.y << endl;
    dev_P_init<<< 1, 1 >>>(dP,3., 3.); //*call to kernel*
    // A kernel launch returns no status, so fetch it explicitly. It is stored
    // in a variable first because cudaGetLastError() resets the error once read.
    cudaError_t launchStatus = cudaGetLastError();
    CUDA_WARN(launchStatus);
    CUDA_WARN(cudaMemcpy(&hP, dP, sizeof(P), cudaMemcpyDeviceToHost)); //*changed*
    cout << "Copy new:  " << hP.x << "\t" << hP.y << endl;
    CUDA_WARN(cudaFree(dP)); // release the device allocation made above
    return 0;
}

And corrected output:

Size of P: 16
dP: 0x7fff6fa2e498
Copy back: 1.2  -2.1
Copy new:  3    3


------------------
(program exited with code: 0)
Press return to continue

I believe you need to allocate memory on the GPU and then copy the structure there; you can't just use host stack memory like that. — Marco A.
– Marco A., Commented Jun 27, 2014 at 20:49
&dP isn't a valid device pointer. That is the source of the error. — talonmies
– talonmies, Commented Jun 27, 2014 at 20:52
The answer given by @JackOLantern is correct. Once you fix the issues pointed out there, your dP.init() (or, dP->init()) will not do what you expect it to do in host code either. I expect it to seg fault. If your desire/intent is to run device code, you must launch a kernel. — Robert Crovella
– Robert Crovella, Commented Jun 27, 2014 at 21:21

Vitality · Accepted Answer · 2014-06-27 21:01:21Z

As already noted by @talonmies, &dP is not a valid device pointer. In your code, dP is a variable that resides on the host, so its address points into host memory. In contrast, when dP is declared as a pointer (P *dP), cudaMalloc receives the address of that pointer (&dP) and stores a device address in it, so the value of dP then points into device memory. That value, dP itself rather than &dP, is what must be passed to cudaMemcpy.

This is the correct version of your code:

#include <iostream>
#include <cstdio>
using namespace std;

// Warn-only check for a CUDA runtime call.
//   XXX: an expression yielding a cudaError_t; it is evaluated exactly once.
// If the status is not cudaSuccess, the error string and the source line are
// written to cerr. The device is then synchronized on every use, whether or
// not the call failed. Execution always continues afterwards.
// Evaluating XXX once matters: expanding it a second time would re-run a
// failed call and print the status of that second attempt.
#define CUDA_WARN(XXX) \
    do { const cudaError_t cudaWarnStatus_ = (XXX); \
    if (cudaWarnStatus_ != cudaSuccess) cerr << "CUDA Error: " << \
    cudaGetErrorString(cudaWarnStatus_) << ", at line " << __LINE__ \
    << endl; cudaDeviceSynchronize(); } while (0)

// Holds two doubles, x and y; init() may be called from host or device code.
struct P {
    double x;
    double y;

    // Sets x to a and y to b.
    __host__ __device__ void init(const double &a, const double &b)
    {
        this->x = a;
        this->y = b;
    }
};

// Corrected round trip: dP is a device pointer filled in by cudaMalloc, and
// dP itself (not its address) is what gets passed to cudaMemcpy. Failures are
// reported through CUDA_WARN but do not stop the program. Always returns 0.
int main(int argc, char **argv)
{
    P *dP = NULL; // NULL until cudaMalloc stores a device address in it
    P hP, hQ;
    hP.init(1.2,-2.1); // give the copy source defined contents before it is sent
    hQ.init(0.,0.);
    CUDA_WARN(cudaMalloc((void**) &dP, sizeof(P)));
    CUDA_WARN(cudaMemcpy(dP, &hP, sizeof(P), cudaMemcpyHostToDevice));
    CUDA_WARN(cudaMemcpy(&hQ, dP, sizeof(P), cudaMemcpyDeviceToHost));
    CUDA_WARN(cudaMemcpy(&hP, dP, sizeof(P), cudaMemcpyDeviceToHost));
    CUDA_WARN(cudaFree(dP)); // release the device allocation made above

    return 0;
}

Collectives™ on Stack Overflow

CUDA invalid argument when trying to copy struct to device's memory (cudaMemcpy)

1 Answer 1

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Related