1

I tried both cudaMemcpy2DFromArray and cudaMemcpy2D, but neither of them works correctly. By "not working correctly", I mean that the GpuMat did receive some data from the cudaArray, but the horizontal scale of the copied image was wrong.

The code snippet is the following:

// Map the registered graphics resource and fetch the cudaArray behind it
// (array index 0, mip level 0).
cudaArray *colorArr;
checkCudaErrors( cudaGraphicsMapResources( 1, &cudaResourceColor, 0 ) );
checkCudaErrors( cudaGraphicsSubResourceGetMappedArray( &colorArr, cudaResourceColor, 0, 0 ) );

// Destination image on the device: w x h pixels, three 32-bit float channels.
cv::gpu::GpuMat gpuColorMat(Size(w,h), CV_32FC3);   

// Tried method 1: copy straight from the array into the GpuMat, using the
// GpuMat's step as destination pitch. This did not work correctly.
// NOTE(review): the copy width assumes 12-byte (float3) array elements; if the
// mapped array actually holds 4-channel elements the rows would be misread -
// confirm the array's channel format.
checkCudaErrors( cudaMemcpy2DFromArray( gpuColorMat.data, gpuColorMat.step, colorArr, 
    0, 0, gpuColorMat.cols*sizeof(float3), gpuColorMat.rows, cudaMemcpyDeviceToDevice ) );

// Tried method 2: copy into a temporary linear buffer first, then into the
// GpuMat. This also did not work correctly; same error as the first method.
float3 *cuarr; 
checkCudaErrors( cudaMalloc( (void**)&cuarr, w*h*sizeof(float3) ) );
// NOTE(review): the destination pitch argument of cudaMemcpy2DFromArray is in
// bytes; w*h is passed here while each row is w*sizeof(float3) bytes wide, and
// the next call reads cuarr with a pitch of w*sizeof(float3) - looks inconsistent.
checkCudaErrors( cudaMemcpy2DFromArray( cuarr, w*h, colorArr, 0, 0, w*sizeof(float3), h, cudaMemcpyDeviceToDevice ) );
checkCudaErrors( cudaMemcpy2D( (float*)gpuColorMat.data, gpuColorMat.step, cuarr, w*sizeof(float3), w*sizeof(float3), h, cudaMemcpyDeviceToDevice ) );
// NOTE(review): cuarr is not released with cudaFree anywhere in this snippet.

// Unmap the graphics resource again now that the copies have been issued.
checkCudaErrors( cudaGraphicsUnmapResources( 1, &cudaResourceColor, 0 ) );

Can anyone help me with this?

2 Answers 2

2

I finally made it work. I'm sharing my code below:

.cu file: do the device array copy. The rendered image is bound to the texture inTex, and it is copied to the destination float3 *dst.

texture<float4, 2, cudaReadModeElementType> inTex;

// Copies the x/y/z channels of the 2D texture bound to inTex into a pitched
// float3 image; the fourth channel of each float4 texel is dropped.
//
//   dst      base pointer of the destination image
//   dstStep  distance between the starts of consecutive destination rows, in bytes
//   width    number of pixels per row to copy
//   height   number of rows to copy
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel. The grid must
// cover at least width x height threads; surplus threads exit without writing.
__global__ void CuDeviceArrayCopyFromTexture( float3* dst, int dstStep, int width, int height )
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Valid pixels are [0, width) x [0, height). The comparison has to be >=,
    // otherwise the thread at x == width (or y == height) writes one element
    // past the end of the row (or one row past the end of the image).
    if ( x >= width || y >= height ) return;

    float4 res = tex2D(inTex, x, y);

    // Rows are dstStep bytes apart, so address the row through a char pointer.
    // The offset is computed in size_t so that y * dstStep cannot overflow int
    // for large images.
    float3* row_y = (float3*)((char*)dst + (size_t)y * dstStep);
    row_y[x] = make_float3(res.x, res.y, res.z);
}

// Integer division n/m rounded up (for positive n and m): the number of
// m-sized chunks needed to hold n items.
inline int iDivUp(int n, int m)
{
    // Adding m - 1 first makes the truncating division round upwards.
    const int padded = n + m - 1;
    return padded / m;
}

// Host-side launcher for CuDeviceArrayCopyFromTexture.
//
//   dst      device pointer to the destination float3 image
//   dstStep  destination row stride in bytes
//   width    image width in pixels
//   height   image height in pixels
//
// Launches one thread per pixel in 64x1 blocks on the default stream. The
// launch is asynchronous; only launch errors are checked here. Does nothing
// when width or height is not positive.
void DeviceArrayCopyFromTexture( float3* dst, int dstStep, int width, int height ) 
{
    // An empty image would produce a zero-sized grid, which is an invalid
    // launch configuration, so there is nothing to do in that case.
    if ( width <= 0 || height <= 0 ) return;

    dim3 threads( 64, 1 );
    // Round up in both dimensions so that every pixel is covered even when the
    // image size is not a multiple of the block size (the kernel bounds-checks
    // the surplus threads).
    dim3 grid = dim3( iDivUp( width, threads.x ), iDivUp( height, threads.y ) );
    CuDeviceArrayCopyFromTexture <<< grid, threads >>> ( dst, dstStep, width, height );

    // A kernel launch returns no status; launch-configuration errors have to
    // be picked up explicitly.
    checkCudaErrors( cudaGetLastError() );
}

// Binds the CUDA array cuArr to the file-scope texture reference inTex, which
// the copy kernel samples with tex2D. The status returned by the bind call is
// passed to checkCudaErrors.
// NOTE(review): texture references (texture<...> / cudaBindTextureToArray) are
// a legacy API; newer toolkits use texture objects instead - confirm the
// targeted CUDA version still supports them.
void BindToTexture( cudaArray *cuArr )
{
     checkCudaErrors( cudaBindTextureToArray( inTex, cuArr ) );
}

.cpp file: set up gl render texture, bind to cuda texture and call device array copy method.

// Create the colour texture that is rendered into and later read from CUDA.
glActiveTexture(GL_TEXTURE0);
glGenTextures(1, &fboColorTex);
glBindTexture(GL_TEXTURE_2D, fboColorTex);
// GL_RGB16F and GL_RGB32F were both tried and did not work, so the texture uses
// GL_RGBA16F, which can be mapped to a cudaArray with float4 elements.
// No initial pixel data is uploaded (NULL); storage is only allocated.
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
// Register the texture with CUDA as read-only. cudaGraphicsGLRegisterImage takes
// values from cudaGraphicsRegisterFlags; the cudaGraphicsMapFlags* constants
// belong to cudaGraphicsResourceSetMapFlags.
checkCudaErrors( cudaGraphicsGLRegisterImage( &cudaResourceColor, fboColorTex, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly ) );

// Entry points implemented in the .cu file.
extern void BindToTexture( cudaArray *cuArr );
extern void DeviceArrayCopyFromTexture( float3* dst, int dstStep, int width, int height );

// Destination image on the device: w x h pixels, three 32-bit float channels.
// Declared static, so the same GpuMat is reused on later passes through this code.
static GpuMat gpuMat( Size(w,h), CV_32FC3 );
// Receives the CUDA array that backs the mapped graphics resource.
cudaArray *cuArr;

// Copy color buffer: map the registered resource and fetch its array
// (array index 0, mip level 0). The array is only used between map and unmap.
checkCudaErrors( cudaGraphicsMapResources( 1, &cudaResourceColor, 0 ) );
checkCudaErrors( cudaGraphicsSubResourceGetMappedArray( &cuArr, cudaResourceColor, 0, 0 ) );

// Bind the array to the texture the kernel samples, then launch the copy into
// the GpuMat, passing its step as the destination row stride.
// NOTE(review): gpuMat.step is narrowed to the int dstStep parameter - confirm
// the row stride always fits in an int.
BindToTexture( cuArr );
DeviceArrayCopyFromTexture( (float3*)gpuMat.data, gpuMat.step, gpuMat.cols, gpuMat.rows  );

// Release the mapping once the copy has been issued.
checkCudaErrors( cudaGraphicsUnmapResources( 1, &cudaResourceColor, 0 ) );

References:

  1. http://answers.opencv.org/question/12958/read-rendered-images-using-gpumat-and-cuda/

  2. CUDA Samples\v5.5\3_Imaging\postProcessGL

Sign up to request clarification or add additional context in comments.

1 Comment

Since you are doing a copy operation anyway, wouldn't it be easier (that is, with no need to write a custom CUDA kernel) to use cudaMemcpy2DFromArray() to convert the opaque CUDA memory block (represented by the cudaArray) into a flat CUDA memory block (what the function produces) and then simply assign that copy to the GpuMat? I am also tackling a similar case currently, although initially I wanted to avoid a dependency on OpenCV. However, doing image manipulation on a raw texture is a nightmare and is literally reinventing the wheel. :D
1

GpuMat::step is in number of elements while pitch is in bytes, so try to change from

cudaMemcpy2DFromArray( gpuColorMat.data, gpuColorMat.step,                  colorArr, 0, 0, gpuColorMat.cols*sizeof(float3), gpuColorMat.rows, cudaMemcpyDeviceToDevice );

to

cudaMemcpy2DFromArray( gpuColorMat.data, gpuColorMat.step * sizeof(float3), colorArr, 0, 0, gpuColorMat.cols*sizeof(float3), gpuColorMat.rows, cudaMemcpyDeviceToDevice );

1 Comment

I tried your suggested method. It didn't work either. Thank you all the same!

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.