0

I wrote following code to see how to use texture memory for 1D array.but tex1D function is not fetching the value from array for corresponding thread id.Please correct this code and tell me how to use texture memory for 1D array efficiently and effectively.

__global__ void sum(float *b,cudaTextureObject_t texObj)

    {
    b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
    //printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
    }
    int main()
    {
    float *a,*b;
    float *d_a,*d_b;
    int i;
    a=(float*)malloc(sizeof(float)*5);
    b=(float*)malloc(sizeof(float)*5);

    for(i=0;i<5;i++)
        a[i]=i;

    cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);

    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, 5, 0);

    cudaMemcpyToArray(cuArray, 0, 0, a,sizeof(float)*5,cudaMemcpyHostToDevice);


    struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;


      struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModeLinear;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.normalizedCoords = 1;

        // Create texture object
        cudaTextureObject_t texObj = 0;
        cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);


    cudaMalloc(&d_b, 5* sizeof(float));

    sum<<<1,5>>>(d_b,texObj);



        // Free device memory
    cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);

     for(i=0;i<5;i++)
        printf("%f\t",b[i]);
      cudaDestroyTextureObject(texObj); 
    cudaFreeArray(cuArray);
    cudaFree(d_b);

        return 0;

    }
1
  • 1
    Welcome to Stack Overflow! While we can potentially point out obvious errors in your code, we are not a debugging service. Please consider reading some basic debugging techniques to help you to either solve the problem yourself, or narrow your problem down to something specific enough for this site. Commented Oct 1, 2016 at 19:14

1 Answer 1

3

There are at least 2 issues:

  1. You are only copying back one float quantity from device to host at the end:

    cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);
                     ^^^^^^^^^^^^^
    

    if you want to print 5 values, you should copy 5 values back:

    cudaMemcpy(b,d_b,5*sizeof(float),cudaMemcpyDeviceToHost);
    
  2. You have selected normalized coordinates:

    texDesc.normalizedCoords = 1;
    

    this means you should be passing a floating point coordinate between 0 and 1 as your index, not an integer coordinate from 0 to 4:

     b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
                                        ^^^^^^^^^^^
    

    use something like this instead:

     b[threadIdx.x]=tex1D<float>(texObj, ((float)threadIdx.x/5.0f));
    

with those changes, I get sensible results. Here's a fully worked code:

$ cat t3.cu
#include <stdio.h>

__global__ void sum(float *b,cudaTextureObject_t texObj)

    {
    b[threadIdx.x]=tex1D<float>(texObj,((float)(threadIdx.x+1)/5.0f));

    //printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
    }


int main()
    {
    float *a,*b;
    float *d_b;
    int i;
    a=(float*)malloc(sizeof(float)*5);
    b=(float*)malloc(sizeof(float)*5);

    for(i=0;i<5;i++)
        a[i]=i;

    cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);

    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, 5, 0);

    cudaMemcpyToArray(cuArray, 0, 0, a,sizeof(float)*5,cudaMemcpyHostToDevice);


    struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;


      struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModeLinear;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.normalizedCoords = 1;

        // Create texture object
        cudaTextureObject_t texObj = 0;
        cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);


    cudaMalloc(&d_b, 5* sizeof(float));

    sum<<<1,4>>>(d_b,texObj);



        // Free device memory
    cudaMemcpy(b,d_b,5*sizeof(float),cudaMemcpyDeviceToHost);

     for(i=0;i<4;i++)
        printf("%f\t",b[i]);
      printf("\n");
      cudaDestroyTextureObject(texObj);
    cudaFreeArray(cuArray);
    cudaFree(d_b);

        return 0;

    }
$ nvcc -arch=sm_61 -o t3 t3.cu
$ cuda-memcheck ./t3
========= CUDA-MEMCHECK
0.500000        1.500000        2.500000        3.500000
========= ERROR SUMMARY: 0 errors
$

Note that I did make some other changes. In particular, I've adjusted your sample points as well as the sample quantity to choose sample points that are linearly interpolated halfway between each of the 5 data points you have (0, 1, 2, 3, 4) yielding a total output of 4 quantities (0.5, 1.5, 2.5, 3.5) representing the midpoints between your 5 datapoints.

If you want to learn more about normalized coordinate indexing, this is covered in the programming guide as are other concepts such as border modes and the like. Furthermore, there are various CUDA sample codes that demonstrate proper use of textures.

Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.