如何在 CUDA 中为一维数组使用纹理内存

How to use texture memory for 1D array in CUDA

我写了下面的代码来查看如何为 1D 使用纹理内存 array.but tex1D 函数没有从相应线程的数组中获取值 id.Please 更正此代码并告诉我如何使用纹理高效且有效地存储一维数组。

__global__ void sum(float *b,cudaTextureObject_t texObj)

    {
    b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
    //printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
    }
    int main()
    {
    float *a,*b;
    float *d_a,*d_b;
    int i;
    a=(float*)malloc(sizeof(float)*5);
    b=(float*)malloc(sizeof(float)*5);

    for(i=0;i<5;i++)
        a[i]=i;

    cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);

    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, 5, 0);

    cudaMemcpyToArray(cuArray, 0, 0, a,sizeof(float)*5,cudaMemcpyHostToDevice);


    struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;


      struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModeLinear;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.normalizedCoords = 1;

        // Create texture object
        cudaTextureObject_t texObj = 0;
        cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);


    cudaMalloc(&d_b, 5* sizeof(float));

    sum<<<1,5>>>(d_b,texObj);



        // Free device memory
    cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);

     for(i=0;i<5;i++)
        printf("%f\t",b[i]);
      cudaDestroyTextureObject(texObj); 
    cudaFreeArray(cuArray);
    cudaFree(d_b);

        return 0;

    }

至少有 2 个问题:

  1. 最后你只是从设备向主机复制一个浮点数:

    cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);
                     ^^^^^^^^^^^^^
    

    如果要打印 5 个值,则应将 5 个值复制回去:

    cudaMemcpy(b,d_b,5*sizeof(float),cudaMemcpyDeviceToHost);
    
  2. 您选择了normalized coordinates:

    texDesc.normalizedCoords = 1;
    

    这意味着您应该传递 0 到 1 之间的浮点坐标作为索引,而不是 0 到 4 之间的整数坐标:

     b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
                                        ^^^^^^^^^^^
    

    改用这样的东西:

     b[threadIdx.x]=tex1D<float>(texObj, ((float)threadIdx.x/5.0f));
    

通过这些更改,我得到了合理的结果。这是一个完整的代码:

$ cat t3.cu
#include <stdio.h>

__global__ void sum(float *b,cudaTextureObject_t texObj)

    {
    b[threadIdx.x]=tex1D<float>(texObj,((float)(threadIdx.x+1)/5.0f));

    //printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
    }


int main()
    {
    float *a,*b;
    float *d_b;
    int i;
    a=(float*)malloc(sizeof(float)*5);
    b=(float*)malloc(sizeof(float)*5);

    for(i=0;i<5;i++)
        a[i]=i;

    cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);

    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, 5, 0);

    cudaMemcpyToArray(cuArray, 0, 0, a,sizeof(float)*5,cudaMemcpyHostToDevice);


    struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;


      struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModeLinear;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.normalizedCoords = 1;

        // Create texture object
        cudaTextureObject_t texObj = 0;
        cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);


    cudaMalloc(&d_b, 5* sizeof(float));

    sum<<<1,4>>>(d_b,texObj);



        // Free device memory
    cudaMemcpy(b,d_b,5*sizeof(float),cudaMemcpyDeviceToHost);

     for(i=0;i<4;i++)
        printf("%f\t",b[i]);
      printf("\n");
      cudaDestroyTextureObject(texObj);
    cudaFreeArray(cuArray);
    cudaFree(d_b);

        return 0;

    }
$ nvcc -arch=sm_61 -o t3 t3.cu
$ cuda-memcheck ./t3
========= CUDA-MEMCHECK
0.500000        1.500000        2.500000        3.500000
========= ERROR SUMMARY: 0 errors
$

请注意,我确实进行了一些其他更改。特别是,我已经调整了您的样本点以及样本数量,以选择在您拥有的 5 个数据点 (0、1、2、3、4) 中的每一个之间线性插值的样本点,从而产生总输出4 个数量(0.5、1.5、2.5、3.5)代表 5 个数据点之间的中点。

如果您想了解有关规范化坐标索引的更多信息,the programming guide as are other concepts such as border modes and the like. Furthermore, there are various CUDA sample codes 中介绍了如何正确使用纹理。