如何在 CUDA 中为一维数组使用纹理内存

Question

我写了下面的代码来了解如何为一维数组使用纹理内存，但 tex1D 函数没有从数组中取到与线程 id 对应的值。请更正此代码，并告诉我如何高效且有效地使用纹理内存来存储一维数组。

// Kernel from the question: each thread performs one tex1D<float> fetch from
// texObj and stores the result in b[threadIdx.x].
//
//   b      - output buffer; must hold at least one float per launched thread
//   texObj - texture object bound to the 1D input data
//
// The raw integer thread index is passed as the texture coordinate. The host
// code in this listing creates the texture with normalizedCoords = 1, so this
// coordinate is what the question is asking about. There is no bounds check;
// the kernel relies on the launch configuration matching the size of b.
__global__ void sum(float *b,cudaTextureObject_t texObj)

    {
    b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
    // Commented-out debug print of the same element via tex1Dfetch:
    //printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
    }
    // Host driver from the question: uploads 5 floats into a 1D CUDA array,
    // wraps it in a texture object, launches sum with 5 threads and prints
    // the 5 entries of the host result buffer.
    // No CUDA return code is checked anywhere in this function.
    int main()
    {
    float *a,*b;
    // d_a is declared but never used below.
    float *d_a,*d_b;
    int i;
    // Host buffers: a is the input, b receives the result. Neither is freed.
    a=(float*)malloc(sizeof(float)*5);
    b=(float*)malloc(sizeof(float)*5);

    // Input data is 0, 1, 2, 3, 4.
    for(i=0;i<5;i++)
        a[i]=i;

    // Single-channel format: one 32-bit float per texel.
    cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);

    // CUDA array of width 5; height 0 requests a 1D array.
    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, 5, 0);

    // Upload the 5 input floats into the CUDA array.
    cudaMemcpyToArray(cuArray, 0, 0, a,sizeof(float)*5,cudaMemcpyHostToDevice);


    // Resource descriptor: the texture reads from cuArray.
    struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;


      // Texture descriptor: wrap addressing, linear filtering, element-type
      // reads, and normalized coordinates (normalizedCoords = 1).
      struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModeLinear;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.normalizedCoords = 1;

        // Create texture object
        cudaTextureObject_t texObj = 0;
        cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);


    // Device output buffer: one float per thread.
    cudaMalloc(&d_b, 5* sizeof(float));

    // One block of 5 threads; thread t writes d_b[t].
    sum<<<1,5>>>(d_b,texObj);



        // Copy the result back to the host. Only sizeof(float) bytes (a single
        // element) are copied here, although 5 elements are printed below.
    cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);

     // Print all 5 entries of b, tab-separated.
     for(i=0;i<5;i++)
        printf("%f\t",b[i]);
      // Release the texture object, the CUDA array and the device buffer.
      cudaDestroyTextureObject(texObj); 
    cudaFreeArray(cuArray);
    cudaFree(d_b);

        return 0;

    }

Answer 1

至少有 2 个问题：

最后你只是从设备向主机复制一个浮点数：

cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);
                 ^^^^^^^^^^^^^

如果要打印 5 个值，则应将 5 个值复制回去：

cudaMemcpy(b,d_b,5*sizeof(float),cudaMemcpyDeviceToHost);

您选择了规范化坐标（normalized coordinates）：

texDesc.normalizedCoords = 1;

这意味着您应该传递 0 到 1 之间的浮点坐标作为索引，而不是 0 到 4 之间的整数坐标：

 b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
                                    ^^^^^^^^^^^

改用这样的东西：

 b[threadIdx.x]=tex1D<float>(texObj, ((float)threadIdx.x/5.0f));

通过这些更改，我得到了合理的结果。这是一个完整的代码：

$ cat t3.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Samples the 1D texture once per thread and stores the result in b.
//
//   b      - output buffer; must hold at least blockDim.x floats
//   texObj - texture object over the 5-element input array
//
// Expected launch: a single 1D block (only threadIdx.x is used, so extra
// blocks would all write the same elements). The coordinate passed to tex1D
// is (threadIdx.x + 1) / 5, i.e. a normalized coordinate for a 5-texel
// texture. There is no bounds check on b.
__global__ void sum(float *b,cudaTextureObject_t texObj)
{
    const unsigned int lane = threadIdx.x;

    // Normalized sample position for this thread.
    const float coord = (float)(lane + 1) / 5.0f;

    b[lane] = tex1D<float>(texObj, coord);

    // Debug aid (disabled):
    //printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
}


// Abort with a readable message if a CUDA runtime call fails. Without this,
// an early failure stays silent and only shows up as wrong output later.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Demo driver: uploads N floats (0..N-1) into a 1D CUDA array, binds it to a
// texture object with linear filtering and normalized coordinates, launches
// the sum kernel with N-1 threads, and prints the N-1 sampled values
// tab-separated on one line.
//
// Returns 0 on success; exits with EXIT_FAILURE if any allocation or CUDA
// runtime call fails.
int main()
{
    // Number of texels. Must match the 5.0f divisor hard-coded in the sum kernel.
    const int N = 5;
    // One sample per gap between adjacent texels.
    const int NUM_SAMPLES = N - 1;

    float *a = (float*)malloc(sizeof(float) * N);            // host input
    float *b = (float*)malloc(sizeof(float) * NUM_SAMPLES);  // host output
    if (a == NULL || b == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < N; i++)
        a[i] = (float)i;

    // Single-channel format: one 32-bit float per texel.
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

    // Height 0 requests a 1D CUDA array of width N.
    cudaArray* cuArray;
    CUDA_CHECK(cudaMallocArray(&cuArray, &channelDesc, N, 0));

    // cudaMemcpyToArray is deprecated; a single-row 2D copy is the
    // replacement for filling a 1D CUDA array.
    CUDA_CHECK(cudaMemcpy2DToArray(cuArray, 0, 0, a,
                                   sizeof(float) * N,  // source pitch
                                   sizeof(float) * N,  // row width in bytes
                                   1,                  // one row
                                   cudaMemcpyHostToDevice));

    // Resource descriptor: the texture reads from cuArray.
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    // Texture descriptor: wrap addressing, linear filtering, element-type
    // reads, and normalized coordinates.
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0]   = cudaAddressModeWrap;
    texDesc.addressMode[1]   = cudaAddressModeWrap;
    texDesc.filterMode       = cudaFilterModeLinear;
    texDesc.readMode         = cudaReadModeElementType;
    texDesc.normalizedCoords = 1;

    // Create texture object
    cudaTextureObject_t texObj = 0;
    CUDA_CHECK(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));

    // Device output buffer: one float per launched thread.
    float *d_b;
    CUDA_CHECK(cudaMalloc(&d_b, NUM_SAMPLES * sizeof(float)));

    // One block of NUM_SAMPLES threads; thread t writes d_b[t].
    sum<<<1, NUM_SAMPLES>>>(d_b, texObj);
    CUDA_CHECK(cudaGetLastError());  // catches launch-configuration errors

    // Copy back only the elements the kernel wrote. This blocking copy also
    // surfaces any error raised while the kernel was running.
    CUDA_CHECK(cudaMemcpy(b, d_b, NUM_SAMPLES * sizeof(float),
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < NUM_SAMPLES; i++)
        printf("%f\t", b[i]);
    printf("\n");

    // Release device and host resources.
    CUDA_CHECK(cudaDestroyTextureObject(texObj));
    CUDA_CHECK(cudaFreeArray(cuArray));
    CUDA_CHECK(cudaFree(d_b));
    free(a);
    free(b);

    return 0;
}
$ nvcc -arch=sm_61 -o t3 t3.cu
$ cuda-memcheck ./t3
========= CUDA-MEMCHECK
0.500000        1.500000        2.500000        3.500000
========= ERROR SUMMARY: 0 errors
$

请注意，我还做了一些其他更改。特别是，我调整了采样点的位置和数量：采样点现在落在您的 5 个数据点（0、1、2、3、4）中每两个相邻点的正中间，由线性插值得到结果，因此总共输出 4 个值（0.5、1.5、2.5、3.5），即 5 个数据点之间的中点。

如果您想了解有关规范化坐标索引的更多信息，编程指南（the programming guide）中对此有介绍，其中也涵盖了边界模式（border modes）等其他概念。此外，还有各种 CUDA 示例代码演示了如何正确使用纹理。

如何在 CUDA 中为一维数组使用纹理内存

How to use texture memory for 1D array in CUDA

cuda

cuda-arrays