如何在 CUDA 中为一维数组使用纹理内存
How to use texture memory for 1D array in CUDA
我写了下面的代码,想了解如何为一维数组使用纹理内存,但 tex1D 函数没有从数组中取出与线程 ID 对应的值。请更正此代码,并告诉我如何高效且有效地将纹理内存用于一维数组。
// Kernel from the question (kept as posted): each thread fetches one value from
// the 1D texture and stores it in b[threadIdx.x].
// b      - device output buffer, one float per thread in the block
// texObj - texture object to sample with tex1D
// NOTE(review): the raw integer thread index is passed as the texture coordinate,
// while the host code below creates the texture with normalizedCoords = 1, so the
// coordinate is not in the 0..1 range the texture expects.
__global__ void sum(float *b,cudaTextureObject_t texObj)
{
b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
// Disabled debug print of the unfiltered integer-indexed fetch.
//printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
}
// Host driver from the question (kept as posted): fills a 5-element host array,
// uploads it to a CUDA array, wraps it in a texture object, launches sum with
// 5 threads and prints 5 floats from the host result buffer.
int main()
{
float *a,*b;
// NOTE(review): d_a is declared but never used in this function.
float *d_a,*d_b;
int i;
a=(float*)malloc(sizeof(float)*5);
b=(float*)malloc(sizeof(float)*5);
// Input data: a[i] = i, i.e. 0, 1, 2, 3, 4.
for(i=0;i<5;i++)
a[i]=i;
// One 32-bit float channel per texel.
cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);
cudaArray* cuArray;
// CUDA array of width 5 and height 0, then copy the 5 host floats into it.
cudaMallocArray(&cuArray, &channelDesc, 5, 0);
cudaMemcpyToArray(cuArray, 0, 0, a,sizeof(float)*5,cudaMemcpyHostToDevice);
// Resource descriptor: the texture is backed by cuArray.
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
// Texture descriptor: wrap addressing, linear filtering, element-type reads.
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeWrap;
texDesc.addressMode[1] = cudaAddressModeWrap;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
// Normalized coordinates are enabled here, but the kernel passes threadIdx.x
// (0..4) directly as the coordinate.
texDesc.normalizedCoords = 1;
// Create texture object
cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
cudaMalloc(&d_b, 5* sizeof(float));
// One block of 5 threads, one output element per thread.
sum<<<1,5>>>(d_b,texObj);
// Copy the result back to the host.
// NOTE(review): only sizeof(float) bytes (one element) are copied, yet the loop
// below prints 5 elements, so b[1]..b[4] are never written before being printed.
cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);
for(i=0;i<5;i++)
printf("%f\t",b[i]);
// Release the texture object, the CUDA array and the device output buffer.
// NOTE(review): the host buffers a and b are not freed before returning.
cudaDestroyTextureObject(texObj);
cudaFreeArray(cuArray);
cudaFree(d_b);
return 0;
}
至少有 2 个问题:
最后你只是从设备向主机复制一个浮点数:
cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);
                 ^^^^^^^^^^^^^
如果要打印 5 个值,则应将 5 个值复制回去:
cudaMemcpy(b,d_b,5*sizeof(float),cudaMemcpyDeviceToHost);
-
texDesc.normalizedCoords = 1;
这意味着您应该传递 0 到 1 之间的浮点坐标作为索引,而不是 0 到 4 之间的整数坐标:
b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
                                   ^^^^^^^^^^^
改用这样的东西:
b[threadIdx.x]=tex1D<float>(texObj, ((float)threadIdx.x/5.0f));
通过这些更改,我得到了合理的结果。这是一个完整的代码:
$ cat t3.cu
#include <stdio.h>
// Samples the 1D texture once per thread and writes the result to b[threadIdx.x].
//
// b      - device output buffer; must hold at least blockDim.x floats
// texObj - texture object sampled with tex1D
//
// Launch layout: a single 1D block; only threadIdx.x is used for indexing.
// Thread t samples at coordinate (t + 1) / 5. The divisor 5.0f is hard-coded to
// the 5-texel texture that main creates with normalized coordinates and linear
// filtering, where these coordinates produce the values halfway between
// neighbouring texels (see the program output: 0.5, 1.5, 2.5, 3.5).
__global__ void sum(float *b,cudaTextureObject_t texObj)
{
    const unsigned int lane = threadIdx.x;
    const float coord = (float)(lane + 1) / 5.0f;
    b[lane] = tex1D<float>(texObj, coord);
}
// Terminates the program with a diagnostic if a CUDA runtime call did not succeed.
// status - return code of the CUDA call
// what   - short description of the call, used in the error message
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: uploads the values 0..4 into a 5-texel 1D CUDA array, binds it to a
// texture object (normalized coordinates, linear filtering, wrap addressing),
// launches sum with 4 threads and prints the 4 sampled values on one line.
// Returns 0 on success; exits with a failure status if any CUDA call fails.
int main()
{
    const int numTexels = 5;   // input data points; the kernel's 5.0f divisor assumes this width
    const int numSamples = 4;  // one sample between each pair of neighbouring data points
    float *a,*b;
    float *d_b;
    int i;

    a=(float*)malloc(sizeof(float)*numTexels);
    b=(float*)malloc(sizeof(float)*numSamples);
    if (a == NULL || b == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        return 1;
    }
    for(i=0;i<numTexels;i++)
        a[i]=i;

    // One 32-bit float channel per texel.
    cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    checkCuda(cudaMallocArray(&cuArray, &channelDesc, numTexels, 0), "cudaMallocArray");
    // NOTE(review): cudaMemcpyToArray is marked deprecated in recent CUDA toolkits;
    // it is kept here because it is the call this program was verified with.
    checkCuda(cudaMemcpyToArray(cuArray, 0, 0, a,sizeof(float)*numTexels,cudaMemcpyHostToDevice),
              "cudaMemcpyToArray");

    // The texture is backed by the CUDA array.
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeWrap;
    texDesc.addressMode[1] = cudaAddressModeWrap;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.readMode = cudaReadModeElementType;
    // Normalized coordinates: the kernel must pass floating-point coordinates
    // in the 0..1 range rather than integer texel indices.
    texDesc.normalizedCoords = 1;

    // Create texture object
    cudaTextureObject_t texObj = 0;
    checkCuda(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL), "cudaCreateTextureObject");

    checkCuda(cudaMalloc(&d_b, numSamples * sizeof(float)), "cudaMalloc");
    sum<<<1,numSamples>>>(d_b,texObj);
    // A kernel launch returns no status itself; launch errors are picked up here.
    checkCuda(cudaGetLastError(), "sum kernel launch");

    // Copy back exactly the elements the kernel wrote. The blocking copy also
    // reports any error raised while the kernel was executing.
    checkCuda(cudaMemcpy(b,d_b,numSamples*sizeof(float),cudaMemcpyDeviceToHost), "cudaMemcpy");
    for(i=0;i<numSamples;i++)
        printf("%f\t",b[i]);
    printf("\n");

    // Release device resources, then the host buffers.
    checkCuda(cudaDestroyTextureObject(texObj), "cudaDestroyTextureObject");
    checkCuda(cudaFreeArray(cuArray), "cudaFreeArray");
    checkCuda(cudaFree(d_b), "cudaFree");
    free(a);
    free(b);
    return 0;
}
$ nvcc -arch=sm_61 -o t3 t3.cu
$ cuda-memcheck ./t3
========= CUDA-MEMCHECK
0.500000 1.500000 2.500000 3.500000
========= ERROR SUMMARY: 0 errors
$
请注意,我还做了一些其他更改。特别是,我调整了采样点的位置和数量,使采样点落在您的 5 个数据点(0、1、2、3、4)中每两个相邻点之间并进行线性插值,因此总共输出 4 个值(0.5、1.5、2.5、3.5),它们分别是 5 个数据点之间的中点。
如果您想了解有关归一化坐标索引的更多信息,编程指南(the programming guide)中有相关介绍,边界模式(border modes)等其他概念也在其中。此外,还有各种 CUDA 示例代码演示了如何正确使用纹理。
我写了下面的代码,想了解如何为一维数组使用纹理内存,但 tex1D 函数没有从数组中取出与线程 ID 对应的值。请更正此代码,并告诉我如何高效且有效地将纹理内存用于一维数组。
// Kernel from the question (kept as posted): each thread fetches one value from
// the 1D texture and stores it in b[threadIdx.x].
// b      - device output buffer, one float per thread in the block
// texObj - texture object to sample with tex1D
// NOTE(review): the raw integer thread index is passed as the texture coordinate,
// while the host code below creates the texture with normalizedCoords = 1, so the
// coordinate is not in the 0..1 range the texture expects.
__global__ void sum(float *b,cudaTextureObject_t texObj)
{
b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
// Disabled debug print of the unfiltered integer-indexed fetch.
//printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
}
// Host driver from the question (kept as posted): fills a 5-element host array,
// uploads it to a CUDA array, wraps it in a texture object, launches sum with
// 5 threads and prints 5 floats from the host result buffer.
int main()
{
float *a,*b;
// NOTE(review): d_a is declared but never used in this function.
float *d_a,*d_b;
int i;
a=(float*)malloc(sizeof(float)*5);
b=(float*)malloc(sizeof(float)*5);
// Input data: a[i] = i, i.e. 0, 1, 2, 3, 4.
for(i=0;i<5;i++)
a[i]=i;
// One 32-bit float channel per texel.
cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);
cudaArray* cuArray;
// CUDA array of width 5 and height 0, then copy the 5 host floats into it.
cudaMallocArray(&cuArray, &channelDesc, 5, 0);
cudaMemcpyToArray(cuArray, 0, 0, a,sizeof(float)*5,cudaMemcpyHostToDevice);
// Resource descriptor: the texture is backed by cuArray.
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
// Texture descriptor: wrap addressing, linear filtering, element-type reads.
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeWrap;
texDesc.addressMode[1] = cudaAddressModeWrap;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
// Normalized coordinates are enabled here, but the kernel passes threadIdx.x
// (0..4) directly as the coordinate.
texDesc.normalizedCoords = 1;
// Create texture object
cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
cudaMalloc(&d_b, 5* sizeof(float));
// One block of 5 threads, one output element per thread.
sum<<<1,5>>>(d_b,texObj);
// Copy the result back to the host.
// NOTE(review): only sizeof(float) bytes (one element) are copied, yet the loop
// below prints 5 elements, so b[1]..b[4] are never written before being printed.
cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);
for(i=0;i<5;i++)
printf("%f\t",b[i]);
// Release the texture object, the CUDA array and the device output buffer.
// NOTE(review): the host buffers a and b are not freed before returning.
cudaDestroyTextureObject(texObj);
cudaFreeArray(cuArray);
cudaFree(d_b);
return 0;
}
至少有 2 个问题:
最后你只是从设备向主机复制一个浮点数:
cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);
                 ^^^^^^^^^^^^^
如果要打印 5 个值,则应将 5 个值复制回去:
cudaMemcpy(b,d_b,5*sizeof(float),cudaMemcpyDeviceToHost);
-
texDesc.normalizedCoords = 1;
这意味着您应该传递 0 到 1 之间的浮点坐标作为索引,而不是 0 到 4 之间的整数坐标:
b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
                                   ^^^^^^^^^^^
改用这样的东西:
b[threadIdx.x]=tex1D<float>(texObj, ((float)threadIdx.x/5.0f));
通过这些更改,我得到了合理的结果。这是一个完整的代码:
$ cat t3.cu
#include <stdio.h>
// Samples the 1D texture once per thread and writes the result to b[threadIdx.x].
//
// b      - device output buffer; must hold at least blockDim.x floats
// texObj - texture object sampled with tex1D
//
// Launch layout: a single 1D block; only threadIdx.x is used for indexing.
// Thread t samples at coordinate (t + 1) / 5. The divisor 5.0f is hard-coded to
// the 5-texel texture that main creates with normalized coordinates and linear
// filtering, where these coordinates produce the values halfway between
// neighbouring texels (see the program output: 0.5, 1.5, 2.5, 3.5).
__global__ void sum(float *b,cudaTextureObject_t texObj)
{
    const unsigned int lane = threadIdx.x;
    const float coord = (float)(lane + 1) / 5.0f;
    b[lane] = tex1D<float>(texObj, coord);
}
// Terminates the program with a diagnostic if a CUDA runtime call did not succeed.
// status - return code of the CUDA call
// what   - short description of the call, used in the error message
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: uploads the values 0..4 into a 5-texel 1D CUDA array, binds it to a
// texture object (normalized coordinates, linear filtering, wrap addressing),
// launches sum with 4 threads and prints the 4 sampled values on one line.
// Returns 0 on success; exits with a failure status if any CUDA call fails.
int main()
{
    const int numTexels = 5;   // input data points; the kernel's 5.0f divisor assumes this width
    const int numSamples = 4;  // one sample between each pair of neighbouring data points
    float *a,*b;
    float *d_b;
    int i;

    a=(float*)malloc(sizeof(float)*numTexels);
    b=(float*)malloc(sizeof(float)*numSamples);
    if (a == NULL || b == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        return 1;
    }
    for(i=0;i<numTexels;i++)
        a[i]=i;

    // One 32-bit float channel per texel.
    cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    checkCuda(cudaMallocArray(&cuArray, &channelDesc, numTexels, 0), "cudaMallocArray");
    // NOTE(review): cudaMemcpyToArray is marked deprecated in recent CUDA toolkits;
    // it is kept here because it is the call this program was verified with.
    checkCuda(cudaMemcpyToArray(cuArray, 0, 0, a,sizeof(float)*numTexels,cudaMemcpyHostToDevice),
              "cudaMemcpyToArray");

    // The texture is backed by the CUDA array.
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeWrap;
    texDesc.addressMode[1] = cudaAddressModeWrap;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.readMode = cudaReadModeElementType;
    // Normalized coordinates: the kernel must pass floating-point coordinates
    // in the 0..1 range rather than integer texel indices.
    texDesc.normalizedCoords = 1;

    // Create texture object
    cudaTextureObject_t texObj = 0;
    checkCuda(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL), "cudaCreateTextureObject");

    checkCuda(cudaMalloc(&d_b, numSamples * sizeof(float)), "cudaMalloc");
    sum<<<1,numSamples>>>(d_b,texObj);
    // A kernel launch returns no status itself; launch errors are picked up here.
    checkCuda(cudaGetLastError(), "sum kernel launch");

    // Copy back exactly the elements the kernel wrote. The blocking copy also
    // reports any error raised while the kernel was executing.
    checkCuda(cudaMemcpy(b,d_b,numSamples*sizeof(float),cudaMemcpyDeviceToHost), "cudaMemcpy");
    for(i=0;i<numSamples;i++)
        printf("%f\t",b[i]);
    printf("\n");

    // Release device resources, then the host buffers.
    checkCuda(cudaDestroyTextureObject(texObj), "cudaDestroyTextureObject");
    checkCuda(cudaFreeArray(cuArray), "cudaFreeArray");
    checkCuda(cudaFree(d_b), "cudaFree");
    free(a);
    free(b);
    return 0;
}
$ nvcc -arch=sm_61 -o t3 t3.cu
$ cuda-memcheck ./t3
========= CUDA-MEMCHECK
0.500000 1.500000 2.500000 3.500000
========= ERROR SUMMARY: 0 errors
$
请注意,我还做了一些其他更改。特别是,我调整了采样点的位置和数量,使采样点落在您的 5 个数据点(0、1、2、3、4)中每两个相邻点之间并进行线性插值,因此总共输出 4 个值(0.5、1.5、2.5、3.5),它们分别是 5 个数据点之间的中点。
如果您想了解有关归一化坐标索引的更多信息,编程指南(the programming guide)中有相关介绍,边界模式(border modes)等其他概念也在其中。此外,还有各种 CUDA 示例代码演示了如何正确使用纹理。