Cuda理解纹理获取

Cuda understanding texture fetching

我写了一个简单的代码示例我有 cuda 版本 6.5

#include <iostream>
#include <cstdio>
#include "cudaerror.h"
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

texture<unsigned int, cudaTextureType2D, cudaReadModeElementType> texRef;

__global__
void kernel1(int64_t N){
    int64_t tid_x = threadIdx.x;
    int64_t tid_y = threadIdx.y;

    if(tid_x < N && tid_y < N){
        unsigned int temp = tex2D(texRef, tid_x, tid_y);
        printf("tid_x: %d, tid_y: %d, tex: %d\n", tid_x, tid_y, temp);
    }
}

void alloc_darrays(cudaArray* &d_adj_mat){
    unsigned int* adj_mat = new unsigned int[9]();

    adj_mat[3] = 1;
    adj_mat[4] = 1;
    adj_mat[5] = 1;
    adj_mat[0] = 1;

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindUnsigned);

    gpuErrchk( cudaMallocArray(&d_adj_mat, &channelDesc, 3, 3) );
    gpuErrchk( cudaMemcpyToArray(d_adj_mat, 0, 0, adj_mat, 9*sizeof(int), cudaMemcpyHostToDevice) );

    texRef.addressMode[0] = cudaAddressModeBorder;
    texRef.addressMode[1] = cudaAddressModeBorder;
    texRef.filterMode = cudaFilterModePoint;
    texRef.normalized = false;

    gpuErrchk( cudaBindTextureToArray(texRef, d_adj_mat, channelDesc) );
}

int main(){
    cudaArray* d_adj_mat;
    alloc_darrays(d_adj_mat);
    dim3 numthreads(3,3);
    kernel1<<<1,numthreads>>>(3);
    cudaDeviceSynchronize();
    return 0;
}

产生的输出是:

tid_x: 0, tid_y: 0, tex: 0
tid_x: 1, tid_y: 0, tex: 0
tid_x: 2, tid_y: 0, tex: 0
tid_x: 0, tid_y: 0, tex: 1
tid_x: 1, tid_y: 0, tex: 1
tid_x: 2, tid_y: 0, tex: 1
tid_x: 0, tid_y: 0, tex: 2
tid_x: 1, tid_y: 0, tex: 2
tid_x: 2, tid_y: 0, tex: 2

我无法理解 2 的来源 为什么 tid_y 总是 0.

代码有问题还是我对纹理的理解有误?

您看到的是使用不正确内核 printf 格式字符串的产物。

您可能想要做的是:

printf("tid_x: %ld, tid_y: %ld, tex: %u\n", tid_x, tid_y, temp);

因为您的 tid_xtid_y 变量是长整数。如果你改变它,内核应该按预期工作。