Cuda理解纹理获取
Cuda understanding texture fetching
我写了一个简单的代码示例我有 cuda 版本 6.5
#include <iostream>
#include <cstdio>
#include "cudaerror.h"
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
texture<unsigned int, cudaTextureType2D, cudaReadModeElementType> texRef;
__global__
void kernel1(int64_t N){
int64_t tid_x = threadIdx.x;
int64_t tid_y = threadIdx.y;
if(tid_x < N && tid_y < N){
unsigned int temp = tex2D(texRef, tid_x, tid_y);
printf("tid_x: %d, tid_y: %d, tex: %d\n", tid_x, tid_y, temp);
}
}
void alloc_darrays(cudaArray* &d_adj_mat){
unsigned int* adj_mat = new unsigned int[9]();
adj_mat[3] = 1;
adj_mat[4] = 1;
adj_mat[5] = 1;
adj_mat[0] = 1;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindUnsigned);
gpuErrchk( cudaMallocArray(&d_adj_mat, &channelDesc, 3, 3) );
gpuErrchk( cudaMemcpyToArray(d_adj_mat, 0, 0, adj_mat, 9*sizeof(int), cudaMemcpyHostToDevice) );
texRef.addressMode[0] = cudaAddressModeBorder;
texRef.addressMode[1] = cudaAddressModeBorder;
texRef.filterMode = cudaFilterModePoint;
texRef.normalized = false;
gpuErrchk( cudaBindTextureToArray(texRef, d_adj_mat, channelDesc) );
}
int main(){
cudaArray* d_adj_mat;
alloc_darrays(d_adj_mat);
dim3 numthreads(3,3);
kernel1<<<1,numthreads>>>(3);
cudaDeviceSynchronize();
return 0;
}
产生的输出是:
tid_x: 0, tid_y: 0, tex: 0
tid_x: 1, tid_y: 0, tex: 0
tid_x: 2, tid_y: 0, tex: 0
tid_x: 0, tid_y: 0, tex: 1
tid_x: 1, tid_y: 0, tex: 1
tid_x: 2, tid_y: 0, tex: 1
tid_x: 0, tid_y: 0, tex: 2
tid_x: 1, tid_y: 0, tex: 2
tid_x: 2, tid_y: 0, tex: 2
我无法理解 2 的来源
为什么 tid_y 总是 0.
代码有问题还是我对纹理的理解有误?
您看到的是使用不正确内核 printf
格式字符串的产物。
您可能想要做的是:
printf("tid_x: %ld, tid_y: %ld, tex: %u\n", tid_x, tid_y, temp);
因为您的 tid_x
和 tid_y
变量是长整数。如果你改变它,内核应该按预期工作。
我写了一个简单的代码示例我有 cuda 版本 6.5
#include <iostream>
#include <cstdio>
#include "cudaerror.h"
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
texture<unsigned int, cudaTextureType2D, cudaReadModeElementType> texRef;
__global__
void kernel1(int64_t N){
int64_t tid_x = threadIdx.x;
int64_t tid_y = threadIdx.y;
if(tid_x < N && tid_y < N){
unsigned int temp = tex2D(texRef, tid_x, tid_y);
printf("tid_x: %d, tid_y: %d, tex: %d\n", tid_x, tid_y, temp);
}
}
void alloc_darrays(cudaArray* &d_adj_mat){
unsigned int* adj_mat = new unsigned int[9]();
adj_mat[3] = 1;
adj_mat[4] = 1;
adj_mat[5] = 1;
adj_mat[0] = 1;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindUnsigned);
gpuErrchk( cudaMallocArray(&d_adj_mat, &channelDesc, 3, 3) );
gpuErrchk( cudaMemcpyToArray(d_adj_mat, 0, 0, adj_mat, 9*sizeof(int), cudaMemcpyHostToDevice) );
texRef.addressMode[0] = cudaAddressModeBorder;
texRef.addressMode[1] = cudaAddressModeBorder;
texRef.filterMode = cudaFilterModePoint;
texRef.normalized = false;
gpuErrchk( cudaBindTextureToArray(texRef, d_adj_mat, channelDesc) );
}
int main(){
cudaArray* d_adj_mat;
alloc_darrays(d_adj_mat);
dim3 numthreads(3,3);
kernel1<<<1,numthreads>>>(3);
cudaDeviceSynchronize();
return 0;
}
产生的输出是:
tid_x: 0, tid_y: 0, tex: 0
tid_x: 1, tid_y: 0, tex: 0
tid_x: 2, tid_y: 0, tex: 0
tid_x: 0, tid_y: 0, tex: 1
tid_x: 1, tid_y: 0, tex: 1
tid_x: 2, tid_y: 0, tex: 1
tid_x: 0, tid_y: 0, tex: 2
tid_x: 1, tid_y: 0, tex: 2
tid_x: 2, tid_y: 0, tex: 2
我无法理解 2 的来源 为什么 tid_y 总是 0.
代码有问题还是我对纹理的理解有误?
您看到的是使用不正确内核 printf
格式字符串的产物。
您可能想要做的是:
printf("tid_x: %ld, tid_y: %ld, tex: %u\n", tid_x, tid_y, temp);
因为您的 tid_x
和 tid_y
变量是长整数。如果你改变它,内核应该按预期工作。