将 jpeg 图像内容作为 CUDA 纹理传递
Pass a jpeg image contents as a CUDA Texture
想将 jpeg 文件(3 字节 RGB)的内容作为纹理传递给 CUDA 内核,但出现编译错误
a pointer to a bound function may only be used to call the function
在 value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
和其余 tex2D()
次通话中。
错误的可能原因是什么?
创建纹理的主机端代码:
cudaArray* cudaArray;
cudaTextureObject_t textureObject{};
{
const static uint32_t bytesPerPixel{ 3u };
uint8_t* pHostData;
int32_t textureWidth, textureHeight;
uint32_t bytesPerScanline;
cudaChannelFormatDesc channelFormatDesc;
cudaResourceDesc resourceDesc{};
cudaTextureDesc textureDesc{};
int32_t componentsPerPixel = bytesPerPixel;
pHostData = stbi_load(textureFilename.c_str(), &textureWidth, &textureHeight, &componentsPerPixel, componentsPerPixel);
if (nullptr == pHostData) {
std::cerr << "ERROR: Could not load texture image file '" << textureFilename << std::endl;
return;
}
bytesPerScanline = bytesPerPixel * textureWidth;
channelFormatDesc = cudaCreateChannelDesc<uint8_t>();
checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));
checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cudaArray;
textureDesc.normalizedCoords = true;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
STBI_FREE(pHostData);
}
设备端代码:
class imageTexture {
public:
__device__ imageTexture(cudaTextureObject_t tex) :_texture(tex) {}
__device__ virtual vec3 value(float u, float v, const vec3& p) const {
vec3 value;
u *= 3;
value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
u++;
value.y = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
u++;
value.z = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
return value;
}
private:
cudaTextureObject_t _texture;
};
更改了设备端功能,但错误仍然存在:
class imageTexture :public textureX {
public:
__device__ imageTexture(cudaTextureObject_t tex) :_text(tex) {}
__device__ virtual vec3 value(float u, float v, const vec3& p) const override {
vec3 val;
u *= 3;
val.x = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
u++;
val.y = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
u++;
val.z = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
return val;
}
private:
cudaTextureObject_t _text;
};
我已经编写了一个新的测试程序并计划以此为基础进行构建。这个想法是让每个线程从纹理中读取 3 个值并将其写回缓冲区。只有第一个三连音是正确的。我的纹理查找中是否有任何不一致的地方:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>
#include <stdio.h>
#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
if (result) {
std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
std::cerr << cudaGetErrorString(result) << std::endl;
// Make sure we call CUDA Device Reset before exiting
cudaDeviceReset();
exit(99);
}
}
__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < width) && (y < height)) {
float u = (float)x / (float)width;
float v = (float)y / (float)height;
pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, 3*u, v);
pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, 3*u + 1, v);
pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, 3*u + 2, v);
}
}
void cudaTex() {
const uint32_t bytesPerPixel{ 3u };
const uint32_t textureWidth = 1024u;
const uint32_t textureHeight = 512u;
uint32_t bytesPerScanline;
bytesPerScanline = bytesPerPixel * textureWidth;
cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaArray* cudaArray;
checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));
uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
std::srand(std::time(nullptr));
for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
pHostData[idx] = std::rand();
checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
cudaResourceDesc resourceDesc{};
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cudaArray;
cudaTextureDesc textureDesc{};
textureDesc.normalizedCoords = false;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t textureObject{};
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
dim3 dimBlock(8u, 8u, 1u);
dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
uint8_t* dOutput{ nullptr };
checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));
for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
if (hOutput[jdx] != pHostData[jdx])
std::cerr << "Mismatch @ " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
}
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
}
checkCudaErrors(cudaDestroyTextureObject(textureObject));
checkCudaErrors(cudaFree(dOutput));
checkCudaErrors(cudaFreeArray(cudaArray));
delete[] hOutput;
delete[] pHostData;
}
int main() {
cudaTex();
return 0;
}
切换到在内核中协调的整数解决了问题
原问题的解决
原来
a pointer to a bound function may only be used to call the function
错误是由 vec3
class 具有 getter 函数 x()
而不是名为 x
的成员变量引起的。所以代码尝试使用 getter 函数作为 l-value!!!
您现在发布的代码存在几个问题:
经过评论区的讨论,希望大家能搞清楚这行代码有什么问题:
cudaArray* cudaArray;
您的内核代码似乎在尝试传递规范化的 float
坐标,但操作不正确。这里有几个问题:你的 x 标准化正在考虑 textureWidth
但它应该在 3*textureWidth
上完成(即 bytesPerScanline
)。尽管您称纹理的宽度为 textureWidth
,但实际上它是 3*textureWidth
。此外,这种方式的纹理通常偏移 0.5。最后,你这样做:
textureDesc.normalizedCoords = false;
但是如果你想使用 float
坐标(似乎是你想要的)你应该这样做:
textureDesc.normalizedCoords = true;
在您解决所有这些问题后,您将 运行 陷入非 CUDA 问题。您正在修改这些指针:
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
修改后尝试删除:
delete[] hOutput;
delete[] pHostData;
这将无法正常工作。
这是解决了上述问题的修改代码,对我来说似乎 运行 正确:
$ cat t7.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>
#include <stdio.h>
#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
if (result) {
std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
std::cerr << cudaGetErrorString(result) << std::endl;
// Make sure we call CUDA Device Reset before exiting
cudaDeviceReset();
exit(99);
}
}
__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
const float pix_offset = 0.5;
if ((x < width) && (y < height)) {
float u = (float)(3*x+pix_offset) / (float)(3*width);
float v = (float)y / (float)height;
pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, u, v);
u = (float)(3*x+1+pix_offset) / (float)(3*width);
pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, u, v);
u = (float)(3*x+2+pix_offset) / (float)(3*width);
pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, u, v);
}
}
void cudaTex() {
const uint32_t bytesPerPixel{ 3u };
const uint32_t textureWidth = 1024u;
const uint32_t textureHeight = 512u;
uint32_t bytesPerScanline;
bytesPerScanline = bytesPerPixel * textureWidth;
cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaArray* cArray;
checkCudaErrors(cudaMallocArray(&cArray, &channelFormatDesc, bytesPerScanline, textureHeight));
uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
std::srand(std::time(nullptr));
for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
pHostData[idx] = std::rand();
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
cudaResourceDesc resourceDesc{};
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cArray;
cudaTextureDesc textureDesc{};
textureDesc.normalizedCoords = true;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t textureObject{};
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
dim3 dimBlock(8u, 8u, 1u);
dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
uint8_t* dOutput{ nullptr };
checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));
uint8_t *my_hOutput = hOutput;
uint8_t *my_pHostData = pHostData;
for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
if (hOutput[jdx] != pHostData[jdx]){
std::cerr << "Mismatch @ " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
return;}
}
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
}
checkCudaErrors(cudaDestroyTextureObject(textureObject));
checkCudaErrors(cudaFree(dOutput));
checkCudaErrors(cudaFreeArray(cArray));
delete[] my_hOutput;
delete[] my_pHostData;
}
int main() {
cudaTex();
return 0;
}
$ nvcc -o t7 t7.cu -std=c++11
t7.cu: In function ‘void cudaTex()’:
t7.cu:56:12: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
^
t7.cu:56:131: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
^
$ cuda-memcheck ./t7
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
我并不是说上面的代码没有缺陷。这主要是你的代码。我只是指出一些我发现的东西。您可以阅读有关如何解决弃用警告的信息 here。
想将 jpeg 文件(3 字节 RGB)的内容作为纹理传递给 CUDA 内核,但出现编译错误
a pointer to a bound function may only be used to call the function
在 value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
和其余 tex2D()
次通话中。
错误的可能原因是什么?
创建纹理的主机端代码:
cudaArray* cudaArray;
cudaTextureObject_t textureObject{};
{
const static uint32_t bytesPerPixel{ 3u };
uint8_t* pHostData;
int32_t textureWidth, textureHeight;
uint32_t bytesPerScanline;
cudaChannelFormatDesc channelFormatDesc;
cudaResourceDesc resourceDesc{};
cudaTextureDesc textureDesc{};
int32_t componentsPerPixel = bytesPerPixel;
pHostData = stbi_load(textureFilename.c_str(), &textureWidth, &textureHeight, &componentsPerPixel, componentsPerPixel);
if (nullptr == pHostData) {
std::cerr << "ERROR: Could not load texture image file '" << textureFilename << std::endl;
return;
}
bytesPerScanline = bytesPerPixel * textureWidth;
channelFormatDesc = cudaCreateChannelDesc<uint8_t>();
checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));
checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cudaArray;
textureDesc.normalizedCoords = true;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
STBI_FREE(pHostData);
}
设备端代码:
class imageTexture {
public:
__device__ imageTexture(cudaTextureObject_t tex) :_texture(tex) {}
__device__ virtual vec3 value(float u, float v, const vec3& p) const {
vec3 value;
u *= 3;
value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
u++;
value.y = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
u++;
value.z = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
return value;
}
private:
cudaTextureObject_t _texture;
};
更改了设备端功能,但错误仍然存在:
class imageTexture :public textureX {
public:
__device__ imageTexture(cudaTextureObject_t tex) :_text(tex) {}
__device__ virtual vec3 value(float u, float v, const vec3& p) const override {
vec3 val;
u *= 3;
val.x = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
u++;
val.y = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
u++;
val.z = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
return val;
}
private:
cudaTextureObject_t _text;
};
我已经编写了一个新的测试程序并计划以此为基础进行构建。这个想法是让每个线程从纹理中读取 3 个值并将其写回缓冲区。只有第一个三连音是正确的。我的纹理查找中是否有任何不一致的地方:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>
#include <stdio.h>
#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
if (result) {
std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
std::cerr << cudaGetErrorString(result) << std::endl;
// Make sure we call CUDA Device Reset before exiting
cudaDeviceReset();
exit(99);
}
}
__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < width) && (y < height)) {
float u = (float)x / (float)width;
float v = (float)y / (float)height;
pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, 3*u, v);
pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, 3*u + 1, v);
pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, 3*u + 2, v);
}
}
void cudaTex() {
const uint32_t bytesPerPixel{ 3u };
const uint32_t textureWidth = 1024u;
const uint32_t textureHeight = 512u;
uint32_t bytesPerScanline;
bytesPerScanline = bytesPerPixel * textureWidth;
cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaArray* cudaArray;
checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));
uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
std::srand(std::time(nullptr));
for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
pHostData[idx] = std::rand();
checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
cudaResourceDesc resourceDesc{};
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cudaArray;
cudaTextureDesc textureDesc{};
textureDesc.normalizedCoords = false;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t textureObject{};
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
dim3 dimBlock(8u, 8u, 1u);
dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
uint8_t* dOutput{ nullptr };
checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));
for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
if (hOutput[jdx] != pHostData[jdx])
std::cerr << "Mismatch @ " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
}
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
}
checkCudaErrors(cudaDestroyTextureObject(textureObject));
checkCudaErrors(cudaFree(dOutput));
checkCudaErrors(cudaFreeArray(cudaArray));
delete[] hOutput;
delete[] pHostData;
}
int main() {
cudaTex();
return 0;
}
切换到在内核中协调的整数解决了问题
原问题的解决
原来
a pointer to a bound function may only be used to call the function
错误是由 vec3
class 具有 getter 函数 x()
而不是名为 x
的成员变量引起的。所以代码尝试使用 getter 函数作为 l-value!!!
您现在发布的代码存在几个问题:
经过评论区的讨论,希望大家能搞清楚这行代码有什么问题:
cudaArray* cudaArray;
您的内核代码似乎在尝试传递规范化的
float
坐标,但操作不正确。这里有几个问题:你的 x 标准化正在考虑textureWidth
但它应该在3*textureWidth
上完成(即bytesPerScanline
)。尽管您称纹理的宽度为textureWidth
,但实际上它是3*textureWidth
。此外,这种方式的纹理通常偏移 0.5。最后,你这样做:textureDesc.normalizedCoords = false;
但是如果你想使用
float
坐标(似乎是你想要的)你应该这样做:textureDesc.normalizedCoords = true;
在您解决所有这些问题后,您将 运行 陷入非 CUDA 问题。您正在修改这些指针:
hOutput += bytesPerScanline; pHostData += bytesPerScanline;
修改后尝试删除:
delete[] hOutput; delete[] pHostData;
这将无法正常工作。
这是解决了上述问题的修改代码,对我来说似乎 运行 正确:
$ cat t7.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>
#include <stdio.h>
#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
if (result) {
std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
std::cerr << cudaGetErrorString(result) << std::endl;
// Make sure we call CUDA Device Reset before exiting
cudaDeviceReset();
exit(99);
}
}
__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
const float pix_offset = 0.5;
if ((x < width) && (y < height)) {
float u = (float)(3*x+pix_offset) / (float)(3*width);
float v = (float)y / (float)height;
pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, u, v);
u = (float)(3*x+1+pix_offset) / (float)(3*width);
pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, u, v);
u = (float)(3*x+2+pix_offset) / (float)(3*width);
pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, u, v);
}
}
void cudaTex() {
const uint32_t bytesPerPixel{ 3u };
const uint32_t textureWidth = 1024u;
const uint32_t textureHeight = 512u;
uint32_t bytesPerScanline;
bytesPerScanline = bytesPerPixel * textureWidth;
cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaArray* cArray;
checkCudaErrors(cudaMallocArray(&cArray, &channelFormatDesc, bytesPerScanline, textureHeight));
uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
std::srand(std::time(nullptr));
for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
pHostData[idx] = std::rand();
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
cudaResourceDesc resourceDesc{};
resourceDesc.resType = cudaResourceTypeArray;
resourceDesc.res.array.array = cArray;
cudaTextureDesc textureDesc{};
textureDesc.normalizedCoords = true;
textureDesc.filterMode = cudaFilterModePoint;
textureDesc.addressMode[0] = cudaAddressModeWrap;
textureDesc.addressMode[1] = cudaAddressModeWrap;
textureDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t textureObject{};
checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
dim3 dimBlock(8u, 8u, 1u);
dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
uint8_t* dOutput{ nullptr };
checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));
uint8_t *my_hOutput = hOutput;
uint8_t *my_pHostData = pHostData;
for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
if (hOutput[jdx] != pHostData[jdx]){
std::cerr << "Mismatch @ " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
return;}
}
hOutput += bytesPerScanline;
pHostData += bytesPerScanline;
}
checkCudaErrors(cudaDestroyTextureObject(textureObject));
checkCudaErrors(cudaFree(dOutput));
checkCudaErrors(cudaFreeArray(cArray));
delete[] my_hOutput;
delete[] my_pHostData;
}
int main() {
cudaTex();
return 0;
}
$ nvcc -o t7 t7.cu -std=c++11
t7.cu: In function ‘void cudaTex()’:
t7.cu:56:12: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
^
t7.cu:56:131: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
^
$ cuda-memcheck ./t7
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
我并不是说上面的代码没有缺陷。这主要是你的代码。我只是指出一些我发现的东西。您可以阅读有关如何解决弃用警告的信息 here。