将 jpeg 图像内容作为 CUDA 纹理传递

Pass a jpeg image contents as a CUDA Texture

想将 jpeg 文件(3 字节 RGB)的内容作为纹理传递给 CUDA 内核,但出现编译错误

a pointer to a bound function may only be used to call the function

value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f; 和其余 tex2D() 次通话中。

错误的可能原因是什么?

创建纹理的主机端代码:

cudaArray* cudaArray;
    cudaTextureObject_t textureObject{};
    {
        const static uint32_t bytesPerPixel{ 3u };
        uint8_t* pHostData;
        int32_t textureWidth, textureHeight;
        uint32_t bytesPerScanline;
        cudaChannelFormatDesc channelFormatDesc;
        cudaResourceDesc  resourceDesc{};
        cudaTextureDesc textureDesc{};
        int32_t componentsPerPixel = bytesPerPixel;
        pHostData = stbi_load(textureFilename.c_str(), &textureWidth, &textureHeight, &componentsPerPixel, componentsPerPixel);
        if (nullptr == pHostData) {
            std::cerr << "ERROR: Could not load texture image file '" << textureFilename << std::endl;
            return;
        }
        bytesPerScanline = bytesPerPixel * textureWidth;

        channelFormatDesc = cudaCreateChannelDesc<uint8_t>();
        checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));
        checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
        
        resourceDesc.resType = cudaResourceTypeArray;
        resourceDesc.res.array.array = cudaArray;

        textureDesc.normalizedCoords = true;
        textureDesc.filterMode = cudaFilterModePoint;
        textureDesc.addressMode[0] = cudaAddressModeWrap;
        textureDesc.addressMode[1] = cudaAddressModeWrap;
        textureDesc.readMode = cudaReadModeElementType;
        checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));
        STBI_FREE(pHostData);
    }

设备端代码:

class imageTexture {
public:
    __device__ imageTexture(cudaTextureObject_t tex) :_texture(tex) {}
    __device__ virtual vec3 value(float u, float v, const vec3& p) const {
        vec3 value;
        u *= 3;
        value.x = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
        u++;
        value.y = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
        u++;
        value.z = tex2D<unsigned char>(_texture, u, v) * 1.0f / 255.0f;
        return value;
    }
private:
    cudaTextureObject_t _texture;
};

更改了设备端功能,但错误仍然存​​在:

class imageTexture :public textureX {
public:
    __device__ imageTexture(cudaTextureObject_t tex) :_text(tex) {}
    __device__ virtual vec3 value(float u, float v, const vec3& p) const override {
        vec3 val;
        u *= 3;
        val.x = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
        u++;
        val.y = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
        u++;
        val.z = tex2D<unsigned char>(_text, u, v) * 1.0f / 255.0f;
        return val;
    }
private:
    cudaTextureObject_t _text;
};

我已经编写了一个新的测试程序并计划以此为基础进行构建。这个想法是让每个线程从纹理中读取 3 个值并将其写回缓冲区。只有第一个三连音是正确的。我的纹理查找中是否有任何不一致的地方:

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>

#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>

#include <stdio.h>

#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )
void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
    if (result) {
        std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
        std::cerr << cudaGetErrorString(result) << std::endl;
        // Make sure we call CUDA Device Reset before exiting
        cudaDeviceReset();
        exit(99);
    }
}

__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
    uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
    uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
    if ((x < width) && (y < height)) {
        float u = (float)x / (float)width;
        float v = (float)y / (float)height;
        pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, 3*u, v);
        pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, 3*u + 1, v);
        pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, 3*u + 2, v);
    }
}

void cudaTex() {
    const uint32_t bytesPerPixel{ 3u };
    const uint32_t textureWidth = 1024u;
    const uint32_t textureHeight = 512u;
    uint32_t bytesPerScanline;

    bytesPerScanline = bytesPerPixel * textureWidth;

    cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
    cudaArray* cudaArray;
    checkCudaErrors(cudaMallocArray(&cudaArray, &channelFormatDesc, bytesPerScanline, textureHeight));

    uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
    std::srand(std::time(nullptr));
    for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
        pHostData[idx] = std::rand();
    checkCudaErrors(cudaMemcpyToArray(cudaArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));

    cudaResourceDesc resourceDesc{};
    resourceDesc.resType = cudaResourceTypeArray;
    resourceDesc.res.array.array = cudaArray;

    cudaTextureDesc textureDesc{};
    textureDesc.normalizedCoords = false;
    textureDesc.filterMode = cudaFilterModePoint;
    textureDesc.addressMode[0] = cudaAddressModeWrap;
    textureDesc.addressMode[1] = cudaAddressModeWrap;
    textureDesc.readMode = cudaReadModeElementType;

    cudaTextureObject_t textureObject{};
    checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));

    dim3 dimBlock(8u, 8u, 1u);
    dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
    uint8_t* dOutput{ nullptr };
    checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
    texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));

    for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
        for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
            if (hOutput[jdx] != pHostData[jdx])
                std::cerr << "Mismatch @ " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
        }
        hOutput += bytesPerScanline;
        pHostData += bytesPerScanline;
    }

    checkCudaErrors(cudaDestroyTextureObject(textureObject));
    checkCudaErrors(cudaFree(dOutput));
    checkCudaErrors(cudaFreeArray(cudaArray));

    delete[] hOutput;
    delete[] pHostData;
}

int main() {
    cudaTex();
    return 0;
}

切换到在内核中协调的整数解决了问题

原问题的解决

原来

a pointer to a bound function may only be used to call the function

错误是由 vec3 class 具有 getter 函数 x() 而不是名为 x 的成员变量引起的。所以代码尝试使用 getter 函数作为 l-value!!!

您现在发布的代码存在几个问题:

  1. 经过评论区的讨论,希望大家能搞清楚这行代码有什么问题:

    cudaArray* cudaArray;
    
  2. 您的内核代码似乎在尝试传递规范化的 float 坐标,但操作不正确。这里有几个问题:你的 x 标准化正在考虑 textureWidth 但它应该在 3*textureWidth 上完成(即 bytesPerScanline)。尽管您称纹理的宽度为 textureWidth,但实际上它是 3*textureWidth。此外,这种方式的纹理通常偏移 0.5。最后,你这样做:

    textureDesc.normalizedCoords = false;
    

    但是如果你想使用 float 坐标(似乎是你想要的)你应该这样做:

    textureDesc.normalizedCoords = true;
    
  3. 在您解决所有这些问题后,您将 运行 陷入非 CUDA 问题。您正在修改这些指针:

     hOutput += bytesPerScanline;
     pHostData += bytesPerScanline;
    

    修改后尝试删除:

    delete[] hOutput;
    delete[] pHostData;
    

    这将无法正常工作。

这是解决了上述问题的修改代码,对我来说似乎 运行 正确:

$ cat t7.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>

#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
#include <ctime>
#include <cstdint>

#include <stdio.h>

#define checkCudaErrors(val) check_cuda( (val), #val, __FILE__, __LINE__ )

void check_cuda(cudaError_t result, char const* const func, const char* const file, int const line) {
    if (result) {
        std::cerr << "CUDA error = " << static_cast<unsigned int>(result) << " at " << file << ":" << line << " " << func << std::endl;
        std::cerr << cudaGetErrorString(result) << std::endl;
        // Make sure we call CUDA Device Reset before exiting
        cudaDeviceReset();
        exit(99);
    }
}

__global__ void texCheck(uint32_t width, uint32_t height, uint8_t* pOutput, cudaTextureObject_t textureObject) {
    uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
    uint32_t y = blockIdx.y * blockDim.y + threadIdx.y;
    const float pix_offset = 0.5;
    if ((x < width) && (y < height)) {
        float u = (float)(3*x+pix_offset) / (float)(3*width);
        float v = (float)y / (float)height;
        pOutput[y * (3 * width) + (3 * x)] = tex2D<uint8_t>(textureObject, u, v);
        u = (float)(3*x+1+pix_offset) / (float)(3*width);
        pOutput[y * (3 * width) + (3 * x) + 1] = tex2D<uint8_t>(textureObject, u, v);
        u = (float)(3*x+2+pix_offset) / (float)(3*width);
        pOutput[y * (3 * width) + (3 * x) + 2] = tex2D<uint8_t>(textureObject, u, v);
    }
}

void cudaTex() {
    const uint32_t bytesPerPixel{ 3u };
    const uint32_t textureWidth = 1024u;
    const uint32_t textureHeight = 512u;
    uint32_t bytesPerScanline;

    bytesPerScanline = bytesPerPixel * textureWidth;

    cudaChannelFormatDesc channelFormatDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
    cudaArray* cArray;
    checkCudaErrors(cudaMallocArray(&cArray, &channelFormatDesc, bytesPerScanline, textureHeight));
    uint8_t* pHostData = new uint8_t[bytesPerScanline * textureHeight];
    std::srand(std::time(nullptr));
    for (uint64_t idx = 0ull; idx < bytesPerScanline * textureHeight; idx++)
        pHostData[idx] = std::rand();
    checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));

    cudaResourceDesc resourceDesc{};
    resourceDesc.resType = cudaResourceTypeArray;
    resourceDesc.res.array.array = cArray;

    cudaTextureDesc textureDesc{};
    textureDesc.normalizedCoords = true;
    textureDesc.filterMode = cudaFilterModePoint;
    textureDesc.addressMode[0] = cudaAddressModeWrap;
    textureDesc.addressMode[1] = cudaAddressModeWrap;
    textureDesc.readMode = cudaReadModeElementType;

    cudaTextureObject_t textureObject{};
    checkCudaErrors(cudaCreateTextureObject(&textureObject, &resourceDesc, &textureDesc, nullptr));

    dim3 dimBlock(8u, 8u, 1u);
    dim3 dimGrid(textureWidth / dimBlock.x, textureHeight / dimBlock.y, 1u);
    uint8_t* dOutput{ nullptr };
    checkCudaErrors(cudaMalloc((void**)&dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t)));
    texCheck << < dimGrid, dimBlock >> > (textureWidth, textureHeight, dOutput, textureObject);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
    uint8_t* hOutput = new uint8_t[bytesPerScanline * textureHeight];
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyDeviceToHost));
    uint8_t *my_hOutput = hOutput;
    uint8_t *my_pHostData = pHostData;
    for (uint64_t idx = 0ull; idx < textureHeight; idx++) {
        for (uint64_t jdx = 0ull; jdx < bytesPerScanline; jdx++) {
            if (hOutput[jdx] != pHostData[jdx]){
                std::cerr << "Mismatch @ " << idx << " " << jdx << " Expected " << (uint32_t)pHostData[jdx] << " Received " << (uint32_t)hOutput[jdx] << std::endl;
            return;}
        }
        hOutput += bytesPerScanline;
        pHostData += bytesPerScanline;
    }
    checkCudaErrors(cudaDestroyTextureObject(textureObject));
    checkCudaErrors(cudaFree(dOutput));
    checkCudaErrors(cudaFreeArray(cArray));

    delete[] my_hOutput;
    delete[] my_pHostData;
}

int main() {
    cudaTex();
    return 0;
}
$ nvcc -o t7 t7.cu -std=c++11
t7.cu: In function ‘void cudaTex()’:
t7.cu:56:12: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
     checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
            ^
t7.cu:56:131: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated (declared at /usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6782) [-Wdeprecated-declarations]
     checkCudaErrors(cudaMemcpyToArray(cArray, 0, 0, pHostData, bytesPerScanline * textureHeight * sizeof(uint8_t), cudaMemcpyHostToDevice));
                                                                                                                                   ^
$ cuda-memcheck ./t7
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

我并不是说上面的代码没有缺陷。这主要是你的代码。我只是指出一些我发现的东西。您可以阅读有关如何解决弃用警告的信息 here