用于 16 位图像的 CUDA NPP 中值滤波器

CUDA NPP Median Filter for 16 bit images

最终更新:已解决。 WDDM 超时也是一个问题。从以下位置找到解决方案:WDDM timeout fix。谢谢罗伯特。

更新:感谢罗伯特指出过滤器的中心不是 0,0。不幸的是,如果过滤器增加,比如增加到 17x17,您发布的代码对我来说会中断。这可能是因为您没有考虑图像 "side" 上的边框。无论如何,这是最新的代码,但仍然表现出与以前相同的问题...

//npp
#include "npp.h"
#include "nppi.h"
#include "device_launch_parameters.h"

#include <iostream>

int main() {

    //Image size.
    int imageWidth = 6592; 
    int imageHeight = 4400;

    //Misc.
    int bytesPerPixel = 2;
    int totalPixels = imageWidth*imageHeight;
    int filterSize = 17;
    int halfFilter = filterSize/2;
    cudaError success2;
    NppStatus success1;

    //Mask & Origin for CUDA.
    NppiSize cudaMask; 
    cudaMask.height = filterSize; 
    cudaMask.width = filterSize;
    NppiPoint cudaAnchor;
    cudaAnchor.x = halfFilter;
    cudaAnchor.y = halfFilter;

    //ROI for CUDA.
    int left = halfFilter;
    int right = (imageWidth-1) - halfFilter;
    int top = halfFilter;
    int bot = (imageHeight-1) - halfFilter;
    NppiSize cudaROI;
    cudaROI.height  = bot - top;
    cudaROI.width   = right - left;

    //Step size.
    int step = imageWidth * bytesPerPixel;

    //Create a new "image".
    unsigned short* image = new unsigned short[totalPixels];
    for(int i=0; i<imageWidth; i++)
        for(int j=0; j<imageHeight; j++)
            image[j*imageWidth+i] = 10;

    //Allocate mem on device.
    Npp16u *dSrc, *dDst;
    Npp8u *dBuf;
    Npp32u bufferSize;

    //This call always returns a bufferSize==0.  That doesn't seem right...
    success1 = nppiFilterMedianGetBufferSize_16u_C1R(cudaROI, cudaMask, &bufferSize);
    std::cout << "get bufferSize returned: " << (int)success1 << std::endl;
    std::cout << bufferSize << std::endl;
    success2 = cudaMalloc( (void**)&dBuf, bufferSize);
    std::cout << "cudaMalloc 1 returned: " << (int)success2 << std::endl;
    success2 = cudaMalloc( (void**)&dSrc, totalPixels*sizeof(Npp16u));
    std::cout << "cudaMalloc 2 returned: " << (int)success2 << std::endl;
    success2 = cudaMalloc( (void**)&dDst, totalPixels*sizeof(Npp16u));
    std::cout << "cudaMalloc 3 returned: " << (int)success2 << std::endl;

    //Copy host image to device.
    success2 = cudaMemcpy( dSrc, image, totalPixels*sizeof(Npp16u), cudaMemcpyHostToDevice);
    std::cout << "cudaMemcpy 1 returned: " << (int)success2 << std::endl;


    //Copy source to destination.
    success1 = nppiCopy_16u_C1R( dSrc, step, dDst, step, cudaROI);
    std::cout << "npp Copy 1 returned: " << (int)success1 << std::endl;


    //Filter.
    Npp32u offset = top*step + left*bytesPerPixel;
    success1 = nppiFilterMedian_16u_C1R(    dSrc + offset,
                                            step,
                                            dDst + offset,
                                            step,
                                            cudaROI, cudaMask, cudaAnchor, dBuf);
    std::cout << "npp Filter  returned: " << (int)success1 << std::endl;


    //Copy resultant back to host.
    success2 = cudaMemcpy( image, dDst, totalPixels*sizeof(Npp16u), cudaMemcpyDeviceToHost);
    std::cout << "cudaMemcpy 2 returned: " << (int)success2 << std::endl;

    //Clean.
    success2 = cudaFree(dDst);
    success2 = cudaFree(dBuf);
    success2 = cudaFree(dSrc);
    delete image;

    system("pause");
    return 0;

}

我正在尝试计算 29mp 图像的中值滤波器。过滤器尺寸为 13x13。图像的宽度和高度如下所示。 由于未知原因,以下代码会崩溃,我想问问有没有人知道为什么?

我注意到的奇怪事情:

  1. 错误发生在nppiFilterMedian_16u_C1R()。函数本身 return 没有错误条件,但下面的 cudaMemcpy() 会。没有过滤器,cudaMemcpy() 工作得很好。

  2. 此外,获取 16 位过滤器的缓冲区大小总是 return 大小为 0。我测试了 8 位和 32 位,return 非零值...

  3. 我认为这可能是 NPPI 库的错误 (?)。 它似乎与大小有关(如果你使用缩小图像的 width/height 它对于 13x13 的过滤器大小来说效果很好)。 但是,我的过滤器大小需要达到31x31.

其他重要信息: Windows x64 应用程序,CUDA 运行时 7.5,NPP 版本 7.5。 GPU 设备是 Quadro k2200(4GB 全局内存)。

中值滤波器函数将在图像上逐点传递一个掩码。此掩码具有指定的尺寸(原始代码中为 9x9)。锚点将决定如何为每个像素定位此蒙版。当锚点为0,0时,mask会这样定位:

p**
***
***

其中p表示像素位置,mask大小为3x3。对于 1,1 的锚点,每个像素的蒙版定位为:

***
*p*
***

因此我们看到锚点和掩码大小将确定每个像素周围的某个边界,中值滤波器函数必须可以访问该边界。在处理图像边界中的像素时,我们必须确保该边界落在有效像素上。

你开始的情况,一个 9x9 掩码和 0,0 锚点,意味着我们只需要 "extra" 个像素作为图像 "end" 处的边界。因此修改很简单:限制 ROI 高度,使其不处理图像的最后几行,对应于 mask 维度。对于这种情况,我们可以简单地从 ROI 高度中减去 10,错误就会消失:

$ cat t1223.cu
//npp
#include "npp.h"
#include "nppi.h"
#include <iostream>

int main() {

//When the filter size is 9x9....
int imageWidth = 6592; //breaks if > 5914 && imageHeight = 4400
int imageHeight = 4400; //breaks if > 3946 && imageWidth = 6592

//Misc.
int bytesPerPixel = 2;
int totalPixels = imageWidth*imageHeight;
cudaError success2;
NppStatus success1;

//ROI for CUDA.
NppiSize cudaROI;
cudaROI.height  = imageHeight-10;
cudaROI.width   = imageWidth;

//Mask & Origin for CUDA.
NppiSize cudaMask; NppiPoint cudaAnchor;
cudaMask.height = 9; //filter size
cudaMask.width = 9;
cudaAnchor.x = 0;
cudaAnchor.y = 0;

//Step size.
int step = imageWidth * bytesPerPixel;

//Create a new "image".
unsigned short* image = new unsigned short[totalPixels];
for(int i=0; i<imageWidth; i++)
    for(int j=0; j<imageHeight; j++)
        image[j*imageWidth+i] = 10;


//Allocate mem on device.
Npp16u *dSrc, *dDst;
Npp8u *dBuf;
Npp32u bufferSize;

//This call always returns a bufferSize==0.  That doesn't seem right...
success1 = nppiFilterMedianGetBufferSize_16u_C1R(cudaROI, cudaMask, &bufferSize);
std::cout << "get bufferSize returned: " << (int)success1 << std::endl;
std::cout << bufferSize << std::endl;
success2 = cudaMalloc( (void**)&dBuf, bufferSize);
std::cout << "cudaMalloc 1 returned: " << (int)success2 << std::endl;
success2 = cudaMalloc( (void**)&dSrc, totalPixels*sizeof(Npp16u));
std::cout << "cudaMalloc 2 returned: " << (int)success2 << std::endl;
success2 = cudaMalloc( (void**)&dDst, totalPixels*sizeof(Npp16u));
std::cout << "cudaMalloc 3 returned: " << (int)success2 << std::endl;

//Copy host image to device.
success2 = cudaMemcpy( dSrc, image, totalPixels*sizeof(Npp16u), cudaMemcpyHostToDevice);
std::cout << "cudaMemcpy 1 returned: " << (int)success2 << std::endl;

//Copy source to destination.
success1 = nppiCopy_16u_C1R( dSrc, step, dDst, step, cudaROI);
std::cout << "npp Copy 1 returned: " << (int)success1 << std::endl;

//Filter.
success1 = nppiFilterMedian_16u_C1R(dSrc,
                                    step,
                                    dDst,
                                    step,
                                    cudaROI, cudaMask, cudaAnchor, dBuf);
std::cout << "npp Filter  returned: " << (int)success1 << std::endl;

//Copy resultant back to host.
success2 = cudaMemcpy( image, dDst, totalPixels*sizeof(Npp16u), cudaMemcpyDeviceToHost);
std::cout << "cudaMemcpy 2 returned: " << (int)success2 << std::endl;

//Clean.
success2 = cudaFree(dBuf);
success2 = cudaFree(dSrc);
success2 = cudaFree(dDst);
delete image;

return 0;
}
$ nvcc -arch=sm_35 -o t1223 t1223.cu -lnppi
$ cuda-memcheck ./t1223
========= CUDA-MEMCHECK
get bufferSize returned: 0
0
cudaMalloc 1 returned: 0
cudaMalloc 2 returned: 0
cudaMalloc 3 returned: 0
cudaMemcpy 1 returned: 0
npp Copy 1 returned: 0
npp Filter  returned: 0
cudaMemcpy 2 returned: 0
========= ERROR SUMMARY: 0 errors
$

请注意,如果锚点已移动(例如,移动到 4,4 而不是上述情况中的 0,0),则这意味着 "boundary" 像素需要可用 ~图片开始前 5 行 。我们可以通过正确设置 ROI 并偏移处理的开始来解决这个问题,方法是向传递给中值滤波器的源指针添加行偏移,如下所示:

success1 = nppiFilterMedian_16u_C1R(dSrc + 5*imageWidth,

请注意,我并不想在这里提供有关中值滤波的完整教程,只是试图找出导致实际功能失败的问题。您可能还需要考虑左侧和右侧过滤掩码边界。在图像边界的左侧和右侧,那些像素蒙版边界可能会索引到上一个或下一个图像行,因此 "wrapping" 图像,可能在过滤后的像素中具有奇怪的效果。

编辑:响应新代码发布,现在的主要问题似乎是您不了解如何偏移图像。

在C/C++中,如果我有一个指针,并且我想将该指针偏移一定数量的元素,我只需添加我想要偏移的元素数量。无需按字节缩放。如果你研究过我之前在上面给出的偏移示例,你会注意到没有按字节缩放任何东西。如果我们想偏移 5 行,它只是 5 乘以图像宽度,如上所示。

此外,您使用cudaROI 来通知您的src->dst 复制操作,这对我来说没有意义,所以我修改了它。最后,我修改了代码,使其可以使用角落的锚点或中心的锚点来构建。

这是对你的代码的修改,在两种锚点情况下都能为我正确编译和运行:

$ cat t1225.cu
//npp
#include "npp.h"
#include "nppi.h"
#include "device_launch_parameters.h"

#include <iostream>

int main() {

    //Image size.
    int imageWidth = 6592;
    int imageHeight = 4400;

    //Misc.
    int bytesPerPixel = 2;
    int totalPixels = imageWidth*imageHeight;
    int filterSize = 17;
    int halfFilter = filterSize/2;
    cudaError success2;
    NppStatus success1;

    //Mask & Origin for CUDA.
    NppiSize cudaMask;
    cudaMask.height = filterSize;
    cudaMask.width = filterSize;
    NppiPoint cudaAnchor;
#ifndef ANCHOR_CORNER
    cudaAnchor.x = halfFilter;
    cudaAnchor.y = halfFilter;
#else
    cudaAnchor.x = 0;
    cudaAnchor.y = 0;
#endif
    NppiSize imgSize;
    imgSize.width = imageWidth;
    imgSize.height = imageHeight;

    //ROI for CUDA.
    int left = halfFilter;
    int right = (imageWidth-1) - halfFilter;
    int top = halfFilter;
    int bot = (imageHeight-1) - halfFilter;
    NppiSize cudaROI;
    cudaROI.height  = bot - top;
    cudaROI.width   = right - left;

    //Step size.
    int step = imageWidth * bytesPerPixel;

    //Create a new "image".
    unsigned short* image = new unsigned short[totalPixels];
    for(int i=0; i<imageWidth; i++)
        for(int j=0; j<imageHeight; j++)
            image[j*imageWidth+i] = 10;

    //Allocate mem on device.
    Npp16u *dSrc, *dDst;
    Npp8u *dBuf;
    Npp32u bufferSize;

    //This call always returns a bufferSize==0.  That doesn't seem right...
    success1 = nppiFilterMedianGetBufferSize_16u_C1R(cudaROI, cudaMask, &bufferSize);
    std::cout << "get bufferSize returned: " << (int)success1 << std::endl;
    std::cout << bufferSize << std::endl;
    success2 = cudaMalloc( (void**)&dBuf, bufferSize);
    std::cout << "cudaMalloc 1 returned: " << (int)success2 << std::endl;
    success2 = cudaMalloc( (void**)&dSrc, totalPixels*sizeof(Npp16u));
    std::cout << "cudaMalloc 2 returned: " << (int)success2 << std::endl;
    success2 = cudaMalloc( (void**)&dDst, totalPixels*sizeof(Npp16u));
    std::cout << "cudaMalloc 3 returned: " << (int)success2 << std::endl;

    //Copy host image to device.
    success2 = cudaMemcpy( dSrc, image, totalPixels*sizeof(Npp16u), cudaMemcpyHostToDevice);
    std::cout << "cudaMemcpy 1 returned: " << (int)success2 << std::endl;


    //Copy source to destination.
    success1 = nppiCopy_16u_C1R( dSrc, step, dDst, step, imgSize);
    std::cout << "npp Copy 1 returned: " << (int)success1 << std::endl;


    //Filter.
#ifndef ANCHOR_CORNER
    Npp32u offset = top*imageWidth + left;
#else
    Npp32u offset = 0;
#endif
    success1 = nppiFilterMedian_16u_C1R(    dSrc + offset,
                                            step,
                                            dDst + offset,
                                            step,
                                            cudaROI, cudaMask, cudaAnchor, dBuf);
    std::cout << "npp Filter  returned: " << (int)success1 << std::endl;


    //Copy resultant back to host.
    success2 = cudaMemcpy( image, dDst, totalPixels*sizeof(Npp16u), cudaMemcpyDeviceToHost);
    std::cout << "cudaMemcpy 2 returned: " << (int)success2 << std::endl;

    //Clean.
    success2 = cudaFree(dDst);
    success2 = cudaFree(dBuf);
    success2 = cudaFree(dSrc);
    delete image;

    return 0;

}
$ nvcc -o t1225 t1225.cu -lnppi
$ cuda-memcheck ./t1225
========= CUDA-MEMCHECK
get bufferSize returned: 0
0
cudaMalloc 1 returned: 0
cudaMalloc 2 returned: 0
cudaMalloc 3 returned: 0
cudaMemcpy 1 returned: 0
npp Copy 1 returned: 0
npp Filter  returned: 0
cudaMemcpy 2 returned: 0
========= ERROR SUMMARY: 0 errors
$ nvcc -DANCHOR_CORNER -o t1225 t1225.cu -lnppi
$ cuda-memcheck ./t1225
========= CUDA-MEMCHECK
get bufferSize returned: 0
0
cudaMalloc 1 returned: 0
cudaMalloc 2 returned: 0
cudaMalloc 3 returned: 0
cudaMemcpy 1 returned: 0
npp Copy 1 returned: 0
npp Filter  returned: 0
cudaMemcpy 2 returned: 0
========= ERROR SUMMARY: 0 errors