NPP image cross correlation producing no valid results
I'm trying to implement an image displacement method using the NPP cross-correlation library.
As a first step, I tried to create a simple test case by generating a couple of simple images in memory, outputting Npp8u or Npp32f arrays. However, the cross-correlation library produces either nonsensical or invalid (i.e. NAN) results.
int main(int argc, char* argv[])
{
    Npp8u* gpuImg1, * gpuImg2;
    Npp32f *gpuDest;
    cudaDeviceInit(argc, (const char**)argv);
    long dataSize1 = 128;
    auto err = cudaMalloc((void**)&gpuImg1, dataSize1 * dataSize1 * sizeof(unsigned char));
    auto img1Data = static_cast<unsigned char*>(malloc(dataSize1 * dataSize1 * sizeof(unsigned char)));
    memset(img1Data, 0, dataSize1 * dataSize1);
    for(auto y = 40; y < 60; y++)
    {
        for(auto x = 20; x < 40; x++)
        {
            img1Data[y * dataSize1 + x] = 0xff;
        }
    }
    long dataSize2 = 64;
    err = cudaMalloc((void**)&gpuImg2, dataSize2);
    auto img2data = static_cast<unsigned char*>(malloc(dataSize2 * dataSize2 * sizeof(unsigned char)));
    memset(img2data, 0, dataSize2 * dataSize2);
    for (auto y = 10; y < 30; y++)
    {
        for (auto x = 20; x < 40; x++)
        {
            img2data[y * dataSize2 + x] = 0xff;
        }
    }
    auto resSize = (dataSize1 - dataSize2) + 1;
    err = cudaMalloc((void**)&gpuDest, resSize * resSize * sizeof(Npp32f));
    auto resData = static_cast<Npp32f*>(malloc(resSize * resSize * sizeof(Npp32f)));
    NppiSize nppiSize1;
    nppiSize1.height = dataSize1;
    nppiSize1.width = dataSize1;
    NppiSize nppiSize2;
    nppiSize2.height = dataSize2;
    nppiSize2.width = dataSize2;
    err = cudaMemcpy(gpuImg1, img1Data, dataSize1, cudaMemcpyHostToDevice);
    err = cudaMemcpy(gpuImg2, img2data, dataSize2, cudaMemcpyHostToDevice);
    auto status = nppiCrossCorrValid_Norm_8u32f_C1R(gpuImg1, dataSize1, nppiSize1, gpuImg2, dataSize2, nppiSize2, gpuDest, resSize * sizeof(Npp32f));
    err = cudaMemcpy(resData, gpuDest, resSize * resSize * sizeof(Npp8u), cudaMemcpyDeviceToHost);
}
Both the CUDA calls and the NPP call return success codes, so I'm fairly sure it's something I'm doing wrong when setting up the cross-correlation. Can anyone point me in the right direction towards a solution?
There appear to be two categories of problems with your code.
First, you have various issues with data sizes. I'm not sure where the disconnect is, since you got some of them right, so I'll just point out what I see.
This construct is correct:
auto err = cudaMalloc((void**)&gpuImg1, dataSize1 * dataSize1 * sizeof(unsigned char));
You should do the same thing here:
err = cudaMalloc((void**)&gpuImg2, dataSize2);
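In other words, the allocation for the second image should also cover the full width times height in bytes; this is exactly what the corrected listing further down does:

err = cudaMalloc((void**)&gpuImg2, dataSize2 * dataSize2 * sizeof(unsigned char));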
Neither of these is correct. cudaMemcpy, like memcpy and like cudaMalloc, takes a size parameter in bytes:
err = cudaMemcpy(gpuImg1, img1Data, dataSize1, cudaMemcpyHostToDevice);
err = cudaMemcpy(gpuImg2, img2data, dataSize2, cudaMemcpyHostToDevice);
                                    ^^^^^^^^^
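With the full byte counts (as in the corrected listing below), those copies become:

err = cudaMemcpy(gpuImg1, img1Data, dataSize1 * dataSize1 * sizeof(unsigned char), cudaMemcpyHostToDevice);
err = cudaMemcpy(gpuImg2, img2data, dataSize2 * dataSize2 * sizeof(unsigned char), cudaMemcpyHostToDevice);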
You've got the subsequent device-to-host copy nearly correct, except that your sizeof should use the correct type:
err = cudaMemcpy(resData, gpuDest, resSize * resSize * sizeof(Npp8u), cudaMemcpyDeviceToHost);
                                                              ^^^^^
Second, you are using the normalized version of cross-correlation. If you study the documentation, I believe you will find that when a large portion of your image is zero-valued, the denominator can compute to the square root of zero. In any event, when I convert the "background" from 0 to 1, I get sensible results. Another option would be to switch to the non-normalized version of the function (nppiCrossCorrValid_8u32f_C1R), which also produces non-NAN results even with a large zero-valued "background".
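For intuition, the usual textbook form of valid normalized cross-correlation divides the raw correlation by the geometric mean of the two windowed energies; the sketch below uses that definition (check the NPP documentation for the exact formula it implements):

$$R(x,y) \;=\; \frac{\sum_{i,j} S(x+i,\,y+j)\,T(i,j)}{\sqrt{\sum_{i,j} S(x+i,\,y+j)^2}\;\sqrt{\sum_{i,j} T(i,j)^2}}$$

When the portion of the source image under the template (or the template itself) is entirely zero, one of the sums in the denominator is zero and the division produces NAN; raising the background to 1 keeps the denominator strictly positive.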
Here is a corrected version, which I think will give you non-NAN results:
# cat t14.cu
#include <npp.h>
#include <iostream>
int main(int argc, char* argv[])
{
    Npp8u* gpuImg1, * gpuImg2;
    Npp32f *gpuDest;
    // cudaDeviceInit(argc, (const char**)argv);
    long dataSize1 = 128;
    auto err = cudaMalloc((void**)&gpuImg1, dataSize1 * dataSize1 * sizeof(unsigned char));
    unsigned char *img1Data = static_cast<unsigned char*>(malloc(dataSize1 * dataSize1 * sizeof(unsigned char)));
    memset(img1Data, 1, dataSize1 * dataSize1);
    for(auto y = 40; y < 60; y++)
    {
        for(auto x = 20; x < 40; x++)
        {
            img1Data[y * dataSize1 + x] = 0xff;
        }
    }
    long dataSize2 = 64;
    err = cudaMalloc((void**)&gpuImg2, dataSize2*dataSize2 *sizeof(unsigned char));
    unsigned char *img2data = static_cast<unsigned char*>(malloc(dataSize2 * dataSize2 * sizeof(unsigned char)));
    memset(img2data, 1, dataSize2 * dataSize2);
    for (auto y = 10; y < 30; y++)
    {
        for (auto x = 20; x < 40; x++)
        {
            img2data[y * dataSize2 + x] = 0xff;
        }
    }
    auto resSize = (dataSize1 - dataSize2) + 1;
    err = cudaMalloc((void**)&gpuDest, resSize * resSize * sizeof(Npp32f));
    auto resData = static_cast<Npp32f*>(malloc(resSize * resSize * sizeof(Npp32f)));
    NppiSize nppiSize1;
    nppiSize1.height = dataSize1;
    nppiSize1.width = dataSize1;
    NppiSize nppiSize2;
    nppiSize2.height = dataSize2;
    nppiSize2.width = dataSize2;
    err = cudaMemcpy(gpuImg1, img1Data, dataSize1*dataSize1*sizeof(unsigned char), cudaMemcpyHostToDevice);
    err = cudaMemcpy(gpuImg2, img2data, dataSize2*dataSize2*sizeof(unsigned char), cudaMemcpyHostToDevice);
    auto status = nppiCrossCorrValid_Norm_8u32f_C1R(gpuImg1, dataSize1, nppiSize1, gpuImg2, dataSize2, nppiSize2, gpuDest, resSize * sizeof(Npp32f));
    err = cudaMemcpy(resData, gpuDest, resSize * resSize * sizeof(Npp32f), cudaMemcpyDeviceToHost);
    for (int i = 0; i < resSize*2; i++)
        std::cout << resData[i] << ",";
    std::cout << std::endl;
}
# nvcc -std=c++11 -o t14 t14.cu -lnppc -lnppist
# cuda-memcheck ./t14
========= CUDA-MEMCHECK
0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00797587,0.00798853,0.00800826,0.00803633,0.00807432,0.00812423,0.00818861,0.00827071,0.00837505,0.00850754,0.00867648,0.00889385,0.00917761,0.00955609,0.0100771,0.0108291,0.0119988,0.0140744,0.0190166,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796926,0.00796926,0.00796926,0.00796926,0.00797588,0.00798854,0.00800827,0.00803634,0.00807434,0.00812425,0.00818863,0.00827071,0.00837505,0.00850754,0.00867648,0.00889385,0.00917761,0.00955609,0.0100771,0.0108291,0.0119988,0.0140744,0.0190166,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,
========= ERROR SUMMARY: 0 errors
#
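One last note: since both the CUDA and the NPP calls in the original code returned success, it's worth actually inspecting those return codes in a test like this instead of discarding them into err/status. A minimal sketch of what that could look like (the checkCuda/checkNpp helper names are just illustrative):

#include <cstdio>
#include <cstdlib>

// abort with a message if a CUDA runtime call fails
static void checkCuda(cudaError_t e, const char *what)
{
    if (e != cudaSuccess) { fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(e)); exit(1); }
}

// NPP_SUCCESS is 0; negative values are errors, positive values are warnings
static void checkNpp(NppStatus s, const char *what)
{
    if (s != NPP_SUCCESS) { fprintf(stderr, "%s: NppStatus %d\n", what, (int)s); exit(1); }
}

// usage:
// checkCuda(cudaMemcpy(gpuImg1, img1Data, dataSize1*dataSize1, cudaMemcpyHostToDevice), "H2D copy img1");
// checkNpp(nppiCrossCorrValid_Norm_8u32f_C1R(gpuImg1, dataSize1, nppiSize1, gpuImg2, dataSize2, nppiSize2, gpuDest, resSize * sizeof(Npp32f)), "CrossCorrValid_Norm");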