cuda-memcheck 报告 nppiFilterGauss_8u_C1R 越界
cuda-memcheck report nppiFilterGauss_8u_C1R out of bounds
我想用nppiFilterGauss_8u_C1R,但是用cuda-memcheck时总是报越界,这是我的源码:
// Reproduction from the question: runs the 3x3 Gauss filter (at most twice) on a
// 352*288-byte buffer that is described to NPP as ONE row of 352*288 pixels.
// p1/p2 come from nppiMalloc_8u_C1 and are only used by the commented-out call;
// p3/p4 come from cudaMalloc and are the buffers actually filtered.
Npp8u* p1 = NULL;
Npp8u* p2 = NULL;
unsigned char* p3 = NULL;
unsigned char* p4 = NULL;
int step1 = 0;
int step2 = 0;
NppiSize roi;
// ROI = a single row that spans the entire allocation (no border left around it).
roi.width = 352*288;
roi.height = 1;
int ret = 0;
// step1/step2 receive the row pitch chosen by NPP (512 in the log of this post).
p1 = nppiMalloc_8u_C1(352, 288, &step1);
p2 = nppiMalloc_8u_C1(352, 288, &step2);
// Return values of cudaMalloc are not checked here.
cudaMalloc((void**)&p3, 352*288);
cudaMalloc((void**)&p4, 352*288);
// NOTE(review): %x expects an unsigned int; %p is the portable format for pointers.
printf("p1[%x],p2[%x],p3[%x],p4[%x]\n", p1, p2, p3, p4);
printf("step1[%d]\n", step1);
printf("step2[%d]\n", step2);
int count = 1;
// Runs the filter for count = 1 and 2, stopping early on a non-zero status.
while(count < 3) {
// ret = nppiFilterGauss_8u_C1R(p1, step1, p2, step2, roi, NPP_MASK_SIZE_3_X_3);
// Source p3 and destination p4 both use a step of 352*288 bytes, and the ROI covers
// the whole allocation. Per the explanation in this post, a 3x3 mask also reads the
// neighbours around every ROI pixel, so those reads land outside the p3 buffer.
// NOTE(review): the address reported by cuda-memcheck, 0xab8c3800, equals
// p3 (0xab892000) + 2 * 352*288 (0x18c00 each), i.e. p3 + step + roi.width, which is
// consistent with a read of the lower-right neighbour of the last pixel.
ret = nppiFilterGauss_8u_C1R(p3, 352*288, p4, 352*288, roi, NPP_MASK_SIZE_3_X_3);
printf("count[%d],ret[%d]\n", count, ret);
if(ret) {
break;
}
count++;
}
// Release all four device buffers with the deallocator matching their allocator.
nppiFree(p1);
nppiFree(p2);
cudaFree(p3);
cudaFree(p4);
这里是错误:
GPU Device 0: "GK20A" with compute capability 3.2
p1[ab84a000],p2[ab86e000],p3[ab892000],p4[ab8aac00]
step1[512]
step2[512]
count[1],ret[0]
count[2],ret[0]
========= CUDA-MEMCHECK
========= Invalid __global__ read of size 1
========= at 0x00000448 in void ForEachTupleByteQuad<unsigned char, int=1, TupleByteQuadFunctor<unsigned char, int=1, FilterGauss3x3QuadNew<unsigned char, int=1>>>(Tuple8<unsigned char, int=1>*, int, NppiSize, unsigned char)
========= by thread (31,0,0) in block (395,0,0)
========= Address 0xab8c3800 is out of bounds
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= ERROR SUMMARY: 5 errors*
有人可以解释一下正确的方法吗?
应用掩码大小为 3x3 的高斯滤波器时,除了当前像素本身,还需要读取它上、下、左、右(以及对角方向)相邻的像素。这意味着在计算像素 (0,0) 的值时,实际上会读取像素 (-1,-1)。在您的代码中,ROI 覆盖了整个缓冲区,因此边缘像素的相邻像素落在已分配的内存之外,cuda-memcheck 才会报告越界读取。为避免这种情况,您需要缩小 ROI,或者使用能够自动正确处理边界的 NPP 函数。
基于您的代码,修改后的写法如下:
Npp8u* p1 = NULL;
Npp8u* p2 = NULL;
int step1 = 0;
int step2 = 0;
NppiSize roi;
roi.width = 352 - 2; //roi is two pixels smaller: one pixel removed left, one on right side
roi.height = 288 - 2; //same for height
int ret = 0;
p1 = nppiMalloc_8u_C1(352, 288, &step1); //use nppiMalloc and not cudaMalloc for best performance
p2 = nppiMalloc_8u_C1(352, 288, &step2); //(NPP uses internal heuristics which step size suits best...)
printf("p1[%x],p2[%x]\n", p1, p2);
printf("step1[%d]\n", step1);
printf("step2[%d]\n", step2);
int count = 1;
while (count < 3) {
//move pointers from pixel (0,0) to pixel (1,1) = add one line step plus one, roi is two pixels smaller:
ret = nppiFilterGauss_8u_C1R(p1 + step1 + 1, step1, p2 + step2 + 1, step2, roi, NPP_MASK_SIZE_3_X_3);
printf("count[%d],ret[%d]\n", count, ret);
if (ret) {
break;
}
count++;
}
//Or use NPP function including border handling:
NppiPoint srcPoint;
srcPoint.x = 0;
srcPoint.y = 0;
roi.width = 352;
roi.height = 288;
ret = nppiFilterGaussBorder_8u_C1R(p1, step1, roi, srcPoint, p2, step2, roi, NPP_MASK_SIZE_3_X_3, NPP_BORDER_REPLICATE);
nppiFree(p1);
nppiFree(p2);
这段代码可以顺利通过 cuda-memcheck 的检查。
我想用nppiFilterGauss_8u_C1R,但是用cuda-memcheck时总是报越界,这是我的源码:
// Reproduction from the question: runs the 3x3 Gauss filter (at most twice) on a
// 352*288-byte buffer that is described to NPP as ONE row of 352*288 pixels.
// p1/p2 come from nppiMalloc_8u_C1 and are only used by the commented-out call;
// p3/p4 come from cudaMalloc and are the buffers actually filtered.
Npp8u* p1 = NULL;
Npp8u* p2 = NULL;
unsigned char* p3 = NULL;
unsigned char* p4 = NULL;
int step1 = 0;
int step2 = 0;
NppiSize roi;
// ROI = a single row that spans the entire allocation (no border left around it).
roi.width = 352*288;
roi.height = 1;
int ret = 0;
// step1/step2 receive the row pitch chosen by NPP (512 in the log of this post).
p1 = nppiMalloc_8u_C1(352, 288, &step1);
p2 = nppiMalloc_8u_C1(352, 288, &step2);
// Return values of cudaMalloc are not checked here.
cudaMalloc((void**)&p3, 352*288);
cudaMalloc((void**)&p4, 352*288);
// NOTE(review): %x expects an unsigned int; %p is the portable format for pointers.
printf("p1[%x],p2[%x],p3[%x],p4[%x]\n", p1, p2, p3, p4);
printf("step1[%d]\n", step1);
printf("step2[%d]\n", step2);
int count = 1;
// Runs the filter for count = 1 and 2, stopping early on a non-zero status.
while(count < 3) {
// ret = nppiFilterGauss_8u_C1R(p1, step1, p2, step2, roi, NPP_MASK_SIZE_3_X_3);
// Source p3 and destination p4 both use a step of 352*288 bytes, and the ROI covers
// the whole allocation. Per the explanation in this post, a 3x3 mask also reads the
// neighbours around every ROI pixel, so those reads land outside the p3 buffer.
// NOTE(review): the address reported by cuda-memcheck, 0xab8c3800, equals
// p3 (0xab892000) + 2 * 352*288 (0x18c00 each), i.e. p3 + step + roi.width, which is
// consistent with a read of the lower-right neighbour of the last pixel.
ret = nppiFilterGauss_8u_C1R(p3, 352*288, p4, 352*288, roi, NPP_MASK_SIZE_3_X_3);
printf("count[%d],ret[%d]\n", count, ret);
if(ret) {
break;
}
count++;
}
// Release all four device buffers with the deallocator matching their allocator.
nppiFree(p1);
nppiFree(p2);
cudaFree(p3);
cudaFree(p4);
这里是错误:
GPU Device 0: "GK20A" with compute capability 3.2
p1[ab84a000],p2[ab86e000],p3[ab892000],p4[ab8aac00]
step1[512]
step2[512]
count[1],ret[0]
count[2],ret[0]
========= CUDA-MEMCHECK
========= Invalid __global__ read of size 1
========= at 0x00000448 in void ForEachTupleByteQuad<unsigned char, int=1, TupleByteQuadFunctor<unsigned char, int=1, FilterGauss3x3QuadNew<unsigned char, int=1>>>(Tuple8<unsigned char, int=1>*, int, NppiSize, unsigned char)
========= by thread (31,0,0) in block (395,0,0)
========= Address 0xab8c3800 is out of bounds
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========
========= ERROR SUMMARY: 5 errors*
有人可以解释一下正确的方法吗?
应用掩码大小为 3x3 的高斯滤波器时,除了当前像素本身,还需要读取它上、下、左、右(以及对角方向)相邻的像素。这意味着在计算像素 (0,0) 的值时,实际上会读取像素 (-1,-1)。在您的代码中,ROI 覆盖了整个缓冲区,因此边缘像素的相邻像素落在已分配的内存之外,cuda-memcheck 才会报告越界读取。为避免这种情况,您需要缩小 ROI,或者使用能够自动正确处理边界的 NPP 函数。
基于您的代码,修改后的写法如下:
Npp8u* p1 = NULL;
Npp8u* p2 = NULL;
int step1 = 0;
int step2 = 0;
NppiSize roi;
roi.width = 352 - 2; //roi is two pixels smaller: one pixel removed left, one on right side
roi.height = 288 - 2; //same for height
int ret = 0;
p1 = nppiMalloc_8u_C1(352, 288, &step1); //use nppiMalloc and not cudaMalloc for best performance
p2 = nppiMalloc_8u_C1(352, 288, &step2); //(NPP uses internal heuristics which step size suits best...)
printf("p1[%x],p2[%x]\n", p1, p2);
printf("step1[%d]\n", step1);
printf("step2[%d]\n", step2);
int count = 1;
while (count < 3) {
//move pointers from pixel (0,0) to pixel (1,1) = add one line step plus one, roi is two pixels smaller:
ret = nppiFilterGauss_8u_C1R(p1 + step1 + 1, step1, p2 + step2 + 1, step2, roi, NPP_MASK_SIZE_3_X_3);
printf("count[%d],ret[%d]\n", count, ret);
if (ret) {
break;
}
count++;
}
//Or use NPP function including border handling:
NppiPoint srcPoint;
srcPoint.x = 0;
srcPoint.y = 0;
roi.width = 352;
roi.height = 288;
ret = nppiFilterGaussBorder_8u_C1R(p1, step1, roi, srcPoint, p2, step2, roi, NPP_MASK_SIZE_3_X_3, NPP_BORDER_REPLICATE);
nppiFree(p1);
nppiFree(p2);
这段代码可以顺利通过 cuda-memcheck 的检查。