带有 NPP LabelMarkers 的 Cuda 边界框

Cuda Bounding Box with NPP LabelMarkers

我正在尝试使用 cuda 库为我的输入数据找到边界框。 我从一个数据集开始,该数据集有噪声(可能还有一些归零的单元格),数据区域远高于噪声。

首先,我使用 nppiFilterGauss_32f_C1R 对我的数据应用高斯模糊。

然后我用 nppiCompareC_32f_C1R 阈值来创建二值图像。

在此之后,我使用 nppiLabelMarkers_8u32u_C1R 为每个区域创建一个独特的标签。

至此,我的结果如我所料。我留下了一个数据集,每个 "blob" 都有唯一的值(尽管数字之间有数字间隙)。

我一直在网上查找,但似乎找不到可以在 GPU 上找到标记组件的边界框的库。

我能够使用 OpenCV 的 findContours 和 boundingRect 实现完整流程,但这些工作是在 CPU 上进行的,无法跟上我的数据速率。

是否有我缺少的 cuda 函数可以为我提供每个标记的 blob 的边界框参数?

谢谢!

在标签标记操作之后,如果我们接着压缩标签标记(compress the label markers),我们就可以在一个简单的 CUDA 内核中使用 atomicMax 和 atomicMin,实现一种相当简单的边界框识别方法。

这是一个有效的例子:

$ cat t1461.cu
#include <stdio.h>
#include <nppi_filtering_functions.h>
#include <assert.h>
#define WIDTH 16
#define HEIGHT 16
// Pretty-print a w x h row-major image of 16-bit values stored in host memory.
// Foreground pixels (value 255) print as '*'; every other value prints as its
// numeric label in a 3-character column, one image row per output line.
void my_print(Npp16u *data, int w, int h){
  for (int i = 0; i < h; i++)
    {
    for (int j = 0; j < w; j++)
      {
      if (data[i*w+j] == 255) printf("  *");
      else printf("%3hu", data[i * w + j]);  // Npp16u is unsigned: %hu, not %hd
      }
    printf("\n");
  }

}

// Per-label bounding boxes for a (compressed) label image.
//
// img is a row-major height x width image whose non-zero pixels carry 1-based
// labels; slot k of each output array accumulates the extents of label k+1.
// The caller pre-initializes maxima to 0 and minima to width/height (see main),
// and updates are done with atomics, so any 2D grid covering the image works.
template <typename T>
__global__ void bb(const T * __restrict__ img, int * __restrict__ maxh, int * __restrict__ minh, int * __restrict__ maxw, int * __restrict__ minw, int height, int width){

  const int col = blockIdx.x * blockDim.x + threadIdx.x;
  const int row = blockIdx.y * blockDim.y + threadIdx.y;
  if ((col >= width) || (row >= height)) return;  // grid-tail guard
  const T label = img[row * width + col];
  if (label > 0){                                 // 0 = background, skip
    const int slot = (int)label - 1;              // labels are 1-based
    atomicMax(&maxw[slot], col);
    atomicMin(&minw[slot], col);
    atomicMax(&maxh[slot], row);
    atomicMin(&minh[slot], row);
  }
}

// Demo: NPP connected-component labeling + label compression on a 16x16 test
// image, followed by the custom 'bb' kernel that finds per-blob bounding boxes
// via atomicMin/atomicMax. Error handling is assert-only (demo quality).
int main(){
// 16x16 binary test image, row-major: 255 = foreground, 0 = background.
Npp16u host_src[WIDTH * HEIGHT] =
{
0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255, 0, 0,255,255,255,255, 0, 0,255, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0,255,255,255,
0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255,
0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
0, 255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,
0, 255,255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,
0, 0,255,255,255, 0,255,255,255,255,255, 0, 0, 0, 0, 0,
0, 0, 0,255, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255,
0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

  // Upload the image; labeling and compression then run in place on this buffer.
  Npp16u * device_src;
  cudaMalloc((void**)&device_src, sizeof(Npp16u) * WIDTH * HEIGHT);
  cudaMemcpy(device_src, host_src, sizeof(Npp16u) * WIDTH * HEIGHT, cudaMemcpyHostToDevice);

  // Query and allocate the scratch buffer required by the labeling pass.
  int buffer_size;
  NppiSize source_roi = { WIDTH, HEIGHT };
  NppStatus e = nppiLabelMarkersGetBufferSize_16u_C1R(source_roi, &buffer_size);
  assert(e == NPP_NO_ERROR);
  Npp8u * buffer;
  cudaMalloc((void**)&buffer, buffer_size);

  // In-place connected-component labeling: pixels above threshold 1 are
  // foreground; nppiNormInf selects 8-way connectivity. 'max' receives the
  // highest label assigned (label values may have gaps at this point).
  int max;
  e = nppiLabelMarkers_16u_C1IR(device_src, sizeof(Npp16u) * WIDTH, source_roi, (Npp16u)1, nppiNormInf, &max, buffer);
  assert(e == NPP_NO_ERROR);
  printf("initial max: %d\n", max);
  // Compress the sparse labels to a contiguous 1..N range so they can be used
  // directly as indices into the bounding-box arrays below.
  // NOTE(review): the size query is passed 1 as its starting-number argument;
  // NPP documents this parameter as the largest label value from the previous
  // pass — confirm whether 'max' should be passed here instead.
  int bs;
  e = nppiCompressMarkerLabelsGetBufferSize_16u_C1R (1, &bs);
  assert(e == NPP_NO_ERROR);
  // Reallocate the scratch buffer only if compression needs more space.
  if (bs>buffer_size){
    buffer_size = bs;
    cudaFree(buffer);
    cudaMalloc(&buffer, buffer_size);}
  e = nppiCompressMarkerLabels_16u_C1IR(device_src, sizeof(Npp16u)*WIDTH, source_roi, max, &max, buffer);
  assert(e == NPP_NO_ERROR);
  // Per-label bounding-box extents, host and device copies ('max' labels).
  int *maxw, *maxh, *minw, *minh, *d_maxw, *d_maxh, *d_minw, *d_minh;
  maxw = new int[max];
  maxh = new int[max];
  minw = new int[max];
  minh = new int[max];
  cudaMalloc(&d_maxw, max*sizeof(int));
  cudaMalloc(&d_maxh, max*sizeof(int));
  cudaMalloc(&d_minw, max*sizeof(int));
  cudaMalloc(&d_minh, max*sizeof(int));
  // Initialize maxima to 0 and minima to the image extent so the kernel's
  // atomicMax/atomicMin always improve on the starting value.
  for (int i = 0; i < max; i++){
    maxw[i] = 0;
    maxh[i] = 0;
    minw[i] = WIDTH;
    minh[i] = HEIGHT;}
  cudaMemcpy(d_maxw, maxw, max*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_maxh, maxh, max*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_minw, minw, max*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_minh, minh, max*sizeof(int), cudaMemcpyHostToDevice);
  // One thread per pixel: 32x32 blocks with a ceiling-divide grid cover the
  // whole image (the kernel bounds-checks the tail).
  dim3 block(32,32);
  dim3 grid((WIDTH+block.x-1)/block.x, (HEIGHT+block.y-1)/block.y);
  bb<<<grid, block>>>(device_src, d_maxh, d_minh, d_maxw, d_minw, HEIGHT, WIDTH);
  // These blocking copies also synchronize with the kernel launch above.
  cudaMemcpy(maxw, d_maxw, max*sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(maxh, d_maxh, max*sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(minw, d_minw, max*sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(minh, d_minh, max*sizeof(int), cudaMemcpyDeviceToHost);

  // Bring back the compressed label image for display.
  Npp16u *dst = new Npp16u[WIDTH * HEIGHT];
  cudaMemcpy(dst, device_src, sizeof(Npp16u) * WIDTH * HEIGHT, cudaMemcpyDeviceToHost);

  printf("*******INPUT************\n");
  my_print(host_src, WIDTH, HEIGHT);
  printf("******OUTPUT************\n");
  my_print(dst, WIDTH,HEIGHT);
  printf("compressed max: %d\n", max);
  printf("bounding boxes:\n");
  // Labels are 1-based: array slot i holds the box for label i+1.
  for (int i = 0; i < max; i++)
    printf("label %d, maxh: %d, minh: %d, maxw: %d, minw: %d\n", i+1, maxh[i], minh[i], maxw[i], minw[i]);
  // NOTE(review): device and host allocations are not freed — acceptable for a
  // demo that exits immediately; real code should cudaFree/delete[] them.
}
$ nvcc -o t1461 t1461.cu -lnppif
$ cuda-memcheck ./t1461
========= CUDA-MEMCHECK
initial max: 10
*******INPUT************
  0  0  0  0  0  0  0  0  *  0  0  0  0  0  0  0
  0  *  *  *  0  0  *  *  *  0  0  0  0  0  0  0
  0  *  *  *  0  0  *  *  *  0  0  0  0  0  0  0
  0  *  *  *  0  0  *  *  *  *  0  0  *  0  0  0
  0  0  0  0  0  0  0  *  *  *  0  0  0  *  *  *
  0  0  0  0  0  0  0  0  *  0  0  0  0  *  *  *
  0  0  0  *  0  0  0  0  0  0  0  0  0  *  *  *
  0  *  *  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  *  *  *  0  0  0  0  *  0  0  0  0  0  0  0
  0  *  *  *  *  0  0  *  *  *  0  0  0  0  0  0
  0  0  *  *  *  0  *  *  *  *  *  0  0  0  0  0
  0  0  0  *  0  0  0  *  *  *  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  *  0  0  0  0  *  *  *
  0  *  *  *  0  0  0  0  0  0  0  0  0  *  *  *
  0  *  *  *  0  0  0  0  0  0  0  0  0  *  *  *
  0  *  *  *  0  0  0  0  0  0  0  0  0  0  0  0
******OUTPUT************
  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0
  0  2  2  2  0  0  1  1  1  0  0  0  0  0  0  0
  0  2  2  2  0  0  1  1  1  0  0  0  0  0  0  0
  0  2  2  2  0  0  1  1  1  1  0  0  3  0  0  0
  0  0  0  0  0  0  0  1  1  1  0  0  0  3  3  3
  0  0  0  0  0  0  0  0  1  0  0  0  0  3  3  3
  0  0  0  4  0  0  0  0  0  0  0  0  0  3  3  3
  0  4  4  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  4  4  4  0  0  0  0  5  0  0  0  0  0  0  0
  0  4  4  4  4  0  0  5  5  5  0  0  0  0  0  0
  0  0  4  4  4  0  5  5  5  5  5  0  0  0  0  0
  0  0  0  4  0  0  0  5  5  5  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  5  0  0  0  0  6  6  6
  0  7  7  7  0  0  0  0  0  0  0  0  0  6  6  6
  0  7  7  7  0  0  0  0  0  0  0  0  0  6  6  6
  0  7  7  7  0  0  0  0  0  0  0  0  0  0  0  0
compressed max: 7
bounding boxes:
label 1, maxh: 5, minh: 0, maxw: 9, minw: 6
label 2, maxh: 3, minh: 1, maxw: 3, minw: 1
label 3, maxh: 6, minh: 3, maxw: 15, minw: 12
label 4, maxh: 11, minh: 6, maxw: 4, minw: 1
label 5, maxh: 12, minh: 8, maxw: 10, minw: 6
label 6, maxh: 14, minh: 12, maxw: 15, minw: 13
label 7, maxh: 15, minh: 13, maxw: 3, minw: 1
========= ERROR SUMMARY: 0 errors
$

请注意,如果您要重复执行此操作(例如识别视频帧上的边界框),您需要让 cudaMalloc 操作大部分脱离性能循环。

一种典型的做法就是上面代码中为 buffer 所采用的方式:仅当之前分配的缓冲区太小时才释放并重新分配。对最大/最小值缓冲区(d_maxw、d_minw 等)也可以同样处理。