如何在 CUDA 中对图像进行卷积

How can I convolution image in CUDA

我对 CUDA 中的图像卷积有疑问。当我用小的 maxtrix (16*16) 测试它时,一切正常。但是对于更大的矩阵,结果总是在 I 运行 时发生变化。 我认为问题是 2 for 循环进入内核。

__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
                    int img_width, const int img_height,
                    const int kernel_width, const int kernel_height )
{

    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    float sum = 0;
    for ( int j = 0; j < kernel_height; j++ )
    {
        for ( int i = 0; i < kernel_width; i++ )
        {
            int dX = x + i - kernel_width / 2;
            int dY = y + j - kernel_height / 2;

            if ( dX < 0 )
                dX = 0;

            if ( dX >= img_width )
                dX = img_width - 1;

            if ( dY < 0 )
                dY = 0;

            if ( dY >= img_height )
                dY = img_height - 1;


            const int idMat = j * kernel_width + i;
            const int idPixel = dY * img_width + dX;
            sum += (float)input[idPixel] * kernelConv[idMat];
        }
    }

    const int idOut = y * img_width + x;
    out[idOut] = abs(sum);

}

  void image_convolution(float * input,float* output, int img_height, int img_width)
{
    int kernel_height = 3;
    int kernel_width = 3;
    float kernel[] ={ 0,-0.25,0,
                     -0.25,1,-0.25,
                      0,-0.25,0
                    };
    float * mask = new float[kernel_height*kernel_width];
    for (int i = 0; i < kernel_height*kernel_width; i++)
    {
        mask[i] = kernel[i];
    }

    float * d_input, * d_output, * d_kernel;
    cudaMalloc(&d_input, img_width*img_height*sizeof(float));
    cudaMalloc(&d_output, img_width*img_height*sizeof(float));
    cudaMalloc(&d_kernel, kernel_height*kernel_width*sizeof(float));

    cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_kernel, mask, kernel_height*kernel_width*sizeof(float), cudaMemcpyHostToDevice);
    dim3 blocksize(16,16);
    dim3 gridsize;
    gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
    gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
    image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height);
    cudaMemcpy(output, d_output, img_width*img_height*sizeof(float), cudaMemcpyDeviceToHost);

    for (int i=0; i < img_width*img_height; i++)
    {
         printf("%d, ",(int)output[i]);
    }
    printf("\n\n");
}

这是我的结果,我用 24*24 的图像测试它,我 运行 它 2 次,我还写了简单的函数来比较输出。

这是我比较输出时的结果,有 32 个不同,索引为 240、241 ....

您在程序中犯了一个相当常见的错误。当您创建这样的线程网格时:

dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;

您有意在每个维度中创建(通常)额外 个线程,以便完全覆盖问题space(即图像大小)。这没有错。

但是,这意味着我们将启动 额外的 个线程,这些线程超出了有效的图像尺寸。我们必须确保这些线程什么都不做。通常的做法是在内核中添加线程检查,使有效图像尺寸之外的线程什么都不做。这是一个修改后的内核和完整工作的示例,显示了该更改:

$ cat t1219.cu
#include <iostream>
#include <cstdlib>

const int iw = 1025;
const int ih = 1025;
const int rng = 10;

__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv,
                    int img_width, const int img_height,
                    const int kernel_width, const int kernel_height )
{

    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if ((x < img_width) && (y < img_height)){  // thread check
      float sum = 0;
      for ( int j = 0; j < kernel_height; j++ )
      {
        for ( int i = 0; i < kernel_width; i++ )
        {
            int dX = x + i - kernel_width / 2;
            int dY = y + j - kernel_height / 2;

            if ( dX < 0 )
                dX = 0;

            if ( dX >= img_width )
                dX = img_width - 1;

            if ( dY < 0 )
                dY = 0;

            if ( dY >= img_height )
                dY = img_height - 1;


            const int idMat = j * kernel_width + i;
            const int idPixel = dY * img_width + dX;
            sum += (float)input[idPixel] * kernelConv[idMat];
        }
      }

      const int idOut = y * img_width + x;
      out[idOut] = abs(sum);
    }

}

  void image_convolution(float * input,float* output, int img_height, int img_width)
{
    int kernel_height = 3;
    int kernel_width = 3;
    float kernel[] ={ 0,-0.25,0,
                     -0.25,1,-0.25,
                      0,-0.25,0
                    };
    float * mask = new float[kernel_height*kernel_width];
    for (int i = 0; i < kernel_height*kernel_width; i++)
    {
        mask[i] = kernel[i];
    }

    float * d_input, * d_output, * d_kernel;
    cudaMalloc(&d_input, img_width*img_height*sizeof(float));
    cudaMalloc(&d_output, img_width*img_height*sizeof(float));
    cudaMalloc(&d_kernel, kernel_height*kernel_width*sizeof(float));

    cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_kernel, mask, kernel_height*kernel_width*sizeof(float), cudaMemcpyHostToDevice);
    dim3 blocksize(16,16);
    dim3 gridsize;
    gridsize.x=(img_width+blocksize.x-1)/blocksize.x;
    gridsize.y=(img_height+blocksize.y-1)/blocksize.y;
    image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height);
    cudaMemcpy(output, d_output, img_width*img_height*sizeof(float), cudaMemcpyDeviceToHost);
}

int main(){

  float *in, *out;
  int is = ih*iw;
  in  = new float[is];
  out = new float[is];
  for (int i = 0; i < is; i++) {in[i] = rand()%rng; out[i] = -1;}
  image_convolution(in,out, ih, iw);
  for (int iy = 1; iy < ih-1; iy++)
    for (int ix = 1; ix < iw-1; ix++){
      float temp = abs(-0.25 * (in[iy*iw + ix -1] + in[iy*iw + ix +1] + in[(iy-1)*iw + ix] + in[(iy+1)*iw + ix]) + in[iy*iw+ix]);
      if (out[iy*iw+ix] != temp) {std::cout << "mismatch x: " << ix << " y: " << iy << " was: " << out[iy*iw+ix] << " should be: " << temp << std::endl; return 1;}}
  return 0;
}
$ nvcc -o t1219 t1219.cu
$ cuda-memcheck ./t1219
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

对于块大小 (16,16) 的精确倍数的图像尺寸(我之前的测试用例也是如此),这个问题不会出现——代码会正常工作。对于所有其他测试用例,我们需要这样的线程检查。