尝试使用 FFT 卷积时不支持 cuDNN 状态

cuDNN Status Not Supported when trying to use FFT Convolution

我正在尝试使用 cuDNN 库进行 FFT 卷积。当我使用Winograd卷积/选择最快卷积方法的cuDNN方法时,代码运行s,但是当我尝试运行使用FFT卷积方法时,它不起作用。

我自己把前向方法设置为FFT卷积。

我检查了文档,我的输入是 FFT 卷积所需的 NCHW 格式。来自文档:

CUDNN_CONVOLUTION_FWD_ALGO_FFT
xDesc Format Support: NCHW HW-packed
yDesc Format Support: NCHW HW-packed

cudnnGetConvolutionForwardWorkspaceSize 函数调用期间发生错误"CUDNN_STATUS_NOT_SUPPORTED"。

当我使用 FFT 卷积 VS best 或 Winograd 时,发生了什么导致此错误?

作为参考,我使用的是 cuda 9.1、cuDNN 7。我在 Ubuntu 16.04 上使用以下命令进行编译:nvcc -arch=sm_35 -std=c++11 -O2 -lcudnn FFT_cuDNN.cu -o 转换 pkg-config --cflags --libs opencv; ./conv TF.png

#include <cudnn.h>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>

using namespace cv;
using namespace cv::dnn;

#define checkCUDNN(expression)                               \
  {                                                          \
    cudnnStatus_t status = (expression);                     \
    if (status != CUDNN_STATUS_SUCCESS) {                    \
      std::cerr << "Error on line " << __LINE__ << ": "      \
                << cudnnGetErrorString(status) << std::endl; \
      std::exit(EXIT_FAILURE);                               \
    }                                                        \
  }

cv::Mat load_image_NCHW(const char* image_path)
{
    cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR);
    image.convertTo(image, CV_32FC3);
    cv::normalize(image,image,0,1, cv::NORM_MINMAX);

    cv::Mat inputBlob = blobFromImage(image, 1.0f, cv::Size(image.rows,image.cols), cv::Scalar(0,0,0));
    return inputBlob;
}

void save_image(const char* output_filename,
                float* buffer,
                int height,
                int width) {
  cv::Mat output_image(height, width, CV_32FC3, buffer);
  // Make negative values zero.
  cv::threshold(output_image,
                output_image,
                /*threshold=*/0,
                /*maxval=*/0,
                cv::THRESH_TOZERO);
  cv::normalize(output_image, output_image, 0.0, 255.0, cv::NORM_MINMAX);
  output_image.convertTo(output_image, CV_8UC3);
  cv::imwrite(output_filename, output_image);
  std::cerr << "Wrote output to " << output_filename << std::endl;
}

int main(int argc, const char* argv[]) {
  if (argc < 2) {
    std::cerr << "usage: conv <image> [gpu=0] [sigmoid=0]" << std::endl;
    std::exit(EXIT_FAILURE);
  }

  int gpu_id = (argc > 2) ? std::atoi(argv[2]) : 0;
  std::cerr << "GPU: " << gpu_id << std::endl;

  bool with_sigmoid = (argc > 3) ? std::atoi(argv[3]) : 0;
  std::cerr << "With sigmoid: " << std::boolalpha << with_sigmoid << std::endl;

  // Load the image
  cv::Mat image = load_image_NCHW(argv[1]);

  int imgH = 600;
  int imgW = 561;
  int inC = 3;

  // Set GPU to use
  cudaSetDevice(gpu_id);

  // Create the cudnn Handle
  cudnnHandle_t cudnn;
  checkCUDNN(cudnnCreate(&cudnn));

  // Need a descriptor for
  // The input, kernel, and convolution

  cudnnTensorDescriptor_t input_descriptor;
  checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
  checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
                                        /*format=*/CUDNN_TENSOR_NCHW,
                                        /*dataType=*/CUDNN_DATA_FLOAT,
                                        /*batch_size=*/1,
                                        /*channels=*/inC,
                                        /*image_height=*/imgH,
                                        /*image_width=*/imgW));

  cudnnFilterDescriptor_t kernel_descriptor;
  checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
  checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
                                        /*dataType=*/CUDNN_DATA_FLOAT,
                                        /*format=*/CUDNN_TENSOR_NCHW,
                                        /*out_channels=*/3,
                                        /*in_channels=*/inC,
                                        /*kernel_height=*/3,
                                        /*kernel_width=*/3));

  cudnnConvolutionDescriptor_t convolution_descriptor;
  checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
  checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
                                             /*pad_height=*/1,
                                             /*pad_width=*/1,
                                             /*vertical_stride=*/1,
                                             /*horizontal_stride=*/1,
                                             /*dilation_height=*/1,
                                             /*dilation_width=*/1,
                                             /*mode=*/CUDNN_CROSS_CORRELATION,
                                             /*computeType=*/CUDNN_DATA_FLOAT));

  // Need to compute the output size
  int batch_size{0}, channels{0}, height{0}, width{0};
  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convolution_descriptor,
                                                   input_descriptor,
                                                   kernel_descriptor,
                                                   &batch_size,
                                                   &channels,
                                                   &height,
                                                   &width));

  std::cerr << "Output Image: " << height << " x " << width << " x " << channels
            << std::endl;

  // Need an output descriptor
  cudnnTensorDescriptor_t output_descriptor;
  checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
  checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
                                        /*format=*/CUDNN_TENSOR_NCHW,
                                        /*dataType=*/CUDNN_DATA_FLOAT,
                                        /*batch_size=*/1,
                                        /*channels=*/3,
                                        /*image_height=*/imgH,
                                        /*image_width=*/imgW));

  // Need to define the forward algorithm
  cudnnConvolutionFwdAlgo_t convolution_algorithm = CUDNN_CONVOLUTION_FWD_ALGO_FFT;

  // Have to compute the workspace size
  size_t workspace_bytes{0};
  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
                                                     input_descriptor,
                                                     kernel_descriptor,
                                                     convolution_descriptor,
                                                     output_descriptor,
                                                     convolution_algorithm,
                                                     &workspace_bytes));

  std::cerr << "Workspace size: " << (workspace_bytes / 1048576.0) << "MB"
            << std::endl;
  assert(workspace_bytes > 0);

  // Allocate the memory needed for the workspace
  void* d_workspace{nullptr};
  cudaMalloc(&d_workspace, workspace_bytes);

  // Allocate memory for the batch of images
  // and copy from host to device
  int image_bytes = batch_size * channels * height * width * sizeof(float);

  float* d_input{nullptr};
  cudaMalloc(&d_input, image_bytes);
  cudaMemcpy(d_input, image.ptr<float>(0), image_bytes, cudaMemcpyHostToDevice);

  // Allocate memory for the output images
  // Copy from host to device
  float* d_output{nullptr};
  cudaMalloc(&d_output, image_bytes);
  cudaMemset(d_output, 0, image_bytes);

  // clang-format off
  const float kernel_template[3][3] = {
    {1, 1, 1},
    {1, -8, 1},
    {1, 1, 1}
  };
  // clang-format on

  float h_kernel[3][3][3][3];
  for (int kernel = 0; kernel < 3; ++kernel) {
    for (int channel = 0; channel < 3; ++channel) {
      for (int row = 0; row < 3; ++row) {
        for (int column = 0; column < 3; ++column) {
          h_kernel[kernel][channel][row][column] = kernel_template[row][column];
        }
      }
    }
  }

  float* d_kernel{nullptr};
  cudaMalloc(&d_kernel, sizeof(h_kernel));
  cudaMemcpy(d_kernel, h_kernel, sizeof(h_kernel), cudaMemcpyHostToDevice);

  // Perform actual convolution
  const float alpha = 1.0f, beta = 0.0f;

  checkCUDNN(cudnnConvolutionForward(cudnn,
                                     &alpha,
                                     input_descriptor,
                                     d_input,
                                     kernel_descriptor,
                                     d_kernel,
                                     convolution_descriptor,
                                     convolution_algorithm,
                                     d_workspace,
                                     workspace_bytes,
                                     &beta,
                                     output_descriptor,
                                     d_output));

  // If wish to use sigmoid activation
  if (with_sigmoid) {
    cudnnActivationDescriptor_t activation_descriptor;
    checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
    checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
                                            CUDNN_ACTIVATION_SIGMOID,
                                            CUDNN_PROPAGATE_NAN,
                                            /*relu_coef=*/0));
    checkCUDNN(cudnnActivationForward(cudnn,
                                      activation_descriptor,
                                      &alpha,
                                      output_descriptor,
                                      d_output,
                                      &beta,
                                      output_descriptor,
                                      d_output));
    cudnnDestroyActivationDescriptor(activation_descriptor);
  }

  // Move results to host
  float* h_output = new float[image_bytes];
  cudaMemcpy(h_output, d_output, image_bytes, cudaMemcpyDeviceToHost);

  save_image("cudnn-out.png", h_output, height, width);

  // Free memory
  delete[] h_output;
  cudaFree(d_kernel);
  cudaFree(d_input);
  cudaFree(d_output);
  cudaFree(d_workspace);

  cudnnDestroyTensorDescriptor(input_descriptor);
  cudnnDestroyTensorDescriptor(output_descriptor);
  cudnnDestroyFilterDescriptor(kernel_descriptor);
  cudnnDestroyConvolutionDescriptor(convolution_descriptor);

  cudnnDestroy(cudnn);
}

我从文档中弄明白了:xDesc 的特征图高度 + 2 * convDesc 的零填充高度必须等于或小于 256 xDesc 的特征图宽度 + 2 * convDesc 的零填充宽度必须等于或小于 256。

当我最初阅读它时,我的印象是零填充高度意味着 kernelH-1,当它指的是填充图像的总高度/宽度时。

我的图片太大了。如果我调整它的大小,即:

cv::Mat inputBlob = blobFromImage(image, 1.0f, cv::Size(100,100), cv::Scalar(0,0,0));