CuDNN Reduce(归约)张量格式错误
CuDNN Reduce Format Bug
我真的不想在这里转储大量代码,但我希望它是可编译的。以下用于演示CuDNN中可能存在的错误(很可能是误解)。
#include <vector>
#include <cudnn.h>
#include <cuda.h>
#include <iostream>
#include <sstream>
// Checks the status returned by a CUDA runtime or cuDNN call and forwards it,
// together with the call site (__FILE__, __LINE__), to the gpuAssert overload
// matching the status type.
// The body is wrapped in do { } while (0) so that `gpuErrchk(x);` expands to
// exactly one statement and remains valid inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
// Reports a failed cuDNN call.
//
// code  - status returned by the cuDNN call.
// file  - source file of the call site.
// line  - source line of the call site.
// abort - when true (the default) a std::runtime_error carrying the report
//         is thrown after the report has been written to std::cerr.
//
// Does nothing when code is CUDNN_STATUS_SUCCESS.
inline void gpuAssert(cudnnStatus_t code, const char *file, int line, bool abort=true)
{
    if (code == CUDNN_STATUS_SUCCESS)
        return;

    // Numeric status, cuDNN's description of it, then the call site.
    std::ostringstream report;
    report << "CuDNNassert: (" << code << ") " << cudnnGetErrorString(code)
           << " " << file << " " << line;
    const std::string message = report.str();

    std::cerr << message << std::endl;
    if (!abort)
        return;
    throw std::runtime_error(message);
}
// Reports a failed CUDA runtime call.
//
// code  - status returned by the CUDA runtime call.
// file  - source file of the call site.
// line  - source line of the call site.
// abort - when true (the default) a std::runtime_error carrying the report
//         is thrown after the report has been written to std::cerr.
//
// Does nothing when code is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;

    // Numeric status, the runtime's description of it, then the call site.
    std::ostringstream report;
    report << "CUDAassert: (" << code << ") " << cudaGetErrorString(code)
           << " " << file << " " << line;
    const std::string message = report.str();

    std::cerr << message << std::endl;
    if (!abort)
        return;
    throw std::runtime_error(message);
}
// Maps the C++ element type T to the corresponding cuDNN data-type enumerator.
//
// Supported T:
//   float                -> CUDNN_DATA_FLOAT
//   double               -> CUDNN_DATA_DOUBLE
//   int                  -> CUDNN_DATA_INT32
//   char / signed char   -> CUDNN_DATA_INT8
//
// Throws std::runtime_error for any other T.
template<typename T>
cudnnDataType_t getCudnnType()
{
    if (std::is_same<T, float>::value)
        return CUDNN_DATA_FLOAT;
    if (std::is_same<T, double>::value)
        return CUDNN_DATA_DOUBLE;
    if (std::is_same<T, int>::value)
        return CUDNN_DATA_INT32;
    // signed char is a distinct type from plain char, so it needs its own
    // check; without it an int8_t-style element type would be rejected.
    if (std::is_same<T, char>::value || std::is_same<T, signed char>::value)
        return CUDNN_DATA_INT8;

    throw std::runtime_error(
        "Unsupported element type: expected float, double, int, char or signed char");
}
// Reduces the 4-D tensor in gpuA with cudnnReduceTensor and stores the result
// in a device buffer that this function allocates.
//
// Parameters:
//   cudnn                  - cuDNN handle used for every cuDNN call made here.
//   gpuA                   - device pointer to the input tensor.
//   gpuB                   - out-parameter: receives a cudaMalloc'd buffer of
//                            outN*outH*outW*outC elements holding the result.
//                            The caller releases it with cudaFree. It is set
//                            to nullptr when this function throws.
//   n, h, w, c             - extents of the input tensor.
//   outN, outH, outW, outC - extents of the output tensor (e.g. outC = 1 to
//                            reduce along channels).
//   reduceType             - reduction operator handed to cuDNN.
//   format                 - tensor layout used for BOTH the input and the
//                            output descriptor.
//
// Throws std::runtime_error (through gpuErrchk) when a CUDA or cuDNN call
// fails, and whatever getCudnnType<T>() throws for an unsupported T.
template<typename T>
void _reduce(cudnnHandle_t& cudnn, T* gpuA, T** gpuB,
             int n, int h, int w, int c,
             int outN, int outH, int outW, int outC,
             cudnnReduceTensorOp_t reduceType, cudnnTensorFormat_t format)
{
    // Resolve the element type first so that an unsupported T fails before
    // anything has been allocated.
    const cudnnDataType_t dType = getCudnnType<T>();

    // Widen before multiplying: the product of four ints can overflow int.
    const size_t outBytes =
        static_cast<size_t>(outN) * outH * outW * outC * sizeof(T);

    // cuDNN receives the scaling factors through untyped pointers and reads
    // them as double when the tensors are double; for the other tensor types
    // float scaling factors are used (see the cuDNN scaling-parameter
    // documentation). Passing a float for T = double makes cuDNN read garbage.
    const float  alphaF = 1.0f, betaF = 0.0f;
    const double alphaD = 1.0,  betaD = 0.0;
    const bool useDouble = (dType == CUDNN_DATA_DOUBLE);
    const void* alpha = useDouble ? static_cast<const void*>(&alphaD)
                                  : static_cast<const void*>(&alphaF);
    const void* beta  = useDouble ? static_cast<const void*>(&betaD)
                                  : static_cast<const void*>(&betaF);

    *gpuB = nullptr;
    cudnnTensorDescriptor_t inputDescriptor = nullptr;
    cudnnTensorDescriptor_t outputDescriptor = nullptr;
    cudnnReduceTensorDescriptor_t reduceTensorDesc = nullptr;
    void* gpuWorkspace = nullptr;
    void* gpuIndices = nullptr;

    try
    {
        gpuErrchk( cudaMalloc(gpuB, outBytes) );
        gpuErrchk( cudaMemset(*gpuB, 0, outBytes) );

        // The extents are always passed in n, c, h, w order; `format` selects
        // the memory layout.
        gpuErrchk( cudnnCreateTensorDescriptor(&inputDescriptor) );
        gpuErrchk( cudnnSetTensor4dDescriptor(inputDescriptor,
                                              format,
                                              dType,
                                              n, c, h, w) );

        gpuErrchk( cudnnCreateTensorDescriptor(&outputDescriptor) );
        gpuErrchk( cudnnSetTensor4dDescriptor(outputDescriptor,
                                              format,
                                              dType,
                                              outN, outC, outH, outW) );

        gpuErrchk( cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) );
        gpuErrchk( cudnnSetReduceTensorDescriptor(reduceTensorDesc,
                                                  reduceType,
                                                  dType,
                                                  CUDNN_NOT_PROPAGATE_NAN,
                                                  CUDNN_REDUCE_TENSOR_NO_INDICES,
                                                  CUDNN_8BIT_INDICES) );

        // Ask cuDNN how much scratch space this particular reduction needs.
        size_t workspaceSize = 0;
        gpuErrchk( cudnnGetReductionWorkspaceSize(cudnn,
                                                  reduceTensorDesc,
                                                  inputDescriptor,
                                                  outputDescriptor,
                                                  &workspaceSize) );
        size_t indicesSize = 0;
        gpuErrchk( cudnnGetReductionIndicesSize(cudnn,
                                                reduceTensorDesc,
                                                inputDescriptor,
                                                outputDescriptor,
                                                &indicesSize) );

        gpuErrchk( cudaMalloc(&gpuWorkspace, workspaceSize) );
        gpuErrchk( cudaMalloc(&gpuIndices, indicesSize) );

        gpuErrchk( cudnnReduceTensor(cudnn,
                                     reduceTensorDesc,
                                     gpuIndices, indicesSize,
                                     gpuWorkspace, workspaceSize,
                                     alpha,
                                     inputDescriptor, gpuA,
                                     beta,
                                     outputDescriptor, *gpuB) );
        gpuErrchk( cudaDeviceSynchronize() );
    }
    catch (...)
    {
        // Best-effort cleanup: statuses are ignored on purpose so that the
        // original error is the one that propagates.
        if (reduceTensorDesc) cudnnDestroyReduceTensorDescriptor(reduceTensorDesc);
        if (inputDescriptor)  cudnnDestroyTensorDescriptor(inputDescriptor);
        if (outputDescriptor) cudnnDestroyTensorDescriptor(outputDescriptor);
        cudaFree(gpuIndices);
        cudaFree(gpuWorkspace);
        cudaFree(*gpuB);
        *gpuB = nullptr;
        throw;
    }

    // Success path: release everything except the result buffer, which is
    // now owned by the caller.
    gpuErrchk( cudnnDestroyReduceTensorDescriptor(reduceTensorDesc) );
    gpuErrchk( cudnnDestroyTensorDescriptor(inputDescriptor) );
    gpuErrchk( cudnnDestroyTensorDescriptor(outputDescriptor) );
    gpuErrchk( cudaFree(gpuIndices) );
    gpuErrchk( cudaFree(gpuWorkspace) );
}
int main(int argc, char **argv) {
std::cout << "cudnn ver: " << CUDNN_MAJOR << "." << CUDNN_MINOR << "." << CUDNN_PATCHLEVEL << std::endl;
cudnnHandle_t cudnn;
gpuErrchk( cudnnCreate(&cudnn) );
std::vector<float> in = {3,5,7,11,13,17,19,23,29,31};
//NHWC: 3, 7, 13, 19, 29
// 5, 11, 17, 23, 31
//HCHW: 3, 5, 7, 11, 13
// 17, 19, 23, 29, 31
float* data_d;
int n = 1, h = 1, w = 5, c = 2;
size_t numElem = n*h*w*c;
size_t arrSize = numElem*sizeof(float);
//buffer to print results
std::vector<float> cpuRes(5);
gpuErrchk( cudaMalloc((void**) &data_d, arrSize) );
gpuErrchk( cudaMemcpy(data_d, &in[0], arrSize, cudaMemcpyHostToDevice) );
float* res_d;
_reduce(cudnn, data_d, &res_d,
n, h, w, c,
1, 1, 5, 1, //reduce along channels
CUDNN_REDUCE_TENSOR_ADD, CUDNN_TENSOR_NHWC); //use intended format
gpuErrchk( cudaMemcpy(&cpuRes[0], res_d, 5*sizeof(float), cudaMemcpyDeviceToHost) );
std::cout << "[";
for(auto& v : cpuRes)
std::cout << v << ",";
std::cout << "]" << std::endl;
//expected: [8,18,30,42,60,]
//result: [20,24,30,40,44,]
gpuErrchk( cudaFree(res_d) ); //next call will alloc again
_reduce(cudnn, data_d, &res_d,
n, h, w, c,
1, 1, 5, 1, //reduce along channels
CUDNN_REDUCE_TENSOR_ADD, CUDNN_TENSOR_NCHW); //use other format
gpuErrchk( cudaMemcpy(&cpuRes[0], res_d, 5*sizeof(float), cudaMemcpyDeviceToHost) );
std::cout << "[";
for(auto& v : cpuRes)
std::cout << v << ",";
std::cout << "]" << std::endl;
//expected: [20,24,30,40,44,]
//result: [20,24,30,40,44,]
gpuErrchk( cudaFree(res_d) );
gpuErrchk( cudaFree(data_d) );
gpuErrchk( cudnnDestroy(cudnn) );
return 0;
}
如果你想自己测试,这里是我用来编译的 cmake
文件:
# Build script for the demo: one executable, "Main", built from main.cu.
cmake_minimum_required(VERSION 3.0)
project(Main)
# NOTE(review): the demo source shown with this script includes no OpenCV
# headers - this dependency looks unnecessary; confirm before removing.
find_package(OpenCV REQUIRED)
find_package(CUDA REQUIRED)
# No find_package is used for cuDNN; the library is linked by name below.
#find_package(CUDNN REQUIRED)
# Host compiler flags. Note this overwrites any CMAKE_CXX_FLAGS set earlier.
set(CMAKE_CXX_FLAGS "--std=c++11 -Wall -fPIC -D_GLIBCXX_USE_CXX11_ABI=0 -D GOOGLE_CUDA=1")
# Append the per-thread default stream option to the existing nvcc flags.
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --default-stream per-thread" )
set(CMAKE_BUILD_TYPE Debug)
# Pass flags to the C++ compiler.
set(CUDA_PROPAGATE_HOST_FLAGS ON)
# Source files that make up the executable.
set(MAIN_SRC
"main.cu"
)
include_directories(${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS})
cuda_add_executable(Main ${MAIN_SRC})
# cudnn and stdc++fs are linked by bare library name.
target_link_libraries(Main ${OpenCV_LIBS} ${CUDA_LIBRARIES} cudnn stdc++fs)
控制台的输出是:
cudnn ver: 7.3.1
[20,24,30,40,44,]
[20,24,30,40,44,]
这显然是错误的输出。沿同一维度做归约时,更改维度顺序应当得到不同的值(即 [8,18,30,42,60,])。
即使使用 cudnnSetTensor4dDescriptorEx 显式设置各个维度的步幅似乎也不起作用,各步幅按如下方式计算:
int ns = c*w*h;
int cs = 1;
int hs = c*w;
int ws = c;
查看可通过下载 CuDNN 库获得的示例,他们使用 cudnnSetTensorNdDescriptor
而不是 cudnnSetTensor4dDescriptor
。但是 cudnnSetTensorNdDescriptor
的文档指出:
When working with lower dimensional data, it is recommended that the
user create a 4D tensor, and set the size along unused dimensions to
1.
鉴于使用 cudnnSetTensorNdDescriptor 时需要自己计算步幅,更可取的做法是使用 cudnnSetTensor4dDescriptor。
这是 CuDNN 中的错误还是我的代码有什么我没有看到的错误?
上述代码的问题出在我自己代码里的一个非常愚蠢的错误。文档中写道:
C = alpha * reduce op ( A ) + beta * C
和
The data types of the tensors A and C must match if of type double. In this case, alpha and beta and the computation enum of reduceTensorDesc are all assumed to be of type double.
错误在两行代码中:
float alpha = 1;
float beta = 0;
应该是:
T alpha = 1;
T beta = 0;
这两个 float 变量被当作 double 来解释(读出的本质上是垃圾数据),然后与 reduce 操作的结果相乘。
我真的不想在这里转储大量代码,但我希望它是可编译的。以下用于演示CuDNN中可能存在的错误(很可能是误解)。
#include <vector>
#include <cudnn.h>
#include <cuda.h>
#include <iostream>
#include <sstream>
// Checks the status returned by a CUDA runtime or cuDNN call and forwards it,
// together with the call site (__FILE__, __LINE__), to the gpuAssert overload
// matching the status type.
// The body is wrapped in do { } while (0) so that `gpuErrchk(x);` expands to
// exactly one statement and remains valid inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
// Reports a failed cuDNN call.
//
// code  - status returned by the cuDNN call.
// file  - source file of the call site.
// line  - source line of the call site.
// abort - when true (the default) a std::runtime_error carrying the report
//         is thrown after the report has been written to std::cerr.
//
// Does nothing when code is CUDNN_STATUS_SUCCESS.
inline void gpuAssert(cudnnStatus_t code, const char *file, int line, bool abort=true)
{
    if (code == CUDNN_STATUS_SUCCESS)
        return;

    // Numeric status, cuDNN's description of it, then the call site.
    std::ostringstream report;
    report << "CuDNNassert: (" << code << ") " << cudnnGetErrorString(code)
           << " " << file << " " << line;
    const std::string message = report.str();

    std::cerr << message << std::endl;
    if (!abort)
        return;
    throw std::runtime_error(message);
}
// Reports a failed CUDA runtime call.
//
// code  - status returned by the CUDA runtime call.
// file  - source file of the call site.
// line  - source line of the call site.
// abort - when true (the default) a std::runtime_error carrying the report
//         is thrown after the report has been written to std::cerr.
//
// Does nothing when code is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;

    // Numeric status, the runtime's description of it, then the call site.
    std::ostringstream report;
    report << "CUDAassert: (" << code << ") " << cudaGetErrorString(code)
           << " " << file << " " << line;
    const std::string message = report.str();

    std::cerr << message << std::endl;
    if (!abort)
        return;
    throw std::runtime_error(message);
}
// Maps the C++ element type T to the corresponding cuDNN data-type enumerator.
//
// Supported T:
//   float                -> CUDNN_DATA_FLOAT
//   double               -> CUDNN_DATA_DOUBLE
//   int                  -> CUDNN_DATA_INT32
//   char / signed char   -> CUDNN_DATA_INT8
//
// Throws std::runtime_error for any other T.
template<typename T>
cudnnDataType_t getCudnnType()
{
    if (std::is_same<T, float>::value)
        return CUDNN_DATA_FLOAT;
    if (std::is_same<T, double>::value)
        return CUDNN_DATA_DOUBLE;
    if (std::is_same<T, int>::value)
        return CUDNN_DATA_INT32;
    // signed char is a distinct type from plain char, so it needs its own
    // check; without it an int8_t-style element type would be rejected.
    if (std::is_same<T, char>::value || std::is_same<T, signed char>::value)
        return CUDNN_DATA_INT8;

    throw std::runtime_error(
        "Unsupported element type: expected float, double, int, char or signed char");
}
// Reduces the 4-D tensor in gpuA with cudnnReduceTensor and stores the result
// in a device buffer that this function allocates.
//
// Parameters:
//   cudnn                  - cuDNN handle used for every cuDNN call made here.
//   gpuA                   - device pointer to the input tensor.
//   gpuB                   - out-parameter: receives a cudaMalloc'd buffer of
//                            outN*outH*outW*outC elements holding the result.
//                            The caller releases it with cudaFree. It is set
//                            to nullptr when this function throws.
//   n, h, w, c             - extents of the input tensor.
//   outN, outH, outW, outC - extents of the output tensor (e.g. outC = 1 to
//                            reduce along channels).
//   reduceType             - reduction operator handed to cuDNN.
//   format                 - tensor layout used for BOTH the input and the
//                            output descriptor.
//
// Throws std::runtime_error (through gpuErrchk) when a CUDA or cuDNN call
// fails, and whatever getCudnnType<T>() throws for an unsupported T.
template<typename T>
void _reduce(cudnnHandle_t& cudnn, T* gpuA, T** gpuB,
             int n, int h, int w, int c,
             int outN, int outH, int outW, int outC,
             cudnnReduceTensorOp_t reduceType, cudnnTensorFormat_t format)
{
    // Resolve the element type first so that an unsupported T fails before
    // anything has been allocated.
    const cudnnDataType_t dType = getCudnnType<T>();

    // Widen before multiplying: the product of four ints can overflow int.
    const size_t outBytes =
        static_cast<size_t>(outN) * outH * outW * outC * sizeof(T);

    // cuDNN receives the scaling factors through untyped pointers and reads
    // them as double when the tensors are double; for the other tensor types
    // float scaling factors are used (see the cuDNN scaling-parameter
    // documentation). Passing a float for T = double makes cuDNN read garbage.
    const float  alphaF = 1.0f, betaF = 0.0f;
    const double alphaD = 1.0,  betaD = 0.0;
    const bool useDouble = (dType == CUDNN_DATA_DOUBLE);
    const void* alpha = useDouble ? static_cast<const void*>(&alphaD)
                                  : static_cast<const void*>(&alphaF);
    const void* beta  = useDouble ? static_cast<const void*>(&betaD)
                                  : static_cast<const void*>(&betaF);

    *gpuB = nullptr;
    cudnnTensorDescriptor_t inputDescriptor = nullptr;
    cudnnTensorDescriptor_t outputDescriptor = nullptr;
    cudnnReduceTensorDescriptor_t reduceTensorDesc = nullptr;
    void* gpuWorkspace = nullptr;
    void* gpuIndices = nullptr;

    try
    {
        gpuErrchk( cudaMalloc(gpuB, outBytes) );
        gpuErrchk( cudaMemset(*gpuB, 0, outBytes) );

        // The extents are always passed in n, c, h, w order; `format` selects
        // the memory layout.
        gpuErrchk( cudnnCreateTensorDescriptor(&inputDescriptor) );
        gpuErrchk( cudnnSetTensor4dDescriptor(inputDescriptor,
                                              format,
                                              dType,
                                              n, c, h, w) );

        gpuErrchk( cudnnCreateTensorDescriptor(&outputDescriptor) );
        gpuErrchk( cudnnSetTensor4dDescriptor(outputDescriptor,
                                              format,
                                              dType,
                                              outN, outC, outH, outW) );

        gpuErrchk( cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) );
        gpuErrchk( cudnnSetReduceTensorDescriptor(reduceTensorDesc,
                                                  reduceType,
                                                  dType,
                                                  CUDNN_NOT_PROPAGATE_NAN,
                                                  CUDNN_REDUCE_TENSOR_NO_INDICES,
                                                  CUDNN_8BIT_INDICES) );

        // Ask cuDNN how much scratch space this particular reduction needs.
        size_t workspaceSize = 0;
        gpuErrchk( cudnnGetReductionWorkspaceSize(cudnn,
                                                  reduceTensorDesc,
                                                  inputDescriptor,
                                                  outputDescriptor,
                                                  &workspaceSize) );
        size_t indicesSize = 0;
        gpuErrchk( cudnnGetReductionIndicesSize(cudnn,
                                                reduceTensorDesc,
                                                inputDescriptor,
                                                outputDescriptor,
                                                &indicesSize) );

        gpuErrchk( cudaMalloc(&gpuWorkspace, workspaceSize) );
        gpuErrchk( cudaMalloc(&gpuIndices, indicesSize) );

        gpuErrchk( cudnnReduceTensor(cudnn,
                                     reduceTensorDesc,
                                     gpuIndices, indicesSize,
                                     gpuWorkspace, workspaceSize,
                                     alpha,
                                     inputDescriptor, gpuA,
                                     beta,
                                     outputDescriptor, *gpuB) );
        gpuErrchk( cudaDeviceSynchronize() );
    }
    catch (...)
    {
        // Best-effort cleanup: statuses are ignored on purpose so that the
        // original error is the one that propagates.
        if (reduceTensorDesc) cudnnDestroyReduceTensorDescriptor(reduceTensorDesc);
        if (inputDescriptor)  cudnnDestroyTensorDescriptor(inputDescriptor);
        if (outputDescriptor) cudnnDestroyTensorDescriptor(outputDescriptor);
        cudaFree(gpuIndices);
        cudaFree(gpuWorkspace);
        cudaFree(*gpuB);
        *gpuB = nullptr;
        throw;
    }

    // Success path: release everything except the result buffer, which is
    // now owned by the caller.
    gpuErrchk( cudnnDestroyReduceTensorDescriptor(reduceTensorDesc) );
    gpuErrchk( cudnnDestroyTensorDescriptor(inputDescriptor) );
    gpuErrchk( cudnnDestroyTensorDescriptor(outputDescriptor) );
    gpuErrchk( cudaFree(gpuIndices) );
    gpuErrchk( cudaFree(gpuWorkspace) );
}
int main(int argc, char **argv) {
std::cout << "cudnn ver: " << CUDNN_MAJOR << "." << CUDNN_MINOR << "." << CUDNN_PATCHLEVEL << std::endl;
cudnnHandle_t cudnn;
gpuErrchk( cudnnCreate(&cudnn) );
std::vector<float> in = {3,5,7,11,13,17,19,23,29,31};
//NHWC: 3, 7, 13, 19, 29
// 5, 11, 17, 23, 31
//HCHW: 3, 5, 7, 11, 13
// 17, 19, 23, 29, 31
float* data_d;
int n = 1, h = 1, w = 5, c = 2;
size_t numElem = n*h*w*c;
size_t arrSize = numElem*sizeof(float);
//buffer to print results
std::vector<float> cpuRes(5);
gpuErrchk( cudaMalloc((void**) &data_d, arrSize) );
gpuErrchk( cudaMemcpy(data_d, &in[0], arrSize, cudaMemcpyHostToDevice) );
float* res_d;
_reduce(cudnn, data_d, &res_d,
n, h, w, c,
1, 1, 5, 1, //reduce along channels
CUDNN_REDUCE_TENSOR_ADD, CUDNN_TENSOR_NHWC); //use intended format
gpuErrchk( cudaMemcpy(&cpuRes[0], res_d, 5*sizeof(float), cudaMemcpyDeviceToHost) );
std::cout << "[";
for(auto& v : cpuRes)
std::cout << v << ",";
std::cout << "]" << std::endl;
//expected: [8,18,30,42,60,]
//result: [20,24,30,40,44,]
gpuErrchk( cudaFree(res_d) ); //next call will alloc again
_reduce(cudnn, data_d, &res_d,
n, h, w, c,
1, 1, 5, 1, //reduce along channels
CUDNN_REDUCE_TENSOR_ADD, CUDNN_TENSOR_NCHW); //use other format
gpuErrchk( cudaMemcpy(&cpuRes[0], res_d, 5*sizeof(float), cudaMemcpyDeviceToHost) );
std::cout << "[";
for(auto& v : cpuRes)
std::cout << v << ",";
std::cout << "]" << std::endl;
//expected: [20,24,30,40,44,]
//result: [20,24,30,40,44,]
gpuErrchk( cudaFree(res_d) );
gpuErrchk( cudaFree(data_d) );
gpuErrchk( cudnnDestroy(cudnn) );
return 0;
}
如果你想自己测试,这里是我用来编译的 cmake
文件:
# Build script for the demo: one executable, "Main", built from main.cu.
cmake_minimum_required(VERSION 3.0)
project(Main)
# NOTE(review): the demo source shown with this script includes no OpenCV
# headers - this dependency looks unnecessary; confirm before removing.
find_package(OpenCV REQUIRED)
find_package(CUDA REQUIRED)
# No find_package is used for cuDNN; the library is linked by name below.
#find_package(CUDNN REQUIRED)
# Host compiler flags. Note this overwrites any CMAKE_CXX_FLAGS set earlier.
set(CMAKE_CXX_FLAGS "--std=c++11 -Wall -fPIC -D_GLIBCXX_USE_CXX11_ABI=0 -D GOOGLE_CUDA=1")
# Append the per-thread default stream option to the existing nvcc flags.
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --default-stream per-thread" )
set(CMAKE_BUILD_TYPE Debug)
# Pass flags to the C++ compiler.
set(CUDA_PROPAGATE_HOST_FLAGS ON)
# Source files that make up the executable.
set(MAIN_SRC
"main.cu"
)
include_directories(${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS})
cuda_add_executable(Main ${MAIN_SRC})
# cudnn and stdc++fs are linked by bare library name.
target_link_libraries(Main ${OpenCV_LIBS} ${CUDA_LIBRARIES} cudnn stdc++fs)
控制台的输出是:
cudnn ver: 7.3.1
[20,24,30,40,44,]
[20,24,30,40,44,]
这显然是错误的输出。沿同一维度做归约时,更改维度顺序应当得到不同的值(即 [8,18,30,42,60,])。
即使使用 cudnnSetTensor4dDescriptorEx 显式设置各个维度的步幅似乎也不起作用,各步幅按如下方式计算:
int ns = c*w*h;
int cs = 1;
int hs = c*w;
int ws = c;
查看可通过下载 CuDNN 库获得的示例,他们使用 cudnnSetTensorNdDescriptor
而不是 cudnnSetTensor4dDescriptor
。但是 cudnnSetTensorNdDescriptor
的文档指出:
When working with lower dimensional data, it is recommended that the user create a 4D tensor, and set the size along unused dimensions to 1.
鉴于使用 cudnnSetTensorNdDescriptor 时需要自己计算步幅,更可取的做法是使用 cudnnSetTensor4dDescriptor。
这是 CuDNN 中的错误还是我的代码有什么我没有看到的错误?
上述代码的问题出在我自己代码里的一个非常愚蠢的错误。文档中写道:
C = alpha * reduce op ( A ) + beta * C
和
The data types of the tensors A and C must match if of type double. In this case, alpha and beta and the computation enum of reduceTensorDesc are all assumed to be of type double.
错误在两行代码中:
float alpha = 1;
float beta = 0;
应该是:
T alpha = 1;
T beta = 0;
这两个 float 变量被当作 double 来解释(读出的本质上是垃圾数据),然后与 reduce 操作的结果相乘。