状态为 execution_failed 的 cudnn RNN 实现
cudnn RNN implementation with status execution_failed
为了测试cudnn rnn apis,我写了一个简单的应用程序,看看我的理解是否正确;
代码是这样的,
int layernum = 1;
int batchnum = 32;
int hiddenSize = 64;
float* h_weight, *h_hx_, *h_cx_, *h_hy_, *h_cy_;
float* h_input, *d_input;
float* h_output, *d_output;
cudnnHandle_t cu_dnnHandle;
cudnnDropoutDescriptor_t cu_dropoutDesc = nullptr;
cudnnRNNDescriptor_t cu_rnnDesc;
cudnnTensorDescriptor_t hx_desc_;
cudnnTensorDescriptor_t cx_desc_;
cudnnTensorDescriptor_t hy_desc_;
cudnnTensorDescriptor_t cy_desc_;
checkCUDNN(cudnnCreate(&cu_dnnHandle));
checkCUDNN(cudnnCreateRNNDescriptor(&cu_rnnDesc));
checkCUDNN(cudnnCreateDropoutDescriptor(&cu_dropoutDesc));
float drop_rate = 0.0f;
unsigned long long seed = 1337ull;
checkCUDNN(cudnnSetDropoutDescriptor(cu_dropoutDesc, cu_dnnHandle, drop_rate, NULL, 0, seed));
checkCUDNN(cudnnSetRNNDescriptor_v5(cu_rnnDesc,
hiddenSize, //hiddenSize
layernum, //numLayers
cu_dropoutDesc,
//CUDNN_SKIP_INPUT,
CUDNN_LINEAR_INPUT,
CUDNN_UNIDIRECTIONAL,
CUDNN_LSTM,
CUDNN_DATA_FLOAT
));
checkCUDNN(cudnnCreateTensorDescriptor(&hx_desc_));
checkCUDNN(cudnnCreateTensorDescriptor(&cx_desc_));
checkCUDNN(cudnnCreateTensorDescriptor(&hy_desc_));
checkCUDNN(cudnnCreateTensorDescriptor(&cy_desc_));
std::vector<cudnnTensorDescriptor_t> tensorDescs;
std::vector<cudnnTensorDescriptor_t> resultDescs;
int seq_len = 8;
tensorDescs.resize(seq_len);
resultDescs.resize(seq_len);
for (auto i = 0; i < seq_len; i++)
{
int dims[3] = {batchnum, 64, 1}; //batch = 32
int strides[3] = {64, 1, 1};
cudnnCreateTensorDescriptor(&tensorDescs[i]);
cudnnSetTensorNdDescriptor(tensorDescs[i],
CUDNN_DATA_FLOAT,
3,
dims,
strides
);
cudnnCreateTensorDescriptor(&resultDescs[i]);
cudnnSetTensorNdDescriptor(resultDescs[i],
CUDNN_DATA_FLOAT,
3,
dims,
strides
);
}
size_t workspace_size = 0;
checkCUDNN(cudnnGetRNNWorkspaceSize(cu_dnnHandle,
cu_rnnDesc,
seq_len,
tensorDescs.data(),
&workspace_size));
std::cout << workspace_size << std::endl;
void* work_space_;
void* param_space_;
cudaMalloc(&work_space_, workspace_size);
cudnnFilterDescriptor_t w_desc_;
size_t param_size;
checkCUDNN(cudnnGetRNNParamsSize(cu_dnnHandle, cu_rnnDesc, tensorDescs[0],
¶m_size, CUDNN_DATA_FLOAT));
cudaMalloc(¶m_space_, param_size);
int w_dim[3] = { (int)(param_size / sizeof(float)), 1, 1 };
checkCUDNN(cudnnCreateFilterDescriptor(&w_desc_));
checkCUDNN(cudnnSetFilterNdDescriptor(w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, w_dim));
h_weight = new float[param_size/sizeof(float)];
srand(time(NULL));
for (int i = 0; i < param_size / sizeof(float); i++)
{
h_weight[i] = (float)rand() / RAND_MAX;
}
cudaMemcpy(param_space_, h_weight, param_size, cudaMemcpyHostToDevice);
h_input = new float[batchnum * 64];
for (int i = 0; i < batchnum * 64; i++)
{
h_input[i] = (float)rand() / RAND_MAX;
}
cudaMalloc(&d_input, batchnum * 64 * sizeof(float));
cudaMemcpy(d_input, h_input, batchnum * 64 * sizeof(float), cudaMemcpyHostToDevice);
cudaMalloc(&d_output, batchnum * 64 * sizeof(float));
int h_dim[3] = { layernum , batchnum , hiddenSize };
int stride[3] = { h_dim[1] * h_dim[2], h_dim[2], 1 };
checkCUDNN(cudnnSetTensorNdDescriptor(hx_desc_, CUDNN_DATA_FLOAT, 3, h_dim, stride));
checkCUDNN(cudnnSetTensorNdDescriptor(cx_desc_, CUDNN_DATA_FLOAT, 3, h_dim, stride));
checkCUDNN(cudnnSetTensorNdDescriptor(hy_desc_, CUDNN_DATA_FLOAT, 3, h_dim, stride));
checkCUDNN(cudnnSetTensorNdDescriptor(cy_desc_, CUDNN_DATA_FLOAT, 3, h_dim, stride));
int bunch_hidden_size = h_dim[0] * h_dim[1] * h_dim[2];
h_hx_ = new float[bunch_hidden_size];
h_cx_ = new float[bunch_hidden_size];
h_hy_ = new float[bunch_hidden_size];
h_cy_ = new float[bunch_hidden_size];
float *d_hx_, *d_cx_, *d_hy_, *d_cy_;
cudaMalloc(&d_hx_, bunch_hidden_size * sizeof(float));
cudaMalloc(&d_cx_, bunch_hidden_size * sizeof(float));
cudaMalloc(&d_hy_, bunch_hidden_size * sizeof(float));
cudaMalloc(&d_cy_, bunch_hidden_size * sizeof(float));
for (int i = 0; i < bunch_hidden_size; i++)
{
h_hx_[i] = (float)rand() / RAND_MAX;
h_cx_[i] = (float)rand() / RAND_MAX;
}
cudaMemcpy(d_hx_, h_hx_, bunch_hidden_size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_cx_, h_cx_, bunch_hidden_size * sizeof(float), cudaMemcpyHostToDevice);
checkCUDNN(cudnnRNNForwardInference(cu_dnnHandle,
cu_rnnDesc,
seq_len,
tensorDescs.data(),
d_input,
hx_desc_,
d_hx_,
cx_desc_,
d_cx_,
w_desc_,
param_space_,
resultDescs.data(),
d_output,
hy_desc_,
d_hy_,
cy_desc_,
d_cy_,
work_space_,
workspace_size
));
当应用程序调用cudnnRNNForwardInference api,报告CUDNN_STATUS_EXECUTION_FAILED,我不知道代码中的哪一部分有问题,对这些代码有什么建议吗?
找到原因了,
h_input = new float[batchnum * 64];
for (int i = 0; i < batchnum * 64; i++)
{
h_input[i] = (float)rand() / RAND_MAX;
}
cudaMalloc(&d_input, batchnum * 64 * sizeof(float));
cudaMemcpy(d_input, h_input, batchnum * 64 * sizeof(float), cudaMemcpyHostToDevice);
cudaMalloc(&d_output, batchnum * 64 * sizeof(float));
需要分配seq_len * batchnum * 64条数据。
为了测试cudnn rnn apis,我写了一个简单的应用程序,看看我的理解是否正确;
代码是这样的,
int layernum = 1;
int batchnum = 32;
int hiddenSize = 64;
float* h_weight, *h_hx_, *h_cx_, *h_hy_, *h_cy_;
float* h_input, *d_input;
float* h_output, *d_output;
cudnnHandle_t cu_dnnHandle;
cudnnDropoutDescriptor_t cu_dropoutDesc = nullptr;
cudnnRNNDescriptor_t cu_rnnDesc;
cudnnTensorDescriptor_t hx_desc_;
cudnnTensorDescriptor_t cx_desc_;
cudnnTensorDescriptor_t hy_desc_;
cudnnTensorDescriptor_t cy_desc_;
checkCUDNN(cudnnCreate(&cu_dnnHandle));
checkCUDNN(cudnnCreateRNNDescriptor(&cu_rnnDesc));
checkCUDNN(cudnnCreateDropoutDescriptor(&cu_dropoutDesc));
float drop_rate = 0.0f;
unsigned long long seed = 1337ull;
checkCUDNN(cudnnSetDropoutDescriptor(cu_dropoutDesc, cu_dnnHandle, drop_rate, NULL, 0, seed));
checkCUDNN(cudnnSetRNNDescriptor_v5(cu_rnnDesc,
hiddenSize, //hiddenSize
layernum, //numLayers
cu_dropoutDesc,
//CUDNN_SKIP_INPUT,
CUDNN_LINEAR_INPUT,
CUDNN_UNIDIRECTIONAL,
CUDNN_LSTM,
CUDNN_DATA_FLOAT
));
checkCUDNN(cudnnCreateTensorDescriptor(&hx_desc_));
checkCUDNN(cudnnCreateTensorDescriptor(&cx_desc_));
checkCUDNN(cudnnCreateTensorDescriptor(&hy_desc_));
checkCUDNN(cudnnCreateTensorDescriptor(&cy_desc_));
std::vector<cudnnTensorDescriptor_t> tensorDescs;
std::vector<cudnnTensorDescriptor_t> resultDescs;
int seq_len = 8;
tensorDescs.resize(seq_len);
resultDescs.resize(seq_len);
for (auto i = 0; i < seq_len; i++)
{
int dims[3] = {batchnum, 64, 1}; //batch = 32
int strides[3] = {64, 1, 1};
cudnnCreateTensorDescriptor(&tensorDescs[i]);
cudnnSetTensorNdDescriptor(tensorDescs[i],
CUDNN_DATA_FLOAT,
3,
dims,
strides
);
cudnnCreateTensorDescriptor(&resultDescs[i]);
cudnnSetTensorNdDescriptor(resultDescs[i],
CUDNN_DATA_FLOAT,
3,
dims,
strides
);
}
size_t workspace_size = 0;
checkCUDNN(cudnnGetRNNWorkspaceSize(cu_dnnHandle,
cu_rnnDesc,
seq_len,
tensorDescs.data(),
&workspace_size));
std::cout << workspace_size << std::endl;
void* work_space_;
void* param_space_;
cudaMalloc(&work_space_, workspace_size);
cudnnFilterDescriptor_t w_desc_;
size_t param_size;
checkCUDNN(cudnnGetRNNParamsSize(cu_dnnHandle, cu_rnnDesc, tensorDescs[0],
¶m_size, CUDNN_DATA_FLOAT));
cudaMalloc(¶m_space_, param_size);
int w_dim[3] = { (int)(param_size / sizeof(float)), 1, 1 };
checkCUDNN(cudnnCreateFilterDescriptor(&w_desc_));
checkCUDNN(cudnnSetFilterNdDescriptor(w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, w_dim));
h_weight = new float[param_size/sizeof(float)];
srand(time(NULL));
for (int i = 0; i < param_size / sizeof(float); i++)
{
h_weight[i] = (float)rand() / RAND_MAX;
}
cudaMemcpy(param_space_, h_weight, param_size, cudaMemcpyHostToDevice);
h_input = new float[batchnum * 64];
for (int i = 0; i < batchnum * 64; i++)
{
h_input[i] = (float)rand() / RAND_MAX;
}
cudaMalloc(&d_input, batchnum * 64 * sizeof(float));
cudaMemcpy(d_input, h_input, batchnum * 64 * sizeof(float), cudaMemcpyHostToDevice);
cudaMalloc(&d_output, batchnum * 64 * sizeof(float));
int h_dim[3] = { layernum , batchnum , hiddenSize };
int stride[3] = { h_dim[1] * h_dim[2], h_dim[2], 1 };
checkCUDNN(cudnnSetTensorNdDescriptor(hx_desc_, CUDNN_DATA_FLOAT, 3, h_dim, stride));
checkCUDNN(cudnnSetTensorNdDescriptor(cx_desc_, CUDNN_DATA_FLOAT, 3, h_dim, stride));
checkCUDNN(cudnnSetTensorNdDescriptor(hy_desc_, CUDNN_DATA_FLOAT, 3, h_dim, stride));
checkCUDNN(cudnnSetTensorNdDescriptor(cy_desc_, CUDNN_DATA_FLOAT, 3, h_dim, stride));
int bunch_hidden_size = h_dim[0] * h_dim[1] * h_dim[2];
h_hx_ = new float[bunch_hidden_size];
h_cx_ = new float[bunch_hidden_size];
h_hy_ = new float[bunch_hidden_size];
h_cy_ = new float[bunch_hidden_size];
float *d_hx_, *d_cx_, *d_hy_, *d_cy_;
cudaMalloc(&d_hx_, bunch_hidden_size * sizeof(float));
cudaMalloc(&d_cx_, bunch_hidden_size * sizeof(float));
cudaMalloc(&d_hy_, bunch_hidden_size * sizeof(float));
cudaMalloc(&d_cy_, bunch_hidden_size * sizeof(float));
for (int i = 0; i < bunch_hidden_size; i++)
{
h_hx_[i] = (float)rand() / RAND_MAX;
h_cx_[i] = (float)rand() / RAND_MAX;
}
cudaMemcpy(d_hx_, h_hx_, bunch_hidden_size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_cx_, h_cx_, bunch_hidden_size * sizeof(float), cudaMemcpyHostToDevice);
checkCUDNN(cudnnRNNForwardInference(cu_dnnHandle,
cu_rnnDesc,
seq_len,
tensorDescs.data(),
d_input,
hx_desc_,
d_hx_,
cx_desc_,
d_cx_,
w_desc_,
param_space_,
resultDescs.data(),
d_output,
hy_desc_,
d_hy_,
cy_desc_,
d_cy_,
work_space_,
workspace_size
));
当应用程序调用cudnnRNNForwardInference api,报告CUDNN_STATUS_EXECUTION_FAILED,我不知道代码中的哪一部分有问题,对这些代码有什么建议吗?
找到原因了,
h_input = new float[batchnum * 64];
for (int i = 0; i < batchnum * 64; i++)
{
h_input[i] = (float)rand() / RAND_MAX;
}
cudaMalloc(&d_input, batchnum * 64 * sizeof(float));
cudaMemcpy(d_input, h_input, batchnum * 64 * sizeof(float), cudaMemcpyHostToDevice);
cudaMalloc(&d_output, batchnum * 64 * sizeof(float));
需要分配seq_len * batchnum * 64条数据。