矩阵向量乘法 (cublasDgemv) returns 零
Matrix-vector multiplication (cublasDgemv) returns zero
我第一次尝试 CUDA/cuBLAS,我尝试编写一个简单的函数,将 MxN 矩阵(用向量的向量表示,std::vector
)与 Nx1 "ones"向量,从而得到矩阵的行(?)和。这将利用 cublas_gemv()
加上其他基本 CUDA 操作,我认为这是一个很好的起点。
在处理了设置问题和 reading/copying 示例代码之后,我得到的是:
std::vector<double> test(std::vector<std::vector<double>> in)
{
std::vector<double> out;
long in_m = in.size();
long in_n = in[0].size();
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
// This just converts a vector-of-vectors into a col-first array
double* p_in = vec2d_to_colfirst_array(in);
double* p_ones = new double[in_n];
double* p_out = new double[in_m];
std::fill(p_ones, p_ones + in_n, 1.0);
double* dev_in;
double* dev_ones;
double* dev_out;
cudaStat = cudaMalloc((void**)&dev_in, in_m * in_n * sizeof(double));
cudaStat = cudaMalloc((void**)&dev_ones, in_n * sizeof(double));
cudaStat = cudaMalloc((void**)&dev_out, in_m * sizeof(double));
stat = cublasCreate(&handle);
cudaStat = cudaMemcpy(dev_in, p_in, in_m*in_n * sizeof(double), cudaMemcpyHostToDevice);
cudaStat = cudaMemcpy(dev_ones, p_ones, in_n * sizeof(double), cudaMemcpyHostToDevice);
double alpha = 1.0;
double beta = 0.0;
stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_ones, 1);
cudaStat = cudaMemcpy(p_out, dev_out, in_m * sizeof(double), cudaMemcpyDeviceToHost);
out.assign(p_out, p_out + in_m);
cudaFree(dev_in);
cudaFree(dev_ones);
cudaFree(dev_out);
cublasDestroy(handle);
free(p_in);
free(p_ones);
free(p_out);
return out;
}
它看起来与我阅读的示例没有太大区别,所以我预计它会 "just work"。但是,当我检查 p_out
时,它全为零。我肯定没有输入零 in
矩阵。
我验证了 vec2d_to_colfirst_array()
工作正常,并且通过将数据从设备复制回主机然后读取,dev_in
/dev_ones
已正确填充。也许问题出在对 cublasDgemv()
的调用中,但由于我是新手(而且因为与 Eigen 等相比,BLAS 语法更不直观),在经历了很多挫折之后,我只是看不出出了什么问题。
感谢任何帮助!
错误看起来相当简单。您希望从 dev_out
:
复制结果
cudaStat = cudaMemcpy(p_out, dev_out, in_m * sizeof(double), cudaMemcpyDeviceToHost);
但是你从来没有在你的 cublas 调用中使用 dev_out
:
stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_ones, 1);
这似乎只是一个复制粘贴错误。如果您将 cublas 调用中的最后一个 dev_ones
实例替换为 dev_out
,您的代码适用于我:
stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_out, 1);
这是一个完整的示例,其中包含该更改:
$ cat t315.cu
#include <vector>
#include <cublas_v2.h>
#include <iostream>
const long idim1 = 8;
const long idim2 = 8;
double* vec2d_to_colfirst_array(std::vector<std::vector<double>> in){
long dim1 = in.size();
long dim2 = in[0].size();
long k = 0;
double *res = new double[dim1*dim2];
for (int i = 0; i < dim1; i++)
for (int j = 0; j < dim2; j++) res[k++] = in[i][j];
return res;
}
std::vector<double> test(std::vector<std::vector<double>> in)
{
std::vector<double> out;
long in_m = in.size();
long in_n = in[0].size();
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
// This just converts a vector-of-vectors into a col-first array
double* p_in = vec2d_to_colfirst_array(in);
double* p_ones = new double[in_n];
double* p_out = new double[in_m];
std::fill(p_ones, p_ones + in_n, 1.0);
double* dev_in;
double* dev_ones;
double* dev_out;
cudaStat = cudaMalloc((void**)&dev_in, in_m * in_n * sizeof(double));
cudaStat = cudaMalloc((void**)&dev_ones, in_n * sizeof(double));
cudaStat = cudaMalloc((void**)&dev_out, in_m * sizeof(double));
stat = cublasCreate(&handle);
cudaStat = cudaMemcpy(dev_in, p_in, in_m*in_n * sizeof(double), cudaMemcpyHostToDevice);
cudaStat = cudaMemcpy(dev_ones, p_ones, in_n * sizeof(double), cudaMemcpyHostToDevice);
double alpha = 1.0;
double beta = 0.0;
stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_out, 1);
cudaStat = cudaMemcpy(p_out, dev_out, in_m * sizeof(double), cudaMemcpyDeviceToHost);
out.assign(p_out, p_out + in_m);
cudaFree(dev_in);
cudaFree(dev_ones);
cudaFree(dev_out);
cublasDestroy(handle);
free(p_in);
free(p_ones);
free(p_out);
return out;
}
int main(){
std::vector<double> a(idim2, 1.0);
std::vector<std::vector<double>> b;
for (int i = 0; i < idim1; i++) b.push_back(a);
std::vector<double> c = test(b);
for (int i = 0; i < c.size(); i++) std::cout << c[i] << ",";
std::cout << std::endl;
}
$ nvcc -std=c++11 -o t315 t315.cu -lcublas
t315.cu(24): warning: variable "cudaStat" was set but never used
t315.cu(25): warning: variable "stat" was set but never used
$ cuda-memcheck ./t315
========= CUDA-MEMCHECK
8,8,8,8,8,8,8,8,
========= ERROR SUMMARY: 0 errors
$
请注意,我认为 free()
与 new
一起使用时 API 不正确,但这似乎不是您问题的症结所在。
我第一次尝试 CUDA/cuBLAS,我尝试编写一个简单的函数,将 MxN 矩阵(用向量的向量表示,std::vector
)与 Nx1 "ones"向量,从而得到矩阵的行(?)和。这将利用 cublas_gemv()
加上其他基本 CUDA 操作,我认为这是一个很好的起点。
在处理了设置问题和 reading/copying 示例代码之后,我得到的是:
std::vector<double> test(std::vector<std::vector<double>> in)
{
std::vector<double> out;
long in_m = in.size();
long in_n = in[0].size();
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
// This just converts a vector-of-vectors into a col-first array
double* p_in = vec2d_to_colfirst_array(in);
double* p_ones = new double[in_n];
double* p_out = new double[in_m];
std::fill(p_ones, p_ones + in_n, 1.0);
double* dev_in;
double* dev_ones;
double* dev_out;
cudaStat = cudaMalloc((void**)&dev_in, in_m * in_n * sizeof(double));
cudaStat = cudaMalloc((void**)&dev_ones, in_n * sizeof(double));
cudaStat = cudaMalloc((void**)&dev_out, in_m * sizeof(double));
stat = cublasCreate(&handle);
cudaStat = cudaMemcpy(dev_in, p_in, in_m*in_n * sizeof(double), cudaMemcpyHostToDevice);
cudaStat = cudaMemcpy(dev_ones, p_ones, in_n * sizeof(double), cudaMemcpyHostToDevice);
double alpha = 1.0;
double beta = 0.0;
stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_ones, 1);
cudaStat = cudaMemcpy(p_out, dev_out, in_m * sizeof(double), cudaMemcpyDeviceToHost);
out.assign(p_out, p_out + in_m);
cudaFree(dev_in);
cudaFree(dev_ones);
cudaFree(dev_out);
cublasDestroy(handle);
free(p_in);
free(p_ones);
free(p_out);
return out;
}
它看起来与我阅读的示例没有太大区别,所以我预计它会 "just work"。但是,当我检查 p_out
时,它全为零。我肯定没有输入零 in
矩阵。
我验证了 vec2d_to_colfirst_array()
工作正常,并且通过将数据从设备复制回主机然后读取,dev_in
/dev_ones
已正确填充。也许问题出在对 cublasDgemv()
的调用中,但由于我是新手(而且因为与 Eigen 等相比,BLAS 语法更不直观),在经历了很多挫折之后,我只是看不出出了什么问题。
感谢任何帮助!
错误看起来相当简单。您希望从 dev_out
:
cudaStat = cudaMemcpy(p_out, dev_out, in_m * sizeof(double), cudaMemcpyDeviceToHost);
但是你从来没有在你的 cublas 调用中使用 dev_out
:
stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_ones, 1);
这似乎只是一个复制粘贴错误。如果您将 cublas 调用中的最后一个 dev_ones
实例替换为 dev_out
,您的代码适用于我:
stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_out, 1);
这是一个完整的示例,其中包含该更改:
$ cat t315.cu
#include <vector>
#include <cublas_v2.h>
#include <iostream>
const long idim1 = 8;
const long idim2 = 8;
double* vec2d_to_colfirst_array(std::vector<std::vector<double>> in){
long dim1 = in.size();
long dim2 = in[0].size();
long k = 0;
double *res = new double[dim1*dim2];
for (int i = 0; i < dim1; i++)
for (int j = 0; j < dim2; j++) res[k++] = in[i][j];
return res;
}
std::vector<double> test(std::vector<std::vector<double>> in)
{
std::vector<double> out;
long in_m = in.size();
long in_n = in[0].size();
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
// This just converts a vector-of-vectors into a col-first array
double* p_in = vec2d_to_colfirst_array(in);
double* p_ones = new double[in_n];
double* p_out = new double[in_m];
std::fill(p_ones, p_ones + in_n, 1.0);
double* dev_in;
double* dev_ones;
double* dev_out;
cudaStat = cudaMalloc((void**)&dev_in, in_m * in_n * sizeof(double));
cudaStat = cudaMalloc((void**)&dev_ones, in_n * sizeof(double));
cudaStat = cudaMalloc((void**)&dev_out, in_m * sizeof(double));
stat = cublasCreate(&handle);
cudaStat = cudaMemcpy(dev_in, p_in, in_m*in_n * sizeof(double), cudaMemcpyHostToDevice);
cudaStat = cudaMemcpy(dev_ones, p_ones, in_n * sizeof(double), cudaMemcpyHostToDevice);
double alpha = 1.0;
double beta = 0.0;
stat = cublasDgemv(handle, CUBLAS_OP_N, in_m, in_n, &alpha, dev_in, in_m, dev_ones, 1, &beta, dev_out, 1);
cudaStat = cudaMemcpy(p_out, dev_out, in_m * sizeof(double), cudaMemcpyDeviceToHost);
out.assign(p_out, p_out + in_m);
cudaFree(dev_in);
cudaFree(dev_ones);
cudaFree(dev_out);
cublasDestroy(handle);
free(p_in);
free(p_ones);
free(p_out);
return out;
}
int main(){
std::vector<double> a(idim2, 1.0);
std::vector<std::vector<double>> b;
for (int i = 0; i < idim1; i++) b.push_back(a);
std::vector<double> c = test(b);
for (int i = 0; i < c.size(); i++) std::cout << c[i] << ",";
std::cout << std::endl;
}
$ nvcc -std=c++11 -o t315 t315.cu -lcublas
t315.cu(24): warning: variable "cudaStat" was set but never used
t315.cu(25): warning: variable "stat" was set but never used
$ cuda-memcheck ./t315
========= CUDA-MEMCHECK
8,8,8,8,8,8,8,8,
========= ERROR SUMMARY: 0 errors
$
请注意,我认为 free()
与 new
一起使用时 API 不正确,但这似乎不是您问题的症结所在。