cuSolverDnDgetrf 不工作的 cuSolver 样本
cuSolver sample with cuSolverDnDgetrf not working
好的。我正在用从 cuSolver 示例中获取的一些代码弄脏我的手。
我对 C++ 经验不足,但我设法从原始代码中获取了我需要的东西。
问题是当我尝试执行它时;按照参考手册的建议,我编译时使用:
nvcc -c att3_cus_lu.cpp -I/usr/local/cuda-8.0/targets/x86_64-linux/include
g++ -fopenmp -o res.out att3_cus_lu.o -L/usr/local/cuda/lib64 -lcudart -lcublas -lcusolver -lcusparse
到这里没问题;我得到的输出总是一样的:
step 1: set matrix (A) and right hand side vector (b)
step 2: prepare data on device
step 3: solve A*x = b
Error: LU factorization failed
INFO_VALUE = 2
timing: LU = 0.000208 sec
step 4: evaluate residual
|b - A*x| = NAN
|A| = 1.000000E+00
|x| = NAN
|b - A*x|/(|A|*|x|) = NAN
无论我输入什么矩阵或b向量,结果都是一样的;代码无法分解矩阵。
我已经展示了 INFO_VALUE 每次执行时其值始终为 2;它是为 cuSolverDnDgetrf() 函数请求的 info
变量的值。在 cuSolver 参考手册中它说:
This function computes the LU factorization of a m×n matrix PA = LU
where A is a m×n matrix, P is a permutation matrix, L is a lower triangular matrix with unit diagonal, and U is an upper triangular matrix.
If LU factorization failed, i.e. matrix A(U) is singular, The output parameter devInfo=i indicates U(i,i) = 0.
在下面的代码中,我放置了相同的矩阵,因此 运行 周围没有奇异矩阵。
这是完整的代码; main() cuda 代码的模式是重复的:定义主机变量,cudaMemcpy 它们到设备,使用 cuda 函数在设备上执行它们,cudaMemcpy 它们回到主机,然后继续串行代码直到你重复。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"
#include "helper_cuda.h"
#include "helper_cusolver.h"
#ifndef MAX
#define MAX(a,b) (a > b) ? a : b
#endif
void linearSolverLU(
cusolverDnHandle_t handle,
int n,
const double *Acopy,
int lda,
const double *b,
double *x)
{
int bufferSize = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
int *ipiv = NULL; // pivoting sequence
int h_info = 0;
double start, stop;
double time_solve;
checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n,(double*)Acopy, lda, &bufferSize));
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));
checkCudaErrors(cudaMalloc(&ipiv, sizeof(int)*n));
// prepare a copy of A because getrf will overwrite A with L
checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
start = second();
checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info));
checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if ( 0 != h_info ){
fprintf(stderr, "Error: LU factorization failed\n");
printf("INFO_VALUE = %d\n",h_info);
}
checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();
time_solve = stop - start;
fprintf (stdout, "timing: LU = %10.6f sec\n", time_solve);
if (info ) { checkCudaErrors(cudaFree(info )); }
if (buffer) { checkCudaErrors(cudaFree(buffer)); }
if (A ) { checkCudaErrors(cudaFree(A)); }
if (ipiv ) { checkCudaErrors(cudaFree(ipiv));}
}
//int main (int argc, char *argv[]) !!!
int main(void)
{
cusolverDnHandle_t handle = NULL;
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
cudaStream_t stream = NULL;
int rowsA = 3; // number of rows of A
int colsA = 3; // number of columns of A
int lda = MAX(colsA, rowsA); // leading dimension in dense matrix
double *h_A = NULL; // dense matrix
double *h_x = NULL; // a copy of d_x
double *h_b = NULL; // b = ones(m,1)
double *h_r = NULL; // r = b - A*x, a copy of d_r
double *d_A = NULL; // a copy of h_A
double *d_x = NULL; // x = A \ b
double *d_b = NULL; // a copy of h_b
double *d_r = NULL; // r = b - A*x
// the constants are used in residual evaluation, r = b - A*x
const double minus_one = -1.0;
const double one = 1.0;
double x_inf = 0.0;
double r_inf = 0.0;
double A_inf = 0.0;
int errors = 0;
int i, j, col, row, k;
h_A = (double*)malloc(sizeof(double)*lda*colsA);
h_x = (double*)malloc(sizeof(double)*colsA);
h_b = (double*)malloc(sizeof(double)*rowsA);
h_r = (double*)malloc(sizeof(double)*rowsA);
assert(NULL != h_A);
assert(NULL != h_x);
assert(NULL != h_b);
assert(NULL != h_r);
memset(h_A, 0., sizeof(double)*lda*colsA);
printf("step 1: set matrix (A) and right hand side vector (b)\n");
double mat[9] = {1.,0.,0.,0.,1.,0.,0.,0.,1.};
double bb[3] = {1., 1., 1.}; //RANDOM MATRICES 4 TESTING
for( row =0; row < rowsA ; row++ )
{
for( col = 0; col < colsA ; col++ )
{
h_A[row*lda + col] = mat[row];
}
}
for( k = 0; k < rowsA; k++ ){
h_b[k] = bb[k];
}
checkCudaErrors(cusolverDnCreate(&handle));
checkCudaErrors(cublasCreate(&cublasHandle));
checkCudaErrors(cudaStreamCreate(&stream));
checkCudaErrors(cusolverDnSetStream(handle, stream));
checkCudaErrors(cublasSetStream(cublasHandle, stream));
checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double)*lda*colsA));
checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double)*colsA));
checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double)*rowsA));
checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*rowsA));
printf("step 2: prepare data on device\n");
checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double)*lda*colsA, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));
printf("step 3: solve A*x = b \n");
linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);
printf("step 4: evaluate residual\n");
checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double)*rowsA, cudaMemcpyDeviceToDevice));
// r = b - A*x
checkCudaErrors(cublasDgemm_v2(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
rowsA,
1,
colsA,
&minus_one,
d_A,
lda,
d_x,
rowsA,
&one,
d_r,
rowsA));
checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double)*colsA, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost));
x_inf = vec_norminf(colsA, h_x);
r_inf = vec_norminf(rowsA, h_r);
A_inf = mat_norminf(rowsA, colsA, h_A, lda);
printf("|b - A*x| = %E \n", r_inf);
printf("|A| = %E \n", A_inf);
printf("|x| = %E \n", x_inf);
printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));
if (handle) { checkCudaErrors(cusolverDnDestroy(handle)); }
if (cublasHandle) { checkCudaErrors(cublasDestroy(cublasHandle)); }
if (stream) { checkCudaErrors(cudaStreamDestroy(stream)); }
if (h_A) { free(h_A); }
if (h_x) { free(h_x); }
if (h_b) { free(h_b); }
if (h_r) { free(h_r); }
if (d_A) { checkCudaErrors(cudaFree(d_A)); }
if (d_x) { checkCudaErrors(cudaFree(d_x)); }
if (d_b) { checkCudaErrors(cudaFree(d_b)); }
if (d_r) { checkCudaErrors(cudaFree(d_r)); }
cudaDeviceReset();
return 0;
}
就是这样。我知道这是很多东西;任何帮助,将不胜感激。
我真的不明白我对这些矩阵做错了什么。
如果我应该添加一些其他相关信息,请告诉我。
NB 在原始代码中请求扩展名为 .mtx 的外部稀疏矩阵,然后在 linearSolverLU 函数中将其转换为密集矩阵。我删除了那些东西,现在我直接尝试用密集矩阵来求解线性系统。
您提交给 cuSolver 进行因式分解的矩阵无效。该库正确地报告了矩阵条目中的错误。罪魁祸首是这段代码:
for( row =0; row < rowsA ; row++ )
{
for( col = 0; col < colsA ; col++ )
{
h_A[row*lda + col] = mat[row];
}
}
这将生成包含 { 1, 1, 1, 0, 0, 0, 0, 0, 0 }
的 h_A
,它是单数的,并且(我假设)不是您的意图。
好的。我正在用从 cuSolver 示例中获取的一些代码弄脏我的手。 我对 C++ 经验不足,但我设法从原始代码中获取了我需要的东西。
问题是当我尝试执行它时;按照参考手册的建议,我编译时使用:
nvcc -c att3_cus_lu.cpp -I/usr/local/cuda-8.0/targets/x86_64-linux/include
g++ -fopenmp -o res.out att3_cus_lu.o -L/usr/local/cuda/lib64 -lcudart -lcublas -lcusolver -lcusparse
到这里没问题;我得到的输出总是一样的:
step 1: set matrix (A) and right hand side vector (b)
step 2: prepare data on device
step 3: solve A*x = b
Error: LU factorization failed
INFO_VALUE = 2
timing: LU = 0.000208 sec
step 4: evaluate residual
|b - A*x| = NAN
|A| = 1.000000E+00
|x| = NAN
|b - A*x|/(|A|*|x|) = NAN
无论我输入什么矩阵或b向量,结果都是一样的;代码无法分解矩阵。
我已经展示了 INFO_VALUE 每次执行时其值始终为 2;它是为 cuSolverDnDgetrf() 函数请求的 info
变量的值。在 cuSolver 参考手册中它说:
This function computes the LU factorization of a m×n matrix PA = LU where A is a m×n matrix, P is a permutation matrix, L is a lower triangular matrix with unit diagonal, and U is an upper triangular matrix. If LU factorization failed, i.e. matrix A(U) is singular, The output parameter devInfo=i indicates U(i,i) = 0.
在下面的代码中,我放置了相同的矩阵,因此 运行 周围没有奇异矩阵。
这是完整的代码; main() cuda 代码的模式是重复的:定义主机变量,cudaMemcpy 它们到设备,使用 cuda 函数在设备上执行它们,cudaMemcpy 它们回到主机,然后继续串行代码直到你重复。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"
#include "helper_cuda.h"
#include "helper_cusolver.h"
#ifndef MAX
#define MAX(a,b) (a > b) ? a : b
#endif
void linearSolverLU(
cusolverDnHandle_t handle,
int n,
const double *Acopy,
int lda,
const double *b,
double *x)
{
int bufferSize = 0;
int *info = NULL;
double *buffer = NULL;
double *A = NULL;
int *ipiv = NULL; // pivoting sequence
int h_info = 0;
double start, stop;
double time_solve;
checkCudaErrors(cusolverDnDgetrf_bufferSize(handle, n, n,(double*)Acopy, lda, &bufferSize));
checkCudaErrors(cudaMalloc(&info, sizeof(int)));
checkCudaErrors(cudaMalloc(&buffer, sizeof(double)*bufferSize));
checkCudaErrors(cudaMalloc(&A, sizeof(double)*lda*n));
checkCudaErrors(cudaMalloc(&ipiv, sizeof(int)*n));
// prepare a copy of A because getrf will overwrite A with L
checkCudaErrors(cudaMemcpy(A, Acopy, sizeof(double)*lda*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemset(info, 0, sizeof(int)));
start = second();
checkCudaErrors(cusolverDnDgetrf(handle, n, n, A, lda, buffer, ipiv, info));
checkCudaErrors(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if ( 0 != h_info ){
fprintf(stderr, "Error: LU factorization failed\n");
printf("INFO_VALUE = %d\n",h_info);
}
checkCudaErrors(cudaMemcpy(x, b, sizeof(double)*n, cudaMemcpyDeviceToDevice));
checkCudaErrors(cusolverDnDgetrs(handle, CUBLAS_OP_N, n, 1, A, lda, ipiv, x, n, info));
checkCudaErrors(cudaDeviceSynchronize());
stop = second();
time_solve = stop - start;
fprintf (stdout, "timing: LU = %10.6f sec\n", time_solve);
if (info ) { checkCudaErrors(cudaFree(info )); }
if (buffer) { checkCudaErrors(cudaFree(buffer)); }
if (A ) { checkCudaErrors(cudaFree(A)); }
if (ipiv ) { checkCudaErrors(cudaFree(ipiv));}
}
//int main (int argc, char *argv[]) !!!
int main(void)
{
cusolverDnHandle_t handle = NULL;
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
cudaStream_t stream = NULL;
int rowsA = 3; // number of rows of A
int colsA = 3; // number of columns of A
int lda = MAX(colsA, rowsA); // leading dimension in dense matrix
double *h_A = NULL; // dense matrix
double *h_x = NULL; // a copy of d_x
double *h_b = NULL; // b = ones(m,1)
double *h_r = NULL; // r = b - A*x, a copy of d_r
double *d_A = NULL; // a copy of h_A
double *d_x = NULL; // x = A \ b
double *d_b = NULL; // a copy of h_b
double *d_r = NULL; // r = b - A*x
// the constants are used in residual evaluation, r = b - A*x
const double minus_one = -1.0;
const double one = 1.0;
double x_inf = 0.0;
double r_inf = 0.0;
double A_inf = 0.0;
int errors = 0;
int i, j, col, row, k;
h_A = (double*)malloc(sizeof(double)*lda*colsA);
h_x = (double*)malloc(sizeof(double)*colsA);
h_b = (double*)malloc(sizeof(double)*rowsA);
h_r = (double*)malloc(sizeof(double)*rowsA);
assert(NULL != h_A);
assert(NULL != h_x);
assert(NULL != h_b);
assert(NULL != h_r);
memset(h_A, 0., sizeof(double)*lda*colsA);
printf("step 1: set matrix (A) and right hand side vector (b)\n");
double mat[9] = {1.,0.,0.,0.,1.,0.,0.,0.,1.};
double bb[3] = {1., 1., 1.}; //RANDOM MATRICES 4 TESTING
for( row =0; row < rowsA ; row++ )
{
for( col = 0; col < colsA ; col++ )
{
h_A[row*lda + col] = mat[row];
}
}
for( k = 0; k < rowsA; k++ ){
h_b[k] = bb[k];
}
checkCudaErrors(cusolverDnCreate(&handle));
checkCudaErrors(cublasCreate(&cublasHandle));
checkCudaErrors(cudaStreamCreate(&stream));
checkCudaErrors(cusolverDnSetStream(handle, stream));
checkCudaErrors(cublasSetStream(cublasHandle, stream));
checkCudaErrors(cudaMalloc((void **)&d_A, sizeof(double)*lda*colsA));
checkCudaErrors(cudaMalloc((void **)&d_x, sizeof(double)*colsA));
checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(double)*rowsA));
checkCudaErrors(cudaMalloc((void **)&d_r, sizeof(double)*rowsA));
printf("step 2: prepare data on device\n");
checkCudaErrors(cudaMemcpy(d_A, h_A, sizeof(double)*lda*colsA, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_b, h_b, sizeof(double)*rowsA, cudaMemcpyHostToDevice));
printf("step 3: solve A*x = b \n");
linearSolverLU(handle, rowsA, d_A, lda, d_b, d_x);
printf("step 4: evaluate residual\n");
checkCudaErrors(cudaMemcpy(d_r, d_b, sizeof(double)*rowsA, cudaMemcpyDeviceToDevice));
// r = b - A*x
checkCudaErrors(cublasDgemm_v2(
cublasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
rowsA,
1,
colsA,
&minus_one,
d_A,
lda,
d_x,
rowsA,
&one,
d_r,
rowsA));
checkCudaErrors(cudaMemcpy(h_x, d_x, sizeof(double)*colsA, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_r, d_r, sizeof(double)*rowsA, cudaMemcpyDeviceToHost));
x_inf = vec_norminf(colsA, h_x);
r_inf = vec_norminf(rowsA, h_r);
A_inf = mat_norminf(rowsA, colsA, h_A, lda);
printf("|b - A*x| = %E \n", r_inf);
printf("|A| = %E \n", A_inf);
printf("|x| = %E \n", x_inf);
printf("|b - A*x|/(|A|*|x|) = %E \n", r_inf/(A_inf * x_inf));
if (handle) { checkCudaErrors(cusolverDnDestroy(handle)); }
if (cublasHandle) { checkCudaErrors(cublasDestroy(cublasHandle)); }
if (stream) { checkCudaErrors(cudaStreamDestroy(stream)); }
if (h_A) { free(h_A); }
if (h_x) { free(h_x); }
if (h_b) { free(h_b); }
if (h_r) { free(h_r); }
if (d_A) { checkCudaErrors(cudaFree(d_A)); }
if (d_x) { checkCudaErrors(cudaFree(d_x)); }
if (d_b) { checkCudaErrors(cudaFree(d_b)); }
if (d_r) { checkCudaErrors(cudaFree(d_r)); }
cudaDeviceReset();
return 0;
}
就是这样。我知道这是很多东西;任何帮助,将不胜感激。 我真的不明白我对这些矩阵做错了什么。
如果我应该添加一些其他相关信息,请告诉我。
NB 在原始代码中请求扩展名为 .mtx 的外部稀疏矩阵,然后在 linearSolverLU 函数中将其转换为密集矩阵。我删除了那些东西,现在我直接尝试用密集矩阵来求解线性系统。
您提交给 cuSolver 进行因式分解的矩阵无效。该库正确地报告了矩阵条目中的错误。罪魁祸首是这段代码:
for( row =0; row < rowsA ; row++ )
{
for( col = 0; col < colsA ; col++ )
{
h_A[row*lda + col] = mat[row];
}
}
这将生成包含 { 1, 1, 1, 0, 0, 0, 0, 0, 0 }
的 h_A
,它是单数的,并且(我假设)不是您的意图。