测试 CUDA 11 cusolverDnDSgels()
Testing CUDA 11 cusolverDnDSgels()
我正在尝试理解 cusolverDnDSgels 函数。用文档中的简单 3x3 示例运行时,它工作正常;但用我自己的数据运行时,d_info 返回 -1。按照文档的说法,如果 d_info = -i,则表示第 i 个参数无效。
下面我贴出了分别使用 3x3 和 4x3 矩阵的代码,其中前者可以正常工作,而后者不行。
作为参考我使用了这个网站计算器https://adrianstoll.com/linear-algebra/least-squares.html
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
// Prints every element of the m x n matrix A, one per line, as
// "<name>(row,col) = value" with 1-based indices.
// Element (row, col) is read from A[row + col * lda].
void printMatrix(int m, int n, const double* A, int lda, const char* name)
{
    for (int i = 0; i < m; ++i) {
        // Start of row i; consecutive columns are lda elements apart.
        const double* rowStart = A + i;
        for (int j = 0; j < n; ++j) {
            printf("%s(%d,%d) = %f\n", name, i + 1, j + 1, rowStart[j * lda]);
        }
    }
}
// Solves A * X = B on the GPU with cusolverDnDSgels (double-precision data,
// single-precision internal factorization with iterative refinement) and
// prints info_gpu and the solution X.
// A is m x n with leading dimension lda; element (i, j) is A[i + j * lda].
int main(int argc, char*argv[])
{
    // 3x3 example works fine
    int m = 3;
    int n = 3;
    double A[9] = { 1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0 };
    double B[3] = { 6.0, 15.0, 4.0 };
    // 4x3 example d_info/info_gpu returns -1
    //int m = 4;
    //int n = 3;
    //double A[12] = { 1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0, 5.0, 1.0, 2.0 };
    //double B[4] = { 6.0, 15.0, 4.0, 5.0 };
    double X[3];
    int lda = m;
    int ldb = m;
    // NOTE(review): the device-side leading dimension of X is chosen as
    // max(m, n) rather than n as a conservative choice for m > n; confirm
    // the exact requirement against the cuSOLVER release in use.
    int ldx = (m > n) ? m : n;
    int nrhs = 1;
    int niter = 0;
    int info_gpu = 0;
    size_t lwork = 0;   // workspace size in bytes, filled by *_bufferSize
    double *d_A = NULL;
    double *d_B = NULL;
    double *d_X = NULL;
    double *d_work = NULL;
    int* d_info = NULL;
    cusolverDnHandle_t cusolverH = NULL;
    cudaError_t cudaStat = cudaSuccess;
    cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;

    cusolver_status = cusolverDnCreate(&cusolverH);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Allocate space in the GPU; each matrix is sized by its leading dimension.
    cudaStat = cudaMalloc((void**)&d_A, sizeof(double) * lda * n);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMalloc((void**)&d_B, sizeof(double) * ldb * nrhs);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMalloc((void**)&d_X, sizeof(double) * ldx * nrhs);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMalloc((void**)&d_info, sizeof(int));
    assert(cudaSuccess == cudaStat);

    // Copy matrices into GPU space
    cudaStat = cudaMemcpy(d_A, A, sizeof(double) * m * n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMemcpy(d_B, B, sizeof(double) * m * nrhs, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat);

    // Get work buffer size
    cusolver_status = cusolverDnDSgels_bufferSize(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, &lwork);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Allocate workspace. lwork is already a byte count, so it must not be
    // scaled by an element size.
    cudaStat = cudaMalloc((void**)&d_work, lwork);
    assert(cudaSuccess == cudaStat);

    // Run solver
    cusolver_status = cusolverDnDSgels(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, lwork, &niter, d_info);

    // Wait for the solver to finish before reading anything back
    cudaStat = cudaDeviceSynchronize();
    assert(cudaSuccess == cudaStat);

    // Copy GPU info and report it even if the solver failed, since a
    // negative value identifies the offending argument.
    cudaStat = cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat);
    printf("after DSgels: info_gpu = %d\n", info_gpu);

    // Only trust d_X once the solver call itself reported success.
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Get solved data. With nrhs == 1 the single solution column is
    // contiguous, so a flat copy of n values is sufficient.
    cudaStat = cudaMemcpy(X, d_X, sizeof(double) * n * nrhs, cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat);

    // The host copy of X is packed, so its leading dimension is n.
    printMatrix(n, nrhs, X, n, "X");

    if (d_A) cudaFree(d_A);
    if (d_B) cudaFree(d_B);
    if (d_X) cudaFree(d_X);
    if (d_info) cudaFree(d_info);
    if (d_work) cudaFree(d_work);
    if (cusolverH) cusolverDnDestroy(cusolverH);
    cudaDeviceReset();
    return 0;
}
不幸的是,这个问题是由 cuSolver 内部设置不一致造成的。
可以通过调用专家级 API "cusolverDnIRSXgels" 和 "cusolverDnIRSXgels_bufferSize" 来避免此类问题,这两个接口为用户提供了更多的控制权。
因此,请将您代码中的以下部分:
// Fragment: the plain gels call sequence (shown here with the DDgels variant).
// Query the required workspace size; the result is written to lwork.
cusolver_status = cusolverDnDDgels_bufferSize(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, &lwork);
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
// Allocate workspace (lwork is passed to cudaMalloc directly as a byte count)
cudaStat = cudaMalloc((void**)&d_work, lwork);
assert(cudaSuccess == cudaStat);
// Run solver; niter and d_info receive the iteration count and status info
cusolver_status = cusolverDnDDgels(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, lwork, &niter, d_info);
// Print the raw cusolverStatus_t value returned by the solver call
printf("gels status: %d\n", int(cusolver_status));
替换为:
// Fragment: the same solve through the expert (IRS) interface.
// Create the params and infos structures for the expert interface
cusolverDnIRSParams_t gels_irs_params;
cusolverDnIRSParamsCreate( &gels_irs_params );
cusolverDnIRSInfos_t gels_irs_infos;
cusolverDnIRSInfosCreate( &gels_irs_infos );
// Set the main and the low precision of the solver to match DSgels:
// D is for double, S for single precision, thus
// main precision is CUSOLVER_R_64F, low precision is CUSOLVER_R_32F
cusolverDnIRSParamsSetSolverPrecisions( gels_irs_params, CUSOLVER_R_64F, CUSOLVER_R_32F );
// Set the refinement solver.
cusolverDnIRSParamsSetRefinementSolver( gels_irs_params, CUSOLVER_IRS_REFINE_CLASSICAL );
// Get work buffer size (only the params and problem dimensions are needed here)
cusolver_status = cusolverDnIRSXgels_bufferSize(cusolverH, gels_irs_params, m, n, nrhs, &lwork);
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
// Allocate workspace (lwork is passed to cudaMalloc directly as a byte count)
cudaStat = cudaMalloc((void**)&d_work, lwork);
assert(cudaSuccess == cudaStat);
// Run solver; the matrix pointers are passed as untyped void pointers
cusolver_status = cusolverDnIRSXgels(cusolverH, gels_irs_params, gels_irs_infos, m, n, nrhs, (void *)d_A, lda, (void *)d_B, ldb, (void *)d_X, ldx, d_work, lwork, &niter, d_info);
// Print the raw cusolverStatus_t value returned by the solver call
printf("gels status: %d\n", int(cusolver_status));
另请注意,当 m > n 时方程组是超定的,此时不能随意选取右端项 RHS 再去求解 SOL;更好的做法是先选定 SOL,计算 RHS = A*SOL,然后用该 RHS 求解,并将求得的结果与 SOL 进行比较。
还要注意,LDX 应满足 LDX >= max(m, n)。
我通过以下方式修改了您的代码:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#define USE_BUG
typedef double mt;
#ifndef max
#define max(a, b) ((a) > (b) ? (a) : (b))
#endif
// Computes B = A * X on the host.
//   A : m x n,    element (i, j) at A[i + j * lda]  (read only)
//   X : n x nrhs, element (i, r) at X[i + r * ldx]  (read only)
//   B : m x nrhs, element (i, r) at B[i + r * ldb]  (overwritten)
// Does nothing when m or nrhs is not positive.
void matvec(int m, int n, int nrhs, const mt* A, int lda, mt *X, int ldx, mt *B, int ldb)
{
    for (int row = 0; row < m; row++) {
        for (int r = 0; r < nrhs; r++) {
            // A scalar accumulator replaces the former variable-length array
            // (not standard C++); columns are still summed in ascending
            // order, so the floating-point result is unchanged.
            mt acc = 0.0;
            for (int col = 0; col < n; col++) {
                acc += A[row + col * lda] * X[col + r * ldx];
            }
            B[row + r * ldb] = acc;
        }
    }
}
// Returns the largest absolute element-wise difference between the reference
// solution and the computed one.
//   ref : n x nrhs, element (i, r) at ref[i + r * ldr]
//   X   : n x nrhs, element (i, r) at X[i + r * ldx]
// Returns 0.0 when n or nrhs is not positive.
mt check_solution(int n, int nrhs, mt *ref, int ldr, mt *X, int ldx)
{
    mt error = 0.0;
    for (int r = 0; r < nrhs; r++) {
        for (int i = 0; i < n; i++) {
            // X must be indexed with its own leading dimension (ldx), not ldr.
            mt diff = ref[i + r * ldr] - X[i + r * ldx];
            // Explicit sign flip instead of abs(): with only <stdlib.h> in
            // scope abs() may resolve to the int overload and truncate.
            if (diff < 0.0) diff = -diff;
            if (diff > error) error = diff;
        }
    }
    return error;
}
// Prints every element of the m x n matrix A, one per line, as
// "<name>(row,col) = value" with 1-based indices.
// Element (row, col) is read from A[row + col * lda].
void printMatrix(int m, int n, const mt* A, int lda, const char* name)
{
    for (int i = 0; i < m; ++i) {
        // Start of row i; consecutive columns are lda elements apart.
        const mt* rowStart = A + i;
        for (int j = 0; j < n; ++j) {
            printf("%s(%d,%d) = %f\n", name, i + 1, j + 1, rowStart[j * lda]);
        }
    }
}
// Builds a right-hand side B = A * sol from a known solution, solves
// A * X = B on the GPU (expert IRS gels interface by default, plain DDgels
// in the disabled branch) and reports info, iteration count, the maximum
// absolute error against sol, and the computed X.
int main(int argc, char*argv[])
{
#ifndef USE_BUG
    // 3x3 example works fine
    const int m = 3;
    const int n = 3;
    mt A[m*n] = { 1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0 };
    mt sol[n] = { 6.0, 15.0, 4.0 };
#else
    // 4x3 example d_info/info_gpu returns -1
    const int m = 4;
    const int n = 3;
    mt A[m*n] = { 1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0, 5.0, 1.0, 2.0 };
    mt sol[n] = { 6.0, 15.0, 4.0 };
#endif
    // Host-side X, B and sol are packed: their leading dimensions are n, m, n.
    mt X[n];
    mt B[m];
    // Device-side leading dimensions handed to the solver.
    int lda = m;
    int ldb = max(m,n);
    int ldx = max(m,n);
    int nrhs = 1;
    int niter = 0;
    int info_gpu = 0;
    size_t lwork = 0;   // workspace size in bytes
    mt *d_A = NULL;
    mt *d_B = NULL;
    mt *d_X = NULL;
    mt *d_work = NULL;
    int* d_info = NULL;
    // Declared up front and NULL-initialised so cleanup can release them
    // regardless of which solver branch is compiled in.
    cusolverDnIRSParams_t gels_irs_params = NULL;
    cusolverDnIRSInfos_t gels_irs_infos = NULL;

    // compute B = A*sol using the packed host leading dimensions
    matvec(m, n, nrhs, A, lda, sol, n, B, m);

    cusolverDnHandle_t cusolverH = NULL;
    cudaError_t cudaStat = cudaSuccess;
    cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;
    cusolver_status = cusolverDnCreate(&cusolverH);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Allocate space in the GPU. Each buffer is sized by the leading
    // dimension passed to the solver, not by the logical row count.
    cudaStat = cudaMalloc((void**)&d_A, sizeof(mt) * lda * n);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMalloc((void**)&d_B, sizeof(mt) * ldb * nrhs);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMalloc((void**)&d_X, sizeof(mt) * ldx * nrhs);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMalloc((void**)&d_info, sizeof(int));
    assert(cudaSuccess == cudaStat);

    // Copy matrices into GPU space. With nrhs == 1 the single column of B
    // is contiguous, so a flat copy of m values is sufficient.
    cudaStat = cudaMemcpy(d_A, A, sizeof(mt) * m * n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMemcpy(d_B, B, sizeof(mt) * m * nrhs, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat);

#if 1
    // =======================================================
    // create the params and infos structures for the expert interface
    cusolver_status = cusolverDnIRSParamsCreate( &gels_irs_params );
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
    cusolver_status = cusolverDnIRSInfosCreate( &gels_irs_infos );
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Set the main and the low precision of the solver to match DSgels:
    // D is for double, S for single precision, thus
    // main precision is CUSOLVER_R_64F, low precision is CUSOLVER_R_32F
    cusolver_status = cusolverDnIRSParamsSetSolverPrecisions( gels_irs_params, CUSOLVER_R_64F, CUSOLVER_R_32F );
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Set the refinement solver.
    cusolver_status = cusolverDnIRSParamsSetRefinementSolver( gels_irs_params, CUSOLVER_IRS_REFINE_CLASSICAL );
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Get work buffer size
    cusolver_status = cusolverDnIRSXgels_bufferSize(cusolverH, gels_irs_params, m, n, nrhs, &lwork);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Allocate workspace (lwork is a byte count)
    cudaStat = cudaMalloc((void**)&d_work, lwork);
    assert(cudaSuccess == cudaStat);

    // Run solver
    cusolver_status = cusolverDnIRSXgels(cusolverH, gels_irs_params, gels_irs_infos, m, n, nrhs, (void *)d_A, lda, (void *)d_B, ldb, (void *)d_X, ldx, d_work, lwork, &niter, d_info);
    printf("gels status: %d\n", int(cusolver_status));
#else
    // Get work buffer size
    cusolver_status = cusolverDnDDgels_bufferSize(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, &lwork);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Allocate workspace (lwork is a byte count)
    cudaStat = cudaMalloc((void**)&d_work, lwork);
    assert(cudaSuccess == cudaStat);

    // Run solver
    cusolver_status = cusolverDnDDgels(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, lwork, &niter, d_info);
    printf("gels status: %d\n", int(cusolver_status));
#endif

    // Wait for the solver to finish before reading anything back
    cudaStat = cudaDeviceSynchronize();
    assert(cudaSuccess == cudaStat);

    // Copy GPU info
    cudaStat = cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat);

    // Get solved data. With nrhs == 1 the solution column is contiguous.
    cudaStat = cudaMemcpy(X, d_X, sizeof(mt) * n * nrhs, cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat);

    printf("after gels: info_gpu = %d\n", info_gpu);
    printf("after gels: niter = %d\n", niter);
    // sol and the host copy of X are packed, so both use leading dimension n.
    printf("after gels: error = %e\n", check_solution(n, nrhs, sol, n, X, n));
    // Print all n solution rows instead of a hard-coded 3.
    printMatrix(n, nrhs, X, n, "X");

    if (d_A) cudaFree(d_A);
    if (d_B) cudaFree(d_B);
    if (d_X) cudaFree(d_X);
    if (d_info) cudaFree(d_info);
    if (d_work) cudaFree(d_work);
    // Release the expert-interface handles, which were previously leaked.
    if (gels_irs_infos) cusolverDnIRSInfosDestroy(gels_irs_infos);
    if (gels_irs_params) cusolverDnIRSParamsDestroy(gels_irs_params);
    if (cusolverH) cusolverDnDestroy(cusolverH);
    cudaDeviceReset();
    return 0;
}
使用以下命令编译:nvcc -o test test.cu -lcusolver
我正在尝试理解 cusolverDnDSgels 函数。用文档中的简单 3x3 示例运行时,它工作正常;但用我自己的数据运行时,d_info 返回 -1。按照文档的说法,如果 d_info = -i,则表示第 i 个参数无效。
下面我贴出了分别使用 3x3 和 4x3 矩阵的代码,其中前者可以正常工作,而后者不行。
作为参考我使用了这个网站计算器https://adrianstoll.com/linear-algebra/least-squares.html
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
// Prints every element of the m x n matrix A, one per line, as
// "<name>(row,col) = value" with 1-based indices.
// Element (row, col) is read from A[row + col * lda].
void printMatrix(int m, int n, const double* A, int lda, const char* name)
{
    for (int i = 0; i < m; ++i) {
        // Start of row i; consecutive columns are lda elements apart.
        const double* rowStart = A + i;
        for (int j = 0; j < n; ++j) {
            printf("%s(%d,%d) = %f\n", name, i + 1, j + 1, rowStart[j * lda]);
        }
    }
}
// Solves A * X = B on the GPU with cusolverDnDSgels (double-precision data,
// single-precision internal factorization with iterative refinement) and
// prints info_gpu and the solution X.
// A is m x n with leading dimension lda; element (i, j) is A[i + j * lda].
int main(int argc, char*argv[])
{
    // 3x3 example works fine
    int m = 3;
    int n = 3;
    double A[9] = { 1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0 };
    double B[3] = { 6.0, 15.0, 4.0 };
    // 4x3 example d_info/info_gpu returns -1
    //int m = 4;
    //int n = 3;
    //double A[12] = { 1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0, 5.0, 1.0, 2.0 };
    //double B[4] = { 6.0, 15.0, 4.0, 5.0 };
    double X[3];
    int lda = m;
    int ldb = m;
    // NOTE(review): the device-side leading dimension of X is chosen as
    // max(m, n) rather than n as a conservative choice for m > n; confirm
    // the exact requirement against the cuSOLVER release in use.
    int ldx = (m > n) ? m : n;
    int nrhs = 1;
    int niter = 0;
    int info_gpu = 0;
    size_t lwork = 0;   // workspace size in bytes, filled by *_bufferSize
    double *d_A = NULL;
    double *d_B = NULL;
    double *d_X = NULL;
    double *d_work = NULL;
    int* d_info = NULL;
    cusolverDnHandle_t cusolverH = NULL;
    cudaError_t cudaStat = cudaSuccess;
    cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;

    cusolver_status = cusolverDnCreate(&cusolverH);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Allocate space in the GPU; each matrix is sized by its leading dimension.
    cudaStat = cudaMalloc((void**)&d_A, sizeof(double) * lda * n);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMalloc((void**)&d_B, sizeof(double) * ldb * nrhs);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMalloc((void**)&d_X, sizeof(double) * ldx * nrhs);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMalloc((void**)&d_info, sizeof(int));
    assert(cudaSuccess == cudaStat);

    // Copy matrices into GPU space
    cudaStat = cudaMemcpy(d_A, A, sizeof(double) * m * n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMemcpy(d_B, B, sizeof(double) * m * nrhs, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat);

    // Get work buffer size
    cusolver_status = cusolverDnDSgels_bufferSize(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, &lwork);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Allocate workspace. lwork is already a byte count, so it must not be
    // scaled by an element size.
    cudaStat = cudaMalloc((void**)&d_work, lwork);
    assert(cudaSuccess == cudaStat);

    // Run solver
    cusolver_status = cusolverDnDSgels(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, lwork, &niter, d_info);

    // Wait for the solver to finish before reading anything back
    cudaStat = cudaDeviceSynchronize();
    assert(cudaSuccess == cudaStat);

    // Copy GPU info and report it even if the solver failed, since a
    // negative value identifies the offending argument.
    cudaStat = cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat);
    printf("after DSgels: info_gpu = %d\n", info_gpu);

    // Only trust d_X once the solver call itself reported success.
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Get solved data. With nrhs == 1 the single solution column is
    // contiguous, so a flat copy of n values is sufficient.
    cudaStat = cudaMemcpy(X, d_X, sizeof(double) * n * nrhs, cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat);

    // The host copy of X is packed, so its leading dimension is n.
    printMatrix(n, nrhs, X, n, "X");

    if (d_A) cudaFree(d_A);
    if (d_B) cudaFree(d_B);
    if (d_X) cudaFree(d_X);
    if (d_info) cudaFree(d_info);
    if (d_work) cudaFree(d_work);
    if (cusolverH) cusolverDnDestroy(cusolverH);
    cudaDeviceReset();
    return 0;
}
不幸的是,这个问题是由 cuSolver 内部设置不一致造成的。可以通过调用专家级 API "cusolverDnIRSXgels" 和 "cusolverDnIRSXgels_bufferSize" 来避免此类问题,这两个接口为用户提供了更多的控制权。
因此,请将您代码中的以下部分:
// Fragment: the plain gels call sequence (shown here with the DDgels variant).
// Query the required workspace size; the result is written to lwork.
cusolver_status = cusolverDnDDgels_bufferSize(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, &lwork);
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
// Allocate workspace (lwork is passed to cudaMalloc directly as a byte count)
cudaStat = cudaMalloc((void**)&d_work, lwork);
assert(cudaSuccess == cudaStat);
// Run solver; niter and d_info receive the iteration count and status info
cusolver_status = cusolverDnDDgels(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, lwork, &niter, d_info);
// Print the raw cusolverStatus_t value returned by the solver call
printf("gels status: %d\n", int(cusolver_status));
替换为:
// Fragment: the same solve through the expert (IRS) interface.
// Create the params and infos structures for the expert interface
cusolverDnIRSParams_t gels_irs_params;
cusolverDnIRSParamsCreate( &gels_irs_params );
cusolverDnIRSInfos_t gels_irs_infos;
cusolverDnIRSInfosCreate( &gels_irs_infos );
// Set the main and the low precision of the solver to match DSgels:
// D is for double, S for single precision, thus
// main precision is CUSOLVER_R_64F, low precision is CUSOLVER_R_32F
cusolverDnIRSParamsSetSolverPrecisions( gels_irs_params, CUSOLVER_R_64F, CUSOLVER_R_32F );
// Set the refinement solver.
cusolverDnIRSParamsSetRefinementSolver( gels_irs_params, CUSOLVER_IRS_REFINE_CLASSICAL );
// Get work buffer size (only the params and problem dimensions are needed here)
cusolver_status = cusolverDnIRSXgels_bufferSize(cusolverH, gels_irs_params, m, n, nrhs, &lwork);
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
// Allocate workspace (lwork is passed to cudaMalloc directly as a byte count)
cudaStat = cudaMalloc((void**)&d_work, lwork);
assert(cudaSuccess == cudaStat);
// Run solver; the matrix pointers are passed as untyped void pointers
cusolver_status = cusolverDnIRSXgels(cusolverH, gels_irs_params, gels_irs_infos, m, n, nrhs, (void *)d_A, lda, (void *)d_B, ldb, (void *)d_X, ldx, d_work, lwork, &niter, d_info);
// Print the raw cusolverStatus_t value returned by the solver call
printf("gels status: %d\n", int(cusolver_status));
另请注意,当 m > n 时方程组是超定的,此时不能随意选取右端项 RHS 再去求解 SOL;更好的做法是先选定 SOL,计算 RHS = A*SOL,然后用该 RHS 求解,并将求得的结果与 SOL 进行比较。
还要注意,LDX 应满足 LDX >= max(m, n)。
我通过以下方式修改了您的代码:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#define USE_BUG
typedef double mt;
#ifndef max
#define max(a, b) ((a) > (b) ? (a) : (b))
#endif
// Computes B = A * X on the host.
//   A : m x n,    element (i, j) at A[i + j * lda]  (read only)
//   X : n x nrhs, element (i, r) at X[i + r * ldx]  (read only)
//   B : m x nrhs, element (i, r) at B[i + r * ldb]  (overwritten)
// Does nothing when m or nrhs is not positive.
void matvec(int m, int n, int nrhs, const mt* A, int lda, mt *X, int ldx, mt *B, int ldb)
{
    for (int row = 0; row < m; row++) {
        for (int r = 0; r < nrhs; r++) {
            // A scalar accumulator replaces the former variable-length array
            // (not standard C++); columns are still summed in ascending
            // order, so the floating-point result is unchanged.
            mt acc = 0.0;
            for (int col = 0; col < n; col++) {
                acc += A[row + col * lda] * X[col + r * ldx];
            }
            B[row + r * ldb] = acc;
        }
    }
}
// Returns the largest absolute element-wise difference between the reference
// solution and the computed one.
//   ref : n x nrhs, element (i, r) at ref[i + r * ldr]
//   X   : n x nrhs, element (i, r) at X[i + r * ldx]
// Returns 0.0 when n or nrhs is not positive.
mt check_solution(int n, int nrhs, mt *ref, int ldr, mt *X, int ldx)
{
    mt error = 0.0;
    for (int r = 0; r < nrhs; r++) {
        for (int i = 0; i < n; i++) {
            // X must be indexed with its own leading dimension (ldx), not ldr.
            mt diff = ref[i + r * ldr] - X[i + r * ldx];
            // Explicit sign flip instead of abs(): with only <stdlib.h> in
            // scope abs() may resolve to the int overload and truncate.
            if (diff < 0.0) diff = -diff;
            if (diff > error) error = diff;
        }
    }
    return error;
}
// Prints every element of the m x n matrix A, one per line, as
// "<name>(row,col) = value" with 1-based indices.
// Element (row, col) is read from A[row + col * lda].
void printMatrix(int m, int n, const mt* A, int lda, const char* name)
{
    for (int i = 0; i < m; ++i) {
        // Start of row i; consecutive columns are lda elements apart.
        const mt* rowStart = A + i;
        for (int j = 0; j < n; ++j) {
            printf("%s(%d,%d) = %f\n", name, i + 1, j + 1, rowStart[j * lda]);
        }
    }
}
// Builds a right-hand side B = A * sol from a known solution, solves
// A * X = B on the GPU (expert IRS gels interface by default, plain DDgels
// in the disabled branch) and reports info, iteration count, the maximum
// absolute error against sol, and the computed X.
int main(int argc, char*argv[])
{
#ifndef USE_BUG
    // 3x3 example works fine
    const int m = 3;
    const int n = 3;
    mt A[m*n] = { 1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0 };
    mt sol[n] = { 6.0, 15.0, 4.0 };
#else
    // 4x3 example d_info/info_gpu returns -1
    const int m = 4;
    const int n = 3;
    mt A[m*n] = { 1.0, 4.0, 2.0, 2.0, 5.0, 1.0, 3.0, 6.0, 1.0, 5.0, 1.0, 2.0 };
    mt sol[n] = { 6.0, 15.0, 4.0 };
#endif
    // Host-side X, B and sol are packed: their leading dimensions are n, m, n.
    mt X[n];
    mt B[m];
    // Device-side leading dimensions handed to the solver.
    int lda = m;
    int ldb = max(m,n);
    int ldx = max(m,n);
    int nrhs = 1;
    int niter = 0;
    int info_gpu = 0;
    size_t lwork = 0;   // workspace size in bytes
    mt *d_A = NULL;
    mt *d_B = NULL;
    mt *d_X = NULL;
    mt *d_work = NULL;
    int* d_info = NULL;
    // Declared up front and NULL-initialised so cleanup can release them
    // regardless of which solver branch is compiled in.
    cusolverDnIRSParams_t gels_irs_params = NULL;
    cusolverDnIRSInfos_t gels_irs_infos = NULL;

    // compute B = A*sol using the packed host leading dimensions
    matvec(m, n, nrhs, A, lda, sol, n, B, m);

    cusolverDnHandle_t cusolverH = NULL;
    cudaError_t cudaStat = cudaSuccess;
    cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;
    cusolver_status = cusolverDnCreate(&cusolverH);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Allocate space in the GPU. Each buffer is sized by the leading
    // dimension passed to the solver, not by the logical row count.
    cudaStat = cudaMalloc((void**)&d_A, sizeof(mt) * lda * n);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMalloc((void**)&d_B, sizeof(mt) * ldb * nrhs);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMalloc((void**)&d_X, sizeof(mt) * ldx * nrhs);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMalloc((void**)&d_info, sizeof(int));
    assert(cudaSuccess == cudaStat);

    // Copy matrices into GPU space. With nrhs == 1 the single column of B
    // is contiguous, so a flat copy of m values is sufficient.
    cudaStat = cudaMemcpy(d_A, A, sizeof(mt) * m * n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMemcpy(d_B, B, sizeof(mt) * m * nrhs, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat);

#if 1
    // =======================================================
    // create the params and infos structures for the expert interface
    cusolver_status = cusolverDnIRSParamsCreate( &gels_irs_params );
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
    cusolver_status = cusolverDnIRSInfosCreate( &gels_irs_infos );
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Set the main and the low precision of the solver to match DSgels:
    // D is for double, S for single precision, thus
    // main precision is CUSOLVER_R_64F, low precision is CUSOLVER_R_32F
    cusolver_status = cusolverDnIRSParamsSetSolverPrecisions( gels_irs_params, CUSOLVER_R_64F, CUSOLVER_R_32F );
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Set the refinement solver.
    cusolver_status = cusolverDnIRSParamsSetRefinementSolver( gels_irs_params, CUSOLVER_IRS_REFINE_CLASSICAL );
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Get work buffer size
    cusolver_status = cusolverDnIRSXgels_bufferSize(cusolverH, gels_irs_params, m, n, nrhs, &lwork);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Allocate workspace (lwork is a byte count)
    cudaStat = cudaMalloc((void**)&d_work, lwork);
    assert(cudaSuccess == cudaStat);

    // Run solver
    cusolver_status = cusolverDnIRSXgels(cusolverH, gels_irs_params, gels_irs_infos, m, n, nrhs, (void *)d_A, lda, (void *)d_B, ldb, (void *)d_X, ldx, d_work, lwork, &niter, d_info);
    printf("gels status: %d\n", int(cusolver_status));
#else
    // Get work buffer size
    cusolver_status = cusolverDnDDgels_bufferSize(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, &lwork);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    // Allocate workspace (lwork is a byte count)
    cudaStat = cudaMalloc((void**)&d_work, lwork);
    assert(cudaSuccess == cudaStat);

    // Run solver
    cusolver_status = cusolverDnDDgels(cusolverH, m, n, nrhs, d_A, lda, d_B, ldb, d_X, ldx, d_work, lwork, &niter, d_info);
    printf("gels status: %d\n", int(cusolver_status));
#endif

    // Wait for the solver to finish before reading anything back
    cudaStat = cudaDeviceSynchronize();
    assert(cudaSuccess == cudaStat);

    // Copy GPU info
    cudaStat = cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat);

    // Get solved data. With nrhs == 1 the solution column is contiguous.
    cudaStat = cudaMemcpy(X, d_X, sizeof(mt) * n * nrhs, cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat);

    printf("after gels: info_gpu = %d\n", info_gpu);
    printf("after gels: niter = %d\n", niter);
    // sol and the host copy of X are packed, so both use leading dimension n.
    printf("after gels: error = %e\n", check_solution(n, nrhs, sol, n, X, n));
    // Print all n solution rows instead of a hard-coded 3.
    printMatrix(n, nrhs, X, n, "X");

    if (d_A) cudaFree(d_A);
    if (d_B) cudaFree(d_B);
    if (d_X) cudaFree(d_X);
    if (d_info) cudaFree(d_info);
    if (d_work) cudaFree(d_work);
    // Release the expert-interface handles, which were previously leaked.
    if (gels_irs_infos) cusolverDnIRSInfosDestroy(gels_irs_infos);
    if (gels_irs_params) cusolverDnIRSParamsDestroy(gels_irs_params);
    if (cusolverH) cusolverDnDestroy(cusolverH);
    cudaDeviceReset();
    return 0;
}
使用以下命令编译:nvcc -o test test.cu -lcusolver