如何格式化CUBLAS例程cublasdtbsv的A矩阵?
How to format the A matrix for CUBLAS routine cublasdtbsv?
我刚开始使用 Cuda 库,想求解对称带状矩阵方程。我找到了使用 LU 分解来解决此问题的示例代码。我现在正在尝试使用 cudaBlas 例程 cublasdtbsv 来求解方程。我无法找到此功能的示例代码并整理了我自己的解决方案。我认为我遇到的问题是我不明白为此例程输入 A 矩阵的正确方法。这是我的示例代码,用于一个非常简单的 3x3 矩阵,其中一个是右手边。它包括使用 LU 分解的正确解决方案和我尝试使用 cublasdtbsv 例程:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
void test();
void printMatrix2(int m, int n, const double* A, int lda, const char* name);
int LUFactorizationSolver2();
int TriBandedSymSolver2();
int main(int argc, char* argv[])
{
test();
return 0;
}
void test()
{
LUFactorizationSolver2();
TriBandedSymSolver2();
return;
}
int TriBandedSymSolver2()
{
printf("\n**** example of cublasDtbsv \n\n");
const int n = 3;
const int ldm = n;
const int k = 1;// n - 1;
const int lda = n;
const int nrhs = 1;
const int incx = 1;
double M[ldm * n] = { 1.0, 0.0, 0.0
, 0.0, 2.0, 3.0
, 0.0, 3.0, 4.0
};
double A[lda * n] = { 1.0, 2.0, 4.0
, 0.0, 3.0, 0.0
, 0.0, 0.0, 0.0
};
double x[n * nrhs] = { 00.0
, 40.0
, 00.0
};
cublasHandle_t cublasHandle = NULL;
cudaStream_t stream = NULL;
cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
double* d_A = NULL; /* device copy of A */
double* d_x = NULL; /* device copy of x */
printf("example of tbsv \n");
printf("A = (matlab base-1)\n");
printMatrix2(n, n, A, lda, "A");
printf("=====\n");
printf("x (b) = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* step 1: create cusolver handle, bind a stream */
cublasStatus = cublasCreate(&cublasHandle);
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
/* step 2: copy A to device */
cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * n);
cudaStat2 = cudaMalloc((void**)&d_x, sizeof(double) * n * nrhs);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * n, cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_x, x, sizeof(double) * n * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
/*
* step 5: solve A*x = b
*
*/
cublasStatus = cublasDtbsv(cublasHandle
, CUBLAS_FILL_MODE_LOWER
, CUBLAS_OP_N
, CUBLAS_DIAG_NON_UNIT
, n
, k
, d_A
, lda
, d_x
, incx
);
cudaStat1 = cudaDeviceSynchronize();
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
cudaStat1 = cudaMemcpy(x, d_x, sizeof(double) * n * nrhs, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("X = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* free resources */
if (d_A) cudaFree(d_A);
if (d_x) cudaFree(d_x);
if (cublasHandle) cublasDestroy(cublasHandle);
if (stream) cudaStreamDestroy(stream);
cudaDeviceReset();
return 0;
}
int LUFactorizationSolver2()
{
printf("\n**** example of cusolverDnDgetrs \n\n");
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
const int m = 3;
const int lda = m;
const int nrhs = 1; // number of right-hand sides
const int ldb = m;
double A[lda * m] = { 1.0, 0.0, 0.0
, 0.0, 2.0, 3.0
, 0.0, 3.0, 4.0
};
double B[m * nrhs] = { 00.0
, 40.0
, 00.0
};
double X[m * nrhs]; /* X = A\B */
double LU[lda * m]; /* L and U */
int Ipiv[m]; /* host copy of pivoting sequence */
int info = 0; /* host copy of error info */
double* d_A = NULL; /* device copy of A */
double* d_B = NULL; /* device copy of B */
int* d_Ipiv = NULL; /* pivoting sequence */
int* d_info = NULL; /* error info */
int lwork = 0; /* size of workspace */
double* d_work = NULL; /* device workspace for getrf */
const int pivot_on = 0; // 1;
if (pivot_on) {
printf("pivot is on : compute P*A = L*U \n");
}
else {
printf("pivot is off: compute A = L*U (not numerically stable)\n");
}
printf("A = (matlab base-1)\n");
printMatrix2(m, m, A, lda, "A");
printf("=====\n");
printf("B = (matlab base-1)\n");
printMatrix2(m, nrhs, B, ldb, "B");
printf("=====\n");
/* step 1: create cusolver handle, bind a stream */
status = cusolverDnCreate(&cusolverH);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
assert(cudaSuccess == cudaStat1);
status = cusolverDnSetStream(cusolverH, stream);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 2: copy A to device */
cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * m);
cudaStat2 = cudaMalloc((void**)&d_B, sizeof(double) * m * nrhs);
cudaStat2 = cudaMalloc((void**)&d_Ipiv, sizeof(int) * m);
cudaStat4 = cudaMalloc((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * m, cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_B, B, sizeof(double) * m * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
/* step 3: query working space of getrf */
status = cusolverDnDgetrf_bufferSize(
cusolverH,
m,
m,
d_A,
lda,
&lwork);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(double) * lwork);
assert(cudaSuccess == cudaStat1);
/* step 4: LU factorization */
if (pivot_on) {
status = cusolverDnDgetrf(
cusolverH,
m,
m,
d_A,
lda,
d_work,
d_Ipiv,
d_info);
}
else {
status = cusolverDnDgetrf(
cusolverH,
m,
m,
d_A,
lda,
d_work,
NULL,
d_info);
}
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
if (pivot_on) {
cudaStat1 = cudaMemcpy(Ipiv, d_Ipiv, sizeof(int) * m, cudaMemcpyDeviceToHost);
}
cudaStat2 = cudaMemcpy(LU, d_A, sizeof(double) * lda * m, cudaMemcpyDeviceToHost);
cudaStat3 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
if (0 > info) {
printf("%d-th parameter is wrong \n", -info);
exit(1);
}
if (pivot_on) {
printf("pivoting sequence, matlab base-1\n");
for (int j = 0; j < m; j++) {
printf("Ipiv(%d) = %d\n", j + 1, Ipiv[j]);
}
}
printf("L and U = (matlab base-1)\n");
printMatrix2(m, m, LU, lda, "LU");
printf("=====\n");
/*
* step 5: solve A*X = B
*
*/
if (pivot_on) {
status = cusolverDnDgetrs(
cusolverH,
CUBLAS_OP_N,
m,
nrhs, /* nrhs */
d_A,
lda,
d_Ipiv,
d_B,
ldb,
d_info);
}
else {
status = cusolverDnDgetrs(
cusolverH,
CUBLAS_OP_N,
m,
nrhs, /* nrhs */
d_A,
lda,
NULL,
d_B,
ldb,
d_info);
}
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(X, d_B, sizeof(double) * m * nrhs, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("X = (matlab base-1)\n");
printMatrix2(m, nrhs, X, ldb, "X");
printf("=====\n");
/* free resources */
if (d_A) cudaFree(d_A);
if (d_B) cudaFree(d_B);
if (d_Ipiv) cudaFree(d_Ipiv);
if (d_info) cudaFree(d_info);
if (d_work) cudaFree(d_work);
if (cusolverH) cusolverDnDestroy(cusolverH);
if (stream) cudaStreamDestroy(stream);
cudaDeviceReset();
return 0;
}
void printMatrix2(int m, int n, const double* A, int lda, const char* name)
{
printf("%18s", "");
for (int col = 0; col < n; col++) { printf("%7s(*,%2d) ", name, col + 1); }
printf("\n");
for (int row = 0; row < m; row++) {
printf("%4s(%2d,*) = ", name, row + 1);
for (int col = 0; col < n; col++) {
double Areg = A[row + col * lda];
printf("%20.9f", Areg);
}
printf("\n");
}
return;
}
正确答案应该是:
0.00
-160.00
120.00
但我得到:
0.000000000
inf
-inf
我正在 Windows 10 使用 Visual Studio 2019 开发这个。
我遗漏了什么或者有人可以为我指出 cublasDtbsv 例程的工作示例吗?
CUBLAS tbsv is a banded triangular solver. It expects your M
matrix to be banded and triangular. If you would like to see what that looks like, this 是一个很好的参考。
您的 M
矩阵是 不是 三角形。三角矩阵的上三角部分(不包括主对角线)或下三角部分(不包括主对角线)全为零。您的 M
矩阵不符合该定义。
带状三角矩阵 M
可能如下所示:
| 2.0 0.0 0.0 |
M = | 1.0 1.0 0.0 |
| 0.0 1.0 1.0 |
让我们以此作为示例,RHS 为 | 2.0 2.0 2.0 |
,并使用为 A
矩阵格式 here 提供的建议。在那种情况下,我们的 A
矩阵看起来像:
A = | 2.0 1.0 1.0 | (the main diagonal of M)
| 1.0 1.0 0.0 | (the first sub-diagonal of M)
在这种情况下,我们的 A
矩阵有 2 行,因此 A
的前导维度由 lda = 2
给出
如果我们将所有这些都放入您的测试框架中,它似乎会给出正确的结果:
$ cat t158.cu
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
void printMatrix2(int m, int n, const double* A, int lda, const char* name)
{
printf("%18s", "");
for (int col = 0; col < n; col++) { printf("%7s(*,%2d) ", name, col + 1); }
printf("\n");
for (int row = 0; row < m; row++) {
printf("%4s(%2d,*) = ", name, row + 1);
for (int col = 0; col < n; col++) {
double Areg = A[row + col * lda];
printf("%20.9f", Areg);
}
printf("\n");
}
return;
}
int main(int argc, char* argv[])
{
printf("\n**** example of cublasDtbsv \n\n");
const int n = 3;
// const int ldm = n;
const int k = 1;
const int lda = k+1;
const int nrhs = 1;
const int incx = 1;
/*
double M[ldm * n] = { 2.0, 0.0, 0.0
, 1.0, 1.0, 0.0
, 0.0, 1.0, 1.0
};
*/
double A[lda * n] = { 2.0, 1.0, 1.0
, 1.0, 1.0, 0.0
};
double x[n * nrhs] = { 2.0
, 2.0
, 2.0
};
cublasHandle_t cublasHandle = NULL;
cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
double* d_A = NULL; /* device copy of A */
double* d_x = NULL; /* device copy of x */
printf("example of tbsv \n");
printf("A = (matlab base-1)\n");
printMatrix2(n, n, A, lda, "A");
printf("=====\n");
printf("x (b) = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* step 1: create cublas handle */
cublasStatus = cublasCreate(&cublasHandle);
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
/* step 2: copy A to device */
cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * n);
cudaStat2 = cudaMalloc((void**)&d_x, sizeof(double) * n * nrhs);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * n, cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_x, x, sizeof(double) * n * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
/*
* step 5: solve A*x = b
*
*/
cublasStatus = cublasDtbsv(cublasHandle
, CUBLAS_FILL_MODE_LOWER
, CUBLAS_OP_N
, CUBLAS_DIAG_NON_UNIT
, n
, k
, d_A
, lda
, d_x
, incx
);
cudaStat1 = cudaDeviceSynchronize();
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
cudaStat1 = cudaMemcpy(x, d_x, sizeof(double) * n * nrhs, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("X = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* free resources */
if (d_A) cudaFree(d_A);
if (d_x) cudaFree(d_x);
if (cublasHandle) cublasDestroy(cublasHandle);
return 0;
}
$ nvcc -o t158 t158.cu -lcublas
$ ./t158
**** example of cublasDtbsv
example of tbsv
A = (matlab base-1)
A(*, 1) A(*, 2) A(*, 3)
A( 1,*) = 2.000000000 1.000000000 1.000000000
A( 2,*) = 1.000000000 1.000000000 0.000000000
A( 3,*) = 1.000000000 1.000000000 0.000000000
=====
x (b) = (matlab base-1)
x(*, 1)
x( 1,*) = 2.000000000
x( 2,*) = 2.000000000
x( 3,*) = 2.000000000
=====
X = (matlab base-1)
x(*, 1)
x( 1,*) = 1.000000000
x( 2,*) = 1.000000000
x( 3,*) = 1.000000000
=====
$
我刚开始使用 Cuda 库,想求解对称带状矩阵方程。我找到了使用 LU 分解来解决此问题的示例代码。我现在正在尝试使用 cudaBlas 例程 cublasdtbsv 来求解方程。我无法找到此功能的示例代码并整理了我自己的解决方案。我认为我遇到的问题是我不明白为此例程输入 A 矩阵的正确方法。这是我的示例代码,用于一个非常简单的 3x3 矩阵,其中一个是右手边。它包括使用 LU 分解的正确解决方案和我尝试使用 cublasdtbsv 例程:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
void test();
void printMatrix2(int m, int n, const double* A, int lda, const char* name);
int LUFactorizationSolver2();
int TriBandedSymSolver2();
int main(int argc, char* argv[])
{
test();
return 0;
}
void test()
{
LUFactorizationSolver2();
TriBandedSymSolver2();
return;
}
int TriBandedSymSolver2()
{
printf("\n**** example of cublasDtbsv \n\n");
const int n = 3;
const int ldm = n;
const int k = 1;// n - 1;
const int lda = n;
const int nrhs = 1;
const int incx = 1;
double M[ldm * n] = { 1.0, 0.0, 0.0
, 0.0, 2.0, 3.0
, 0.0, 3.0, 4.0
};
double A[lda * n] = { 1.0, 2.0, 4.0
, 0.0, 3.0, 0.0
, 0.0, 0.0, 0.0
};
double x[n * nrhs] = { 00.0
, 40.0
, 00.0
};
cublasHandle_t cublasHandle = NULL;
cudaStream_t stream = NULL;
cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
double* d_A = NULL; /* device copy of A */
double* d_x = NULL; /* device copy of x */
printf("example of tbsv \n");
printf("A = (matlab base-1)\n");
printMatrix2(n, n, A, lda, "A");
printf("=====\n");
printf("x (b) = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* step 1: create cusolver handle, bind a stream */
cublasStatus = cublasCreate(&cublasHandle);
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
/* step 2: copy A to device */
cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * n);
cudaStat2 = cudaMalloc((void**)&d_x, sizeof(double) * n * nrhs);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * n, cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_x, x, sizeof(double) * n * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
/*
* step 5: solve A*x = b
*
*/
cublasStatus = cublasDtbsv(cublasHandle
, CUBLAS_FILL_MODE_LOWER
, CUBLAS_OP_N
, CUBLAS_DIAG_NON_UNIT
, n
, k
, d_A
, lda
, d_x
, incx
);
cudaStat1 = cudaDeviceSynchronize();
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
cudaStat1 = cudaMemcpy(x, d_x, sizeof(double) * n * nrhs, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("X = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* free resources */
if (d_A) cudaFree(d_A);
if (d_x) cudaFree(d_x);
if (cublasHandle) cublasDestroy(cublasHandle);
if (stream) cudaStreamDestroy(stream);
cudaDeviceReset();
return 0;
}
int LUFactorizationSolver2()
{
printf("\n**** example of cusolverDnDgetrs \n\n");
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
const int m = 3;
const int lda = m;
const int nrhs = 1; // number of right-hand sides
const int ldb = m;
double A[lda * m] = { 1.0, 0.0, 0.0
, 0.0, 2.0, 3.0
, 0.0, 3.0, 4.0
};
double B[m * nrhs] = { 00.0
, 40.0
, 00.0
};
double X[m * nrhs]; /* X = A\B */
double LU[lda * m]; /* L and U */
int Ipiv[m]; /* host copy of pivoting sequence */
int info = 0; /* host copy of error info */
double* d_A = NULL; /* device copy of A */
double* d_B = NULL; /* device copy of B */
int* d_Ipiv = NULL; /* pivoting sequence */
int* d_info = NULL; /* error info */
int lwork = 0; /* size of workspace */
double* d_work = NULL; /* device workspace for getrf */
const int pivot_on = 0; // 1;
if (pivot_on) {
printf("pivot is on : compute P*A = L*U \n");
}
else {
printf("pivot is off: compute A = L*U (not numerically stable)\n");
}
printf("A = (matlab base-1)\n");
printMatrix2(m, m, A, lda, "A");
printf("=====\n");
printf("B = (matlab base-1)\n");
printMatrix2(m, nrhs, B, ldb, "B");
printf("=====\n");
/* step 1: create cusolver handle, bind a stream */
status = cusolverDnCreate(&cusolverH);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
assert(cudaSuccess == cudaStat1);
status = cusolverDnSetStream(cusolverH, stream);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 2: copy A to device */
cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * m);
cudaStat2 = cudaMalloc((void**)&d_B, sizeof(double) * m * nrhs);
cudaStat2 = cudaMalloc((void**)&d_Ipiv, sizeof(int) * m);
cudaStat4 = cudaMalloc((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * m, cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_B, B, sizeof(double) * m * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
/* step 3: query working space of getrf */
status = cusolverDnDgetrf_bufferSize(
cusolverH,
m,
m,
d_A,
lda,
&lwork);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(double) * lwork);
assert(cudaSuccess == cudaStat1);
/* step 4: LU factorization */
if (pivot_on) {
status = cusolverDnDgetrf(
cusolverH,
m,
m,
d_A,
lda,
d_work,
d_Ipiv,
d_info);
}
else {
status = cusolverDnDgetrf(
cusolverH,
m,
m,
d_A,
lda,
d_work,
NULL,
d_info);
}
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
if (pivot_on) {
cudaStat1 = cudaMemcpy(Ipiv, d_Ipiv, sizeof(int) * m, cudaMemcpyDeviceToHost);
}
cudaStat2 = cudaMemcpy(LU, d_A, sizeof(double) * lda * m, cudaMemcpyDeviceToHost);
cudaStat3 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
if (0 > info) {
printf("%d-th parameter is wrong \n", -info);
exit(1);
}
if (pivot_on) {
printf("pivoting sequence, matlab base-1\n");
for (int j = 0; j < m; j++) {
printf("Ipiv(%d) = %d\n", j + 1, Ipiv[j]);
}
}
printf("L and U = (matlab base-1)\n");
printMatrix2(m, m, LU, lda, "LU");
printf("=====\n");
/*
* step 5: solve A*X = B
*
*/
if (pivot_on) {
status = cusolverDnDgetrs(
cusolverH,
CUBLAS_OP_N,
m,
nrhs, /* nrhs */
d_A,
lda,
d_Ipiv,
d_B,
ldb,
d_info);
}
else {
status = cusolverDnDgetrs(
cusolverH,
CUBLAS_OP_N,
m,
nrhs, /* nrhs */
d_A,
lda,
NULL,
d_B,
ldb,
d_info);
}
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(X, d_B, sizeof(double) * m * nrhs, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("X = (matlab base-1)\n");
printMatrix2(m, nrhs, X, ldb, "X");
printf("=====\n");
/* free resources */
if (d_A) cudaFree(d_A);
if (d_B) cudaFree(d_B);
if (d_Ipiv) cudaFree(d_Ipiv);
if (d_info) cudaFree(d_info);
if (d_work) cudaFree(d_work);
if (cusolverH) cusolverDnDestroy(cusolverH);
if (stream) cudaStreamDestroy(stream);
cudaDeviceReset();
return 0;
}
void printMatrix2(int m, int n, const double* A, int lda, const char* name)
{
printf("%18s", "");
for (int col = 0; col < n; col++) { printf("%7s(*,%2d) ", name, col + 1); }
printf("\n");
for (int row = 0; row < m; row++) {
printf("%4s(%2d,*) = ", name, row + 1);
for (int col = 0; col < n; col++) {
double Areg = A[row + col * lda];
printf("%20.9f", Areg);
}
printf("\n");
}
return;
}
正确答案应该是:
0.00
-160.00
120.00
但我得到:
0.000000000
inf
-inf
我正在 Windows 10 使用 Visual Studio 2019 开发这个。
我遗漏了什么或者有人可以为我指出 cublasDtbsv 例程的工作示例吗?
CUBLAS tbsv is a banded triangular solver. It expects your M
matrix to be banded and triangular. If you would like to see what that looks like, this 是一个很好的参考。
您的 M
矩阵是 不是 三角形。三角矩阵的上三角部分(不包括主对角线)或下三角部分(不包括主对角线)全为零。您的 M
矩阵不符合该定义。
带状三角矩阵 M
可能如下所示:
| 2.0 0.0 0.0 |
M = | 1.0 1.0 0.0 |
| 0.0 1.0 1.0 |
让我们以此作为示例,RHS 为 | 2.0 2.0 2.0 |
,并使用为 A
矩阵格式 here 提供的建议。在那种情况下,我们的 A
矩阵看起来像:
A = | 2.0 1.0 1.0 | (the main diagonal of M)
| 1.0 1.0 0.0 | (the first sub-diagonal of M)
在这种情况下,我们的 A
矩阵有 2 行,因此 A
的前导维度由 lda = 2
如果我们将所有这些都放入您的测试框架中,它似乎会给出正确的结果:
$ cat t158.cu
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
void printMatrix2(int m, int n, const double* A, int lda, const char* name)
{
printf("%18s", "");
for (int col = 0; col < n; col++) { printf("%7s(*,%2d) ", name, col + 1); }
printf("\n");
for (int row = 0; row < m; row++) {
printf("%4s(%2d,*) = ", name, row + 1);
for (int col = 0; col < n; col++) {
double Areg = A[row + col * lda];
printf("%20.9f", Areg);
}
printf("\n");
}
return;
}
int main(int argc, char* argv[])
{
printf("\n**** example of cublasDtbsv \n\n");
const int n = 3;
// const int ldm = n;
const int k = 1;
const int lda = k+1;
const int nrhs = 1;
const int incx = 1;
/*
double M[ldm * n] = { 2.0, 0.0, 0.0
, 1.0, 1.0, 0.0
, 0.0, 1.0, 1.0
};
*/
double A[lda * n] = { 2.0, 1.0, 1.0
, 1.0, 1.0, 0.0
};
double x[n * nrhs] = { 2.0
, 2.0
, 2.0
};
cublasHandle_t cublasHandle = NULL;
cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
double* d_A = NULL; /* device copy of A */
double* d_x = NULL; /* device copy of x */
printf("example of tbsv \n");
printf("A = (matlab base-1)\n");
printMatrix2(n, n, A, lda, "A");
printf("=====\n");
printf("x (b) = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* step 1: create cublas handle */
cublasStatus = cublasCreate(&cublasHandle);
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
/* step 2: copy A to device */
cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * n);
cudaStat2 = cudaMalloc((void**)&d_x, sizeof(double) * n * nrhs);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * n, cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_x, x, sizeof(double) * n * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
/*
* step 5: solve A*x = b
*
*/
cublasStatus = cublasDtbsv(cublasHandle
, CUBLAS_FILL_MODE_LOWER
, CUBLAS_OP_N
, CUBLAS_DIAG_NON_UNIT
, n
, k
, d_A
, lda
, d_x
, incx
);
cudaStat1 = cudaDeviceSynchronize();
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
cudaStat1 = cudaMemcpy(x, d_x, sizeof(double) * n * nrhs, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("X = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* free resources */
if (d_A) cudaFree(d_A);
if (d_x) cudaFree(d_x);
if (cublasHandle) cublasDestroy(cublasHandle);
return 0;
}
$ nvcc -o t158 t158.cu -lcublas
$ ./t158
**** example of cublasDtbsv
example of tbsv
A = (matlab base-1)
A(*, 1) A(*, 2) A(*, 3)
A( 1,*) = 2.000000000 1.000000000 1.000000000
A( 2,*) = 1.000000000 1.000000000 0.000000000
A( 3,*) = 1.000000000 1.000000000 0.000000000
=====
x (b) = (matlab base-1)
x(*, 1)
x( 1,*) = 2.000000000
x( 2,*) = 2.000000000
x( 3,*) = 2.000000000
=====
X = (matlab base-1)
x(*, 1)
x( 1,*) = 1.000000000
x( 2,*) = 1.000000000
x( 3,*) = 1.000000000
=====
$