LU 后的 cublasDtrsm 旋转
cublasDtrsm after LU with pivoting
我卡在了一个小问题上。我必须求解一个线性系统 A * x = b
。
矩阵 A
被 LU 因式分解 (LAPACK
) 分解。结果我得到了因式分解矩阵和主元数组。之后,我想用 *cublasDtrsm*
在 GPU 上解决两个线性系统:U * x = y
和 L * y = b
。但是由于 dgetrf
在 LAPACK
中的行交换,我必须将主元数组传递给 cublas
。但是 *cublasDtrsm*
-函数没有为此提供任何东西。没有枢轴数组,我会得到错误的结果。
我已经在 LAPACK
中搜索过禁用旋转,但考虑到稳定性,这是不可能的。是否有任何提示如何使用 LU 分解求解线性方程组?
如果您想使用这种特殊方法(LAPACK getrf 之后的 cublas trsm),我相信您应该能够通过重新排列 b 向量(或矩阵)以匹配来将 cublas trsm 与 LAPACK 的 L、U 输出一起使用LAPACK 在旋转过程中执行的重排顺序。我相信此顺序在 LAPACK documentation:
中的 ipiv
的公式中给出
IPIV
IPIV is INTEGER array, dimension (min(M,N))
The pivot indices; for 1 <= i <= min(M,N), row i of the
matrix was interchanged with row IPIV(i).
下面是一个示例代码,演示了使用单个 RHS 向量的简单 3x3 测试用例的想法:
$ cat t853.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
void LU_device(float *src_d, int n, int *pivot)
{
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
int batchSize = 1;
int *P, *INFO;
cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int)));
cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));
int lda = n;
float *A[] = { src_d };
float **A_d;
cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));
cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));
int INFOh = 0;
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
cudacall(cudaMemcpy(pivot,P,n*batchSize*sizeof(int),cudaMemcpyDeviceToHost));
#ifdef DEBUG_PRINT
for (int qq = 0; qq < n*batchSize; qq++) {printf("pivot[%d] = %d\n", qq, pivot[qq]); }
#endif
if(INFOh == n)
{
fprintf(stderr, "Factorization Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
cudaFree(P); cudaFree(INFO); cudaFree(A_d); cublasDestroy(handle);
}
void LU(float* src, float* L, float *U, int n, int *pivot)
{
float *src_d;
cudacall(cudaMalloc<float>(&src_d, n*n * sizeof(float)));
cudacall(cudaMemcpy(src_d,src,n*n * sizeof(float),cudaMemcpyHostToDevice));
LU_device(src_d,n,pivot);
cudacall(cudaMemcpy(L,src_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
cudacall(cudaMemcpy(U,src_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
for (int i = 0; i < n; i ++){
for (int j = 0; j < i; j++) L[i*n+j] = 0.0;
for (int j = i+1; j < n; j++) U[i*n+j] = 0.0;}
cudaFree(src_d);
}
void rearrange(float *vec, int *pivot, int n, int dir){
#define DIR_FORWARD 0
#define DIR_REVERSE 1
#define SWAP(x,y) {float swaptmp=(*(y)); (*(y))=(*(x)); (*(x))=swaptmp;}
if (dir == DIR_FORWARD)
for (int i = 0; i < n; i++) SWAP((vec+i),(vec+pivot[i]-1))
else
for (int i = n-1; i >= 0; i--) SWAP((vec+i),(vec+pivot[i]-1))
}
void TRSM(float *A, float *x, float *b, int n, cublasFillMode_t uplo, cublasDiagType_t diagt ){
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
float *A_d, *b_d;
cudacall(cudaMalloc<float>(&A_d, n*n * sizeof(float)));
cudacall(cudaMalloc<float>(&b_d, n * sizeof(float)));
cudacall(cudaMemcpy(b_d, b, n*sizeof(float), cudaMemcpyHostToDevice));
cudacall(cudaMemcpy(A_d, A, n*n*sizeof(float), cudaMemcpyHostToDevice));
const float alpha = 1.0f;
cublascall(cublasStrsm(handle, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, diagt, n, 1, &alpha, A_d, n, b_d, n));
cudacall(cudaMemcpy(x, b_d, n*sizeof(float), cudaMemcpyDeviceToHost));
cudaFree(A_d); cudaFree(b_d); cublasDestroy(handle);
}
void test_solve()
{
// solve Ax=b
// 1. Perform LU on A
// 2. using pivot sequence, rearrange b -> b'
// 3. perform TRSM on Ly=b'
// 4. perform TRSM on Ux=y
// A = |0 1 4 |
// |3 3 9 |
// |4 10 16|
// x = |1|
// |2|
// |3|
// b = |14|
// |36|
// |72|
const int n = 3;
// has 3,2,3 pivot order
float A_col_major[n*n] = { 0, 3, 4,
1, 3, 10,
4, 9, 16 };
float b1[n] = {14, 36, 72};
/* another example - has 3,3,3 pivot order
float A_transpose[n*n] = { 0, 1, 4,
3, 3, 9,
4, 10, 16 };
float b2[n] = {18, 37, 70};
*/
float result_x[n];
int pivot[n];
float L[n*n];
float U[n*n];
float y[n];
//Select matrix by setting "a"
float *a = A_col_major;
float *b = b1;
printf("Input:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",a[i*n+j]);
printf("\n");
}
printf("\n\n");
// 1. LU on A
LU(a,L,U,n,pivot);
#ifdef DEBUG_PRINT
printf("L:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",L[i*n+j]);
printf("\n");
}
printf("\n\n");
printf("U:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",U[i*n+j]);
printf("\n");
}
printf("\n\n");
#endif
// 2. Rearrange b
rearrange(b,pivot,n,DIR_FORWARD);
#ifdef DEBUG_PRINT
for (int i = 0; i < n; i++) printf("b'[%d] = %f\n", i, b[i]);
#endif
// 3. TRSM on Ly=b
TRSM(L, y, b, n, CUBLAS_FILL_MODE_LOWER, CUBLAS_DIAG_UNIT);
// 4. TRSM on Ux=y
TRSM(U, result_x, y, n, CUBLAS_FILL_MODE_UPPER, CUBLAS_DIAG_NON_UNIT);
fprintf(stdout, "Solution:\n\n");
for(int i=0; i<n; i++)
{
printf("%f\n",result_x[i]);
}
}
int main()
{
test_solve();
return 0;
}
$ nvcc -o t853 t853.cu -lcublas
$ ./t853
Input:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Solution:
1.000000
2.000000
3.000000
$
请注意,对于这个简单的测试用例,我使用 cublas getrfBatched 进行矩阵 LU 分解,而不是 LAPACK,但我认为它的行为应该与 LAPACK 类似。
另请注意,我无意对 "best approaches for linear system solutions" 发表评论,而只是解释您制定的方法如何发挥作用。
对于 GPU 上的置换,可以根据给定向量创建置换矩阵,并在 GPU 上将其与 B
相乘。事实上,来自 LAPACK
的置换向量意味着交换步骤的顺序。因此,如果第 n 行已被 for 循环触及,它将永远不会再次被触及。因此,一个小算法从 *<T>getrf*
的向量中创建一个置换矩阵 P
。这样线性系统 L * U * X = P * B
就解决了。这会导致正确的结果。
void
permutationMatrix ( int const rows, //number of rows of A
int const cols, //number of cols of A
int* permArray, //permutation vector from LAPACK
double* permMatrix) //Memory for permutation matrix
{
int tempPerm [rows]; //holds where the ones later shall be in the Matrix
int swap; //variable for swapping
memset(permMatrix,0, rows * cols * sizeof(double)); //fill permutation Matrix with 0s
memset(tempPerm,0, rows * sizeof(int)); //fill temporary memory with 0s
for (int row = 0; row < rows; row ++)
{
//start value for each temp field is the row-number
if (tempPerm [row] == 0)
{
tempPerm [row] = row + 1;
}
/* rows need to be swapped if rownumber != number
* in permutation vector of LAPACK*/
if (permArray[row] != row + 1)
{
//swap with a line which hasn't already swapped
if (tempPerm[permArray[row]-1] == 0)
{
tempPerm[permArray[row]-1] = tempPerm[row];
tempPerm[row] = permArray[row];
}else{
//swap with an already touched line
swap = tempPerm[permArray[row]-1];
tempPerm[permArray[row]-1] = tempPerm[row];
tempPerm[row] = swap;
}
}
//put the one in place in the permutation matrix
permMatrix[row + (tempPerm[row]-1) * rows] = 1.0;
}
}
我卡在了一个小问题上。我必须求解一个线性系统 A * x = b
。
矩阵 A
被 LU 因式分解 (LAPACK
) 分解。结果我得到了因式分解矩阵和主元数组。之后,我想用 *cublasDtrsm*
在 GPU 上解决两个线性系统:U * x = y
和 L * y = b
。但是由于 dgetrf
在 LAPACK
中的行交换,我必须将主元数组传递给 cublas
。但是 *cublasDtrsm*
-函数没有为此提供任何东西。没有枢轴数组,我会得到错误的结果。
我已经在 LAPACK
中搜索过禁用旋转,但考虑到稳定性,这是不可能的。是否有任何提示如何使用 LU 分解求解线性方程组?
如果您想使用这种特殊方法(LAPACK getrf 之后的 cublas trsm),我相信您应该能够通过重新排列 b 向量(或矩阵)以匹配来将 cublas trsm 与 LAPACK 的 L、U 输出一起使用LAPACK 在旋转过程中执行的重排顺序。我相信此顺序在 LAPACK documentation:
中的ipiv
的公式中给出
IPIV
IPIV is INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
下面是一个示例代码,演示了使用单个 RHS 向量的简单 3x3 测试用例的想法:
$ cat t853.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
void LU_device(float *src_d, int n, int *pivot)
{
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
int batchSize = 1;
int *P, *INFO;
cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int)));
cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));
int lda = n;
float *A[] = { src_d };
float **A_d;
cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));
cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));
int INFOh = 0;
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
cudacall(cudaMemcpy(pivot,P,n*batchSize*sizeof(int),cudaMemcpyDeviceToHost));
#ifdef DEBUG_PRINT
for (int qq = 0; qq < n*batchSize; qq++) {printf("pivot[%d] = %d\n", qq, pivot[qq]); }
#endif
if(INFOh == n)
{
fprintf(stderr, "Factorization Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
cudaFree(P); cudaFree(INFO); cudaFree(A_d); cublasDestroy(handle);
}
void LU(float* src, float* L, float *U, int n, int *pivot)
{
float *src_d;
cudacall(cudaMalloc<float>(&src_d, n*n * sizeof(float)));
cudacall(cudaMemcpy(src_d,src,n*n * sizeof(float),cudaMemcpyHostToDevice));
LU_device(src_d,n,pivot);
cudacall(cudaMemcpy(L,src_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
cudacall(cudaMemcpy(U,src_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
for (int i = 0; i < n; i ++){
for (int j = 0; j < i; j++) L[i*n+j] = 0.0;
for (int j = i+1; j < n; j++) U[i*n+j] = 0.0;}
cudaFree(src_d);
}
void rearrange(float *vec, int *pivot, int n, int dir){
#define DIR_FORWARD 0
#define DIR_REVERSE 1
#define SWAP(x,y) {float swaptmp=(*(y)); (*(y))=(*(x)); (*(x))=swaptmp;}
if (dir == DIR_FORWARD)
for (int i = 0; i < n; i++) SWAP((vec+i),(vec+pivot[i]-1))
else
for (int i = n-1; i >= 0; i--) SWAP((vec+i),(vec+pivot[i]-1))
}
void TRSM(float *A, float *x, float *b, int n, cublasFillMode_t uplo, cublasDiagType_t diagt ){
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
float *A_d, *b_d;
cudacall(cudaMalloc<float>(&A_d, n*n * sizeof(float)));
cudacall(cudaMalloc<float>(&b_d, n * sizeof(float)));
cudacall(cudaMemcpy(b_d, b, n*sizeof(float), cudaMemcpyHostToDevice));
cudacall(cudaMemcpy(A_d, A, n*n*sizeof(float), cudaMemcpyHostToDevice));
const float alpha = 1.0f;
cublascall(cublasStrsm(handle, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, diagt, n, 1, &alpha, A_d, n, b_d, n));
cudacall(cudaMemcpy(x, b_d, n*sizeof(float), cudaMemcpyDeviceToHost));
cudaFree(A_d); cudaFree(b_d); cublasDestroy(handle);
}
void test_solve()
{
// solve Ax=b
// 1. Perform LU on A
// 2. using pivot sequence, rearrange b -> b'
// 3. perform TRSM on Ly=b'
// 4. perform TRSM on Ux=y
// A = |0 1 4 |
// |3 3 9 |
// |4 10 16|
// x = |1|
// |2|
// |3|
// b = |14|
// |36|
// |72|
const int n = 3;
// has 3,2,3 pivot order
float A_col_major[n*n] = { 0, 3, 4,
1, 3, 10,
4, 9, 16 };
float b1[n] = {14, 36, 72};
/* another example - has 3,3,3 pivot order
float A_transpose[n*n] = { 0, 1, 4,
3, 3, 9,
4, 10, 16 };
float b2[n] = {18, 37, 70};
*/
float result_x[n];
int pivot[n];
float L[n*n];
float U[n*n];
float y[n];
//Select matrix by setting "a"
float *a = A_col_major;
float *b = b1;
printf("Input:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",a[i*n+j]);
printf("\n");
}
printf("\n\n");
// 1. LU on A
LU(a,L,U,n,pivot);
#ifdef DEBUG_PRINT
printf("L:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",L[i*n+j]);
printf("\n");
}
printf("\n\n");
printf("U:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",U[i*n+j]);
printf("\n");
}
printf("\n\n");
#endif
// 2. Rearrange b
rearrange(b,pivot,n,DIR_FORWARD);
#ifdef DEBUG_PRINT
for (int i = 0; i < n; i++) printf("b'[%d] = %f\n", i, b[i]);
#endif
// 3. TRSM on Ly=b
TRSM(L, y, b, n, CUBLAS_FILL_MODE_LOWER, CUBLAS_DIAG_UNIT);
// 4. TRSM on Ux=y
TRSM(U, result_x, y, n, CUBLAS_FILL_MODE_UPPER, CUBLAS_DIAG_NON_UNIT);
fprintf(stdout, "Solution:\n\n");
for(int i=0; i<n; i++)
{
printf("%f\n",result_x[i]);
}
}
int main()
{
test_solve();
return 0;
}
$ nvcc -o t853 t853.cu -lcublas
$ ./t853
Input:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Solution:
1.000000
2.000000
3.000000
$
请注意,对于这个简单的测试用例,我使用 cublas getrfBatched 进行矩阵 LU 分解,而不是 LAPACK,但我认为它的行为应该与 LAPACK 类似。
另请注意,我无意对 "best approaches for linear system solutions" 发表评论,而只是解释您制定的方法如何发挥作用。
对于 GPU 上的置换,可以根据给定向量创建置换矩阵,并在 GPU 上将其与 B
相乘。事实上,来自 LAPACK
的置换向量意味着交换步骤的顺序。因此,如果第 n 行已被 for 循环触及,它将永远不会再次被触及。因此,一个小算法从 *<T>getrf*
的向量中创建一个置换矩阵 P
。这样线性系统 L * U * X = P * B
就解决了。这会导致正确的结果。
void
permutationMatrix ( int const rows, //number of rows of A
int const cols, //number of cols of A
int* permArray, //permutation vector from LAPACK
double* permMatrix) //Memory for permutation matrix
{
int tempPerm [rows]; //holds where the ones later shall be in the Matrix
int swap; //variable for swapping
memset(permMatrix,0, rows * cols * sizeof(double)); //fill permutation Matrix with 0s
memset(tempPerm,0, rows * sizeof(int)); //fill temporary memory with 0s
for (int row = 0; row < rows; row ++)
{
//start value for each temp field is the row-number
if (tempPerm [row] == 0)
{
tempPerm [row] = row + 1;
}
/* rows need to be swapped if rownumber != number
* in permutation vector of LAPACK*/
if (permArray[row] != row + 1)
{
//swap with a line which hasn't already swapped
if (tempPerm[permArray[row]-1] == 0)
{
tempPerm[permArray[row]-1] = tempPerm[row];
tempPerm[row] = permArray[row];
}else{
//swap with an already touched line
swap = tempPerm[permArray[row]-1];
tempPerm[permArray[row]-1] = tempPerm[row];
tempPerm[row] = swap;
}
}
//put the one in place in the permutation matrix
permMatrix[row + (tempPerm[row]-1) * rows] = 1.0;
}
}