cublasDtrsm after LU with pivoting

I'm stuck on a small problem. I have to solve a linear system A * x = b.

The matrix A is factorized by an LU factorization (LAPACK). As a result I get the factorized matrix and the pivot array. After that I want to solve the two triangular systems on the GPU with *cublasDtrsm*: U * x = y and L * y = b. But because of the row interchanges done by dgetrf in LAPACK, I would have to pass the pivot array to cublas, and the *cublasDtrsm* function offers no parameter for it. Without the pivot array I get wrong results.

I have already looked into disabling pivoting in LAPACK, but that is not possible for stability reasons. Is there any hint how to solve the linear system using the LU factorization?

If you want to use this particular approach (cublas trsm after LAPACK getrf), I believe you should be able to use cublas trsm with the L, U output of LAPACK, provided you rearrange the b vector (or matrix) to match the rearrangement order that LAPACK performed during pivoting. I believe this order is given in the description of ipiv in the LAPACK documentation:

IPIV
IPIV is INTEGER array, dimension (min(M,N)).
The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i).
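
In other words, the interchanges recorded in IPIV have to be applied to b one at a time, in the same order in which dgetrf performed them. As a small hedged illustration (the helper name apply_ipiv_forward is mine, not part of any library): for n = 3 and ipiv = {3, 3, 3}, the right-hand side (b1, b2, b3) becomes (b3, b1, b2).

// Apply LAPACK's 1-based row interchanges to a right-hand side, in order.
// Example: ipiv = {3, 3, 3}, b = {b1, b2, b3}
//   i = 0: swap b[0] and b[2]  ->  {b3, b2, b1}
//   i = 1: swap b[1] and b[2]  ->  {b3, b1, b2}
//   i = 2: swap b[2] and b[2]  ->  {b3, b1, b2}
void apply_ipiv_forward(double *b, const int *ipiv, int n)
{
    for (int i = 0; i < n; i++) {
        double tmp     = b[i];
        b[i]           = b[ipiv[i] - 1];
        b[ipiv[i] - 1] = tmp;
    }
}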

Below is a sample code demonstrating the idea for a simple 3x3 test case with a single RHS vector:

$ cat t853.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#define cudacall(call)                                                                                                          \
    do                                                                                                                          \
    {                                                                                                                           \
        cudaError_t err = (call);                                                                                               \
        if(cudaSuccess != err)                                                                                                  \
        {                                                                                                                       \
            fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err));    \
            cudaDeviceReset();                                                                                                  \
            exit(EXIT_FAILURE);                                                                                                 \
        }                                                                                                                       \
    }                                                                                                                           \
    while (0)

#define cublascall(call)                                                                                        \
    do                                                                                                          \
    {                                                                                                           \
        cublasStatus_t status = (call);                                                                         \
        if(CUBLAS_STATUS_SUCCESS != status)                                                                     \
        {                                                                                                       \
            fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status);     \
            cudaDeviceReset();                                                                                  \
            exit(EXIT_FAILURE);                                                                                 \
        }                                                                                                       \
                                                                                                                \
    }                                                                                                           \
    while(0)


void LU_device(float *src_d, int n, int *pivot)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    int batchSize = 1;

    int *P, *INFO;

    cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));

    int lda = n;

    float *A[] = { src_d };
    float **A_d;
    cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
    cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
    cudacall(cudaMemcpy(pivot,P,n*batchSize*sizeof(int),cudaMemcpyDeviceToHost));
#ifdef DEBUG_PRINT
    for (int qq = 0; qq < n*batchSize; qq++) {printf("pivot[%d] = %d\n", qq, pivot[qq]); }
#endif

    if(INFOh != 0)  // nonzero info from getrfBatched indicates a zero pivot, i.e. a singular factor
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
    cudaFree(P); cudaFree(INFO); cudaFree(A_d); cublasDestroy(handle);
}

void LU(float* src, float* L, float *U, int n, int *pivot)
{
    float *src_d;

    cudacall(cudaMalloc<float>(&src_d, n*n * sizeof(float)));
    cudacall(cudaMemcpy(src_d,src,n*n * sizeof(float),cudaMemcpyHostToDevice));

    LU_device(src_d,n,pivot);

    // The factored matrix holds both L (unit diagonal, not stored) and U.
    // Copy it twice and zero the opposite triangles; L's stored diagonal is
    // really U's, but it is never read because trsm is later called with
    // CUBLAS_DIAG_UNIT for the L solve.
    cudacall(cudaMemcpy(L,src_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
    cudacall(cudaMemcpy(U,src_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
    for (int i = 0; i < n; i ++){
      for (int j = 0; j < i; j++)   L[i*n+j] = 0.0;
      for (int j = i+1; j < n; j++) U[i*n+j] = 0.0;}

    cudaFree(src_d);
}

// Apply (DIR_FORWARD) or undo (DIR_REVERSE) the 1-based row interchanges in
// pivot[], in the order the factorization performed them.
void rearrange(float *vec, int *pivot, int n, int dir){
#define DIR_FORWARD 0
#define DIR_REVERSE 1
#define SWAP(x,y) {float swaptmp=(*(y)); (*(y))=(*(x)); (*(x))=swaptmp;}
  if (dir == DIR_FORWARD)
    for (int i = 0; i < n; i++)    SWAP((vec+i),(vec+pivot[i]-1))
  else
    for (int i = n-1; i >= 0; i--) SWAP((vec+i),(vec+pivot[i]-1))
}


void TRSM(float *A, float *x, float *b, int n, cublasFillMode_t uplo, cublasDiagType_t diagt ){

    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));
    float *A_d, *b_d;
    cudacall(cudaMalloc<float>(&A_d, n*n * sizeof(float)));
    cudacall(cudaMalloc<float>(&b_d, n * sizeof(float)));
    cudacall(cudaMemcpy(b_d, b,   n*sizeof(float), cudaMemcpyHostToDevice));
    cudacall(cudaMemcpy(A_d, A, n*n*sizeof(float), cudaMemcpyHostToDevice));
    const float alpha = 1.0f;
    cublascall(cublasStrsm(handle, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, diagt, n, 1, &alpha, A_d, n, b_d, n));
    cudacall(cudaMemcpy(x, b_d, n*sizeof(float), cudaMemcpyDeviceToHost));
    cudaFree(A_d); cudaFree(b_d); cublasDestroy(handle);
}

void test_solve()
{
 // solve Ax=b
 // 1. Perform LU on A
 // 2. using pivot sequence, rearrange b -> b'
 // 3. perform TRSM on Ly=b'
 // 4. perform TRSM on Ux=y
 // A = |0 1  4 |
 //     |3 3  9 |
 //     |4 10 16|
 // x = |1|
 //     |2|
 //     |3|
 // b = |14|
 //     |36|
 //     |72|

    const int n = 3;

// has 3,2,3 pivot order
    float          A_col_major[n*n] = { 0, 3, 4,
                                        1, 3, 10,
                                        4, 9, 16 };
    float b1[n] = {14, 36, 72};
/* another example - has 3,3,3 pivot order
    float          A_transpose[n*n] = { 0, 1,  4,
                                        3, 3,  9,
                                        4, 10, 16 };
    float b2[n] = {18, 37, 70};
*/
    float result_x[n];
    int pivot[n];
    float L[n*n];
    float U[n*n];
    float y[n];

    //Select matrix by setting "a"
    float *a = A_col_major;
    float *b = b1;

    printf("Input:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            printf("%f\t",a[i*n+j]);
        printf("\n");
    }

    printf("\n\n");
// 1. LU on A
    LU(a,L,U,n,pivot);
#ifdef DEBUG_PRINT
    printf("L:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            printf("%f\t",L[i*n+j]);
        printf("\n");
    }

    printf("\n\n");
    printf("U:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            printf("%f\t",U[i*n+j]);
        printf("\n");
    }

    printf("\n\n");

#endif
// 2. Rearrange b
    rearrange(b,pivot,n,DIR_FORWARD);
#ifdef DEBUG_PRINT
   for (int i = 0; i < n; i++) printf("b'[%d] = %f\n", i, b[i]);
#endif
// 3. TRSM on Ly=b
    TRSM(L, y, b, n, CUBLAS_FILL_MODE_LOWER, CUBLAS_DIAG_UNIT);
// 4. TRSM on Ux=y
    TRSM(U, result_x, y, n, CUBLAS_FILL_MODE_UPPER, CUBLAS_DIAG_NON_UNIT);

    fprintf(stdout, "Solution:\n\n");
    for(int i=0; i<n; i++)
    {
            printf("%f\n",result_x[i]);
    }

}

int main()
{
    test_solve();

    return 0;
}

$ nvcc -o t853 t853.cu -lcublas
$ ./t853
Input:

0.000000        3.000000        4.000000
1.000000        3.000000        10.000000
4.000000        9.000000        16.000000


Solution:

1.000000
2.000000
3.000000
$

Note that for this simple test case I have used cublas getrfBatched to do the matrix LU factorization, rather than LAPACK, but I think it should behave similarly to LAPACK.
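
If you do the factorization on the host with LAPACK in double precision, as in your original setup, the same pattern should carry over. Below is a minimal, hedged sketch (error checking omitted; solve_lu_trsm is just an illustrative name, and it assumes column-major storage and a LAPACKE installation). It also uses the fact that trsm only reads the referenced triangle, so the packed dgetrf factor can be passed unchanged for both solves:

#include <vector>
#include <lapacke.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Solve A*x = b in double precision: dgetrf on the host, pivot-permute b,
// then two cublasDtrsm calls on the packed LU factor.
int solve_lu_trsm(int n, double *A, double *b)   // A: column-major n x n; b: rhs in / x out
{
    std::vector<lapack_int> ipiv(n);
    // A is overwritten by its LU factor: unit-lower L (diagonal not stored) and upper U.
    int info = LAPACKE_dgetrf(LAPACK_COL_MAJOR, n, n, A, n, ipiv.data());
    if (info != 0) return info;

    // Apply the recorded row interchanges to b, in order (ipiv is 1-based).
    for (int i = 0; i < n; i++) {
        int p = ipiv[i] - 1;
        double t = b[i]; b[i] = b[p]; b[p] = t;
    }

    cublasHandle_t handle;
    cublasCreate(&handle);
    double *A_d, *b_d;
    cudaMalloc(&A_d, n*n*sizeof(double));
    cudaMalloc(&b_d, n*sizeof(double));
    cudaMemcpy(A_d, A, n*n*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(b_d, b, n*sizeof(double), cudaMemcpyHostToDevice);

    const double one = 1.0;
    // Forward substitution L*y = P*b: only the strict lower triangle is read (unit diagonal assumed).
    cublasDtrsm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
                CUBLAS_DIAG_UNIT, n, 1, &one, A_d, n, b_d, n);
    // Back substitution U*x = y: only the upper triangle (including diagonal) is read.
    cublasDtrsm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                CUBLAS_DIAG_NON_UNIT, n, 1, &one, A_d, n, b_d, n);

    cudaMemcpy(b, b_d, n*sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(A_d); cudaFree(b_d); cublasDestroy(handle);
    return 0;
}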

Also note that I don't intend to comment on "best approaches for linear system solutions", but merely to explain how the approach you have outlined might be made to work.

For the permutation on the GPU, you can create a permutation matrix from the given pivot vector and multiply it with B on the GPU. The pivot vector returned by LAPACK in fact encodes an ordered sequence of swap steps, so once row n has been touched by the for loop it will never be touched again. A small algorithm can therefore build a permutation matrix P from the vector returned by *<T>getrf*. The linear system L * U * X = P * B is then solved, which gives the correct result.

#include <string.h>   // for memset

void
permutationMatrix ( int const rows,      //number of rows of A
                    int const cols,      //number of cols of A
                    int* permArray,      //permutation vector from LAPACK (1-based)
                    double* permMatrix)  //memory for the permutation matrix (column-major, rows x cols)
{

    int tempPerm [rows];    //holds where the ones later shall be in the Matrix
    int swap;               //variable for swapping

    memset(permMatrix,0, rows * cols * sizeof(double)); //fill permutation Matrix with 0s
    memset(tempPerm,0, rows * sizeof(int)); //fill temporary memory with 0s

    for (int row = 0; row < rows; row ++)  
    {
        //start value for each temp field is the row-number
        if (tempPerm [row] == 0)
        {
            tempPerm [row] = row + 1;
        }

        /* rows need to be swapped if rownumber != number 
         * in permutation vector of LAPACK*/
        if (permArray[row] != row + 1)
        {
            //swap with a line which hasn't already swapped
            if (tempPerm[permArray[row]-1] == 0)
            {
                tempPerm[permArray[row]-1] = tempPerm[row];
                tempPerm[row] = permArray[row];
            }else{

                //swap with an already touched line
                swap = tempPerm[permArray[row]-1];
                tempPerm[permArray[row]-1] = tempPerm[row];
                tempPerm[row] = swap;
            }
        }

        //put the one in place in the permutation matrix
        permMatrix[row + (tempPerm[row]-1) * rows] = 1.0;
    }
}
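
One possible way to use this on the GPU (a sketch only; permute_rhs_on_gpu is a made-up helper, error checking is omitted, and it calls the permutationMatrix function above) is to copy P to the device once and form P * b with cublasDgemv before the two triangular solves:

#include <cuda_runtime.h>
#include <cublas_v2.h>

// Form pb_d = P * b_d on the GPU, where P is built on the host from the
// LAPACK pivot vector by permutationMatrix() above (column-major, n x n).
void permute_rhs_on_gpu(int n, int *ipiv, const double *b_d, double *pb_d,
                        cublasHandle_t handle)
{
    double *P_h = new double[n * n];
    permutationMatrix(n, n, ipiv, P_h);

    double *P_d;
    cudaMalloc(&P_d, n * n * sizeof(double));
    cudaMemcpy(P_d, P_h, n * n * sizeof(double), cudaMemcpyHostToDevice);

    const double one = 1.0, zero = 0.0;
    // pb_d = 1.0 * P * b_d + 0.0 * pb_d
    cublasDgemv(handle, CUBLAS_OP_N, n, n, &one, P_d, n, b_d, 1, &zero, pb_d, 1);

    cudaFree(P_d);
    delete[] P_h;
}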