在 CUDA 中求解稀疏正定线性系统

Question

我们在使用 cuSOLVER 的 cusolverSpScsrlsvchol 函数时遇到问题，可能是由于对 cuSOLVER 库的误解。

动机：我们正在求解矩形网格上的泊松方程-divgrad x = b。在具有 5 模板 (1, 1, -4, 1, 1) 的 2 维度中，网格上的拉普拉斯算子提供了一个（相当稀疏的）矩阵 A。此外，网格上的电荷分布给出了一个（密集的）向量 b。 A 是正定对称的。

现在我们使用 CUDA 7.0 附带的 nvidia 新 cuSOLVER 库为 x 求解 A * x = b。它提供了一个函数 cusolverSpScsrlsvchol 应该对浮点数进行稀疏的 Cholesky 分解。

注意：我们能够使用替代的稀疏 QR 分解函数 cusolverSpScsrlsvqr 正确求解系统。对于边缘上所有 b 个条目为 1 且其余为 0 的 4 x 4 网格，我们得到 x:

1 1 0.999999 1 1 1 0.999999 1 1 1 1 1 1 1 1 1

我们的问题：

cusolverSpScsrlsvchol returns x 的错误结果：

1 3.33333 2.33333 1 3.33333 2.33333 1.33333 1 2.33333 1.33333 0.666667 1 1 1 1 1

（已解决，请参阅下面的答案）将 CSR 矩阵 A 转换为密集矩阵并显示输出会给出奇怪的数字（10^-44 等）。来自 CSR 格式的相应数据是正确的，并使用 python numpy.

（已解决，请参阅下面的答案）甚至找不到替代的稀疏 LU 和 cusolverSpScsrlsvlu 的部分旋转：

$ nvcc -std=c++11 cusparse_test3.cu -o cusparse_test3 -lcusparse -lcusolver
cusparse_test3.cu(208): error: identifier "cusolverSpScsrlsvlu" is undefined

我们做错了什么？感谢您的帮助！

我们的 C++ CUDA 代码：

#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cusolverSp.h>
#include <cusparse.h>
#include <vector>
#include <cassert>


// create poisson matrix with Dirichlet bc. of a rectangular grid with
// dimension NxN
void assemble_poisson_matrix_coo(std::vector<float>& vals, std::vector<int>& row, std::vector<int>& col,
                     std::vector<float>& rhs, int Nrows, int Ncols) {

        //nnz: 5 entries per row (node) for nodes in the interior
    // 1 entry per row (node) for nodes on the boundary, since we set them explicitly to 1.
    int nnz = 5*Nrows*Ncols - (2*(Ncols-1) + 2*(Nrows-1))*4;
    vals.resize(nnz);
    row.resize(nnz);
    col.resize(nnz);
    rhs.resize(Nrows*Ncols);

    int counter = 0;
    for(int i = 0; i < Nrows; ++i) {
        for (int j = 0; j < Ncols; ++j) {
            int idx = j + Ncols*i;
            if (i == 0 || j == 0 || j == Ncols-1 || i == Nrows-1) {
                vals[counter] = 1.;
                row[counter] = idx;
                col[counter] = idx;
                counter++;
                rhs[idx] = 1.;
//                if (i == 0) {
//                    rhs[idx] = 3.;
//                }
            } else { // -laplace stencil
                // above
                vals[counter] = -1.;
                row[counter] = idx;
                col[counter] = idx-Ncols;
                counter++;
                // left
                vals[counter] = -1.;
                row[counter] = idx;
                col[counter] = idx-1;
                counter++;
                // center
                vals[counter] = 4.;
                row[counter] = idx;
                col[counter] = idx;
                counter++;
                // right
                vals[counter] = -1.;
                row[counter] = idx;
                col[counter] = idx+1;
                counter++;
                // below
                vals[counter] = -1.;
                row[counter] = idx;
                col[counter] = idx+Ncols;
                counter++;

                rhs[idx] = 0;
            }
        }
    }
    assert(counter == nnz);
}



int main() {
    // --- create library handles:
    cusolverSpHandle_t cusolver_handle;
    cusolverStatus_t cusolver_status;
    cusolver_status = cusolverSpCreate(&cusolver_handle);
    std::cout << "status create cusolver handle: " << cusolver_status << std::endl;

    cusparseHandle_t cusparse_handle;
    cusparseStatus_t cusparse_status;
    cusparse_status = cusparseCreate(&cusparse_handle);
    std::cout << "status create cusparse handle: " << cusparse_status << std::endl;

    // --- prepare matrix:
    int Nrows = 4;
    int Ncols = 4;
    std::vector<float> csrVal;
    std::vector<int> cooRow;
    std::vector<int> csrColInd;
    std::vector<float> b;

    assemble_poisson_matrix_coo(csrVal, cooRow, csrColInd, b, Nrows, Ncols);

    int nnz = csrVal.size();
    int m = Nrows * Ncols;
    std::vector<int> csrRowPtr(m+1);

    // --- prepare solving and copy to GPU:
    std::vector<float> x(m);
    float tol = 1e-5;
    int reorder = 0;
    int singularity = 0;

    float *db, *dcsrVal, *dx;
    int *dcsrColInd, *dcsrRowPtr, *dcooRow;
    cudaMalloc((void**)&db, m*sizeof(float));
    cudaMalloc((void**)&dx, m*sizeof(float));
    cudaMalloc((void**)&dcsrVal, nnz*sizeof(float));
    cudaMalloc((void**)&dcsrColInd, nnz*sizeof(int));
    cudaMalloc((void**)&dcsrRowPtr, (m+1)*sizeof(int));
    cudaMalloc((void**)&dcooRow, nnz*sizeof(int));

    cudaMemcpy(db, b.data(), b.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dcsrVal, csrVal.data(), csrVal.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dcsrColInd, csrColInd.data(), csrColInd.size()*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dcooRow, cooRow.data(), cooRow.size()*sizeof(int), cudaMemcpyHostToDevice);

    cusparse_status = cusparseXcoo2csr(cusparse_handle, dcooRow, nnz, m,
                                       dcsrRowPtr, CUSPARSE_INDEX_BASE_ZERO);
    std::cout << "status cusparse coo2csr conversion: " << cusparse_status << std::endl;

    cudaDeviceSynchronize(); // matrix format conversion has to be finished!

    // --- everything ready for computation:

    cusparseMatDescr_t descrA;

    cusparse_status = cusparseCreateMatDescr(&descrA);
    std::cout << "status cusparse createMatDescr: " << cusparse_status << std::endl;

    // optional: print dense matrix that has been allocated on GPU

    std::vector<float> A(m*m, 0);
    float *dA;
    cudaMalloc((void**)&dA, A.size()*sizeof(float));

    cusparseScsr2dense(cusparse_handle, m, m, descrA, dcsrVal,
                       dcsrRowPtr, dcsrColInd, dA, m);

    cudaMemcpy(A.data(), dA, A.size()*sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "A: \n";
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < m; ++j) {
            std::cout << A[i*m + j] << " ";
        }
        std::cout << std::endl;
    }

    cudaFree(dA);

    std::cout << "b: \n";
    cudaMemcpy(b.data(), db, (m)*sizeof(int), cudaMemcpyDeviceToHost);
    for (auto a : b) {
        std::cout << a << ",";
    }
    std::cout << std::endl;


    // --- solving!!!!

//    cusolver_status = cusolverSpScsrlsvchol(cusolver_handle, m, nnz, descrA, dcsrVal,
//                       dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
//                       &singularity);

     cusolver_status = cusolverSpScsrlsvqr(cusolver_handle, m, nnz, descrA, dcsrVal,
                        dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
                        &singularity);

    cudaDeviceSynchronize();

    std::cout << "singularity (should be -1): " << singularity << std::endl;

    std::cout << "status cusolver solving (!): " << cusolver_status << std::endl;

    cudaMemcpy(x.data(), dx, m*sizeof(float), cudaMemcpyDeviceToHost);

    // relocated these 2 lines from above to solve (2):
    cusparse_status = cusparseDestroy(cusparse_handle);
    std::cout << "status destroy cusparse handle: " << cusparse_status << std::endl;

    cusolver_status = cusolverSpDestroy(cusolver_handle);
    std::cout << "status destroy cusolver handle: " << cusolver_status << std::endl;

    for (auto a : x) {
        std::cout << a << " ";
    }
    std::cout << std::endl;



    cudaFree(db);
    cudaFree(dx);
    cudaFree(dcsrVal);
    cudaFree(dcsrColInd);
    cudaFree(dcsrRowPtr);
    cudaFree(dcooRow);

    return 0;
}

Answer 1

关于 2：我们过早地销毁了 cusparse 句柄（可能微调太多而无法找到错误源....）。此外，密集格式是列优先的，这就是为什么我们需要转置 A 以使其正确打印！

关于问题 3：cusolverSpScsrlsvlu 目前仅存在于主机上 -- 它在 6.2.1 备注 5 下以非常明显的方式写在文档中.... http://docs.nvidia.com/cuda/cusolver/index.html#cusolver-lt-t-gt-csrlsvlu

Answer 2

1.cusolverSpScsrlsvchol returns wrong results for x: 1 3.33333 2.33333 1 3.33333 2.33333 1.33333 1 2.33333 1.33333 0.666667 1 1 1 1 1

你说：

A is positive definite and symmetric.

不，不是。它不是对称的。

cusolverSpcsrlsvqr() 不要求 A 矩阵对称。

cusolverSpcsrlsvchol() 确实有这个要求：

A is an m×m symmetric postive definite sparse matrix

这是您的代码为 A 矩阵提供的打印输出：

A:
1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
0  1  0  0  0 -1  0  0  0  0  0  0  0  0  0  0
0  0  1  0  0  0 -1  0  0  0  0  0  0  0  0  0
0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
0  0  0  0  1 -1  0  0  0  0  0  0  0  0  0  0
0  0  0  0  0  4 -1  0  0 -1  0  0  0  0  0  0
0  0  0  0  0 -1  4  0  0  0 -1  0  0  0  0  0
0  0  0  0  0  0 -1  1  0  0  0  0  0  0  0  0
0  0  0  0  0  0  0  0  1 -1  0  0  0  0  0  0
0  0  0  0  0 -1  0  0  0  4 -1  0  0  0  0  0
0  0  0  0  0  0 -1  0  0 -1  4  0  0  0  0  0
0  0  0  0  0  0  0  0  0  0 -1  1  0  0  0  0
0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0
0  0  0  0  0  0  0  0  0 -1  0  0  0  1  0  0
0  0  0  0  0  0  0  0  0  0 -1  0  0  0  1  0
0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1

如果那是对称的，我希望第二行：

0 1 0 0 0 -1 0 0 0 0 0 0 0 0 0 0

匹配第二列：

0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0

顺便提一下关于Stack Overflow的建议。如果您回答自己的问题，我的建议是您希望它是一个完整的答案。有些人可能会看到已回答的问题并跳过它。将此类内容编辑到您的问题中可能更好，从而将您的问题（我认为）集中到一个问题上。当您每个问题问多个问题时，我认为 SO 也不起作用。这种行为使问题不必要地变得更难回答，我认为它在这里对你没有好处。

Answer 3

虽然泊松方程的笛卡尔离散化得到的矩阵不是正定的，但本题涉及稀疏正定线性系统.

的反演

与此同时，cusolverSpScsrlsvchol 可用于设备通道，我认为这对潜在感兴趣的用户执行 稀疏正定线性系统 的反演很有用cuSPARSE 库。这是一个完整的示例：

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>

#include <cuda_runtime.h>
#include <cusparse_v2.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to 
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) { exit(code); }
   }
}

extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
    switch (error)
    {

        case CUSPARSE_STATUS_SUCCESS:
            return "CUSPARSE_STATUS_SUCCESS";

        case CUSPARSE_STATUS_NOT_INITIALIZED:
            return "CUSPARSE_STATUS_NOT_INITIALIZED";

        case CUSPARSE_STATUS_ALLOC_FAILED:
            return "CUSPARSE_STATUS_ALLOC_FAILED";

        case CUSPARSE_STATUS_INVALID_VALUE:
            return "CUSPARSE_STATUS_INVALID_VALUE";

        case CUSPARSE_STATUS_ARCH_MISMATCH:
            return "CUSPARSE_STATUS_ARCH_MISMATCH";

        case CUSPARSE_STATUS_MAPPING_ERROR:
            return "CUSPARSE_STATUS_MAPPING_ERROR";

        case CUSPARSE_STATUS_EXECUTION_FAILED:
            return "CUSPARSE_STATUS_EXECUTION_FAILED";

        case CUSPARSE_STATUS_INTERNAL_ERROR:
            return "CUSPARSE_STATUS_INTERNAL_ERROR";

        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";

        case CUSPARSE_STATUS_ZERO_PIVOT:
            return "CUSPARSE_STATUS_ZERO_PIVOT";
    }

    return "<unknown>";
}

inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
    if(CUSPARSE_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs",__FILE__, __LINE__,err, \
                                _cusparseGetErrorEnum(err)); \
        cudaDeviceReset(); assert(0); \
    }
}

extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }

/********/
/* MAIN */
/********/
int main()
{
    // --- Initialize cuSPARSE
    cusparseHandle_t handle;    cusparseSafeCall(cusparseCreate(&handle));

    const int Nrows = 4;                        // --- Number of rows
    const int Ncols = 4;                        // --- Number of columns
    const int N     = Nrows;

    // --- Host side dense matrix
    double *h_A_dense = (double*)malloc(Nrows*Ncols*sizeof(*h_A_dense));

    // --- Column-major ordering
    h_A_dense[0] = 0.4612f;  h_A_dense[4] = -0.0006f;   h_A_dense[8]  = 0.3566f; h_A_dense[12] = 0.0f; 
    h_A_dense[1] = -0.0006f; h_A_dense[5] = 0.4640f;    h_A_dense[9]  = 0.0723f; h_A_dense[13] = 0.0f; 
    h_A_dense[2] = 0.3566f;  h_A_dense[6] = 0.0723f;    h_A_dense[10] = 0.7543f; h_A_dense[14] = 0.0f; 
    h_A_dense[3] = 0.f;      h_A_dense[7] = 0.0f;       h_A_dense[11] = 0.0f;    h_A_dense[15] = 0.1f; 

    // --- Create device array and copy host array to it
    double *d_A_dense;  gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));

    // --- Descriptor for sparse matrix A
    cusparseMatDescr_t descrA;      cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseSetMatType     (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));  

    int nnz = 0;                                // --- Number of nonzero elements in dense matrix
    const int lda = Nrows;                      // --- Leading dimension of dense matrix
    // --- Device side number of nonzero elements per row
    int *d_nnzPerVector;    gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));
    // --- Host side number of nonzero elements per row
    int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
    for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
    printf("\n");

    // --- Device side dense matrix
    double *d_A;            gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
    int *d_A_RowIndices;    gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
    int *d_A_ColIndices;    gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));

    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));

    // --- Host side dense matrix
    double *h_A = (double *)malloc(nnz * sizeof(*h_A));     
    int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
    int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz*sizeof(*h_A), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));

    printf("\nOriginal matrix in CSR format\n\n");
    for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");

    for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);   

    // --- Allocating and defining dense host and device data vectors
    double *h_x = (double *)malloc(Nrows * sizeof(double)); 
    h_x[0] = 100.0;  h_x[1] = 200.0; h_x[2] = 400.0; h_x[3] = 500.0;

    double *d_x;        gpuErrchk(cudaMalloc(&d_x, Nrows * sizeof(double)));   
    gpuErrchk(cudaMemcpy(d_x, h_x, Nrows * sizeof(double), cudaMemcpyHostToDevice));

    /******************************************/
    /* STEP 1: CREATE DESCRIPTORS FOR L AND U */
    /******************************************/
    cusparseMatDescr_t      descr_L = 0; 
    cusparseSafeCall(cusparseCreateMatDescr (&descr_L)); 
    cusparseSafeCall(cusparseSetMatIndexBase    (descr_L, CUSPARSE_INDEX_BASE_ONE)); 
    cusparseSafeCall(cusparseSetMatType     (descr_L, CUSPARSE_MATRIX_TYPE_GENERAL)); 
    cusparseSafeCall(cusparseSetMatFillMode (descr_L, CUSPARSE_FILL_MODE_LOWER)); 
    cusparseSafeCall(cusparseSetMatDiagType (descr_L, CUSPARSE_DIAG_TYPE_NON_UNIT)); 

    /********************************************************************************************************/
    /* STEP 2: QUERY HOW MUCH MEMORY USED IN CHOLESKY FACTORIZATION AND THE TWO FOLLOWING SYSTEM INVERSIONS */
    /********************************************************************************************************/
    csric02Info_t info_A  = 0;  cusparseSafeCall(cusparseCreateCsric02Info(&info_A)); 
    csrsv2Info_t  info_L  = 0;  cusparseSafeCall(cusparseCreateCsrsv2Info (&info_L)); 
    csrsv2Info_t  info_Lt = 0;  cusparseSafeCall(cusparseCreateCsrsv2Info (&info_Lt)); 

    int pBufferSize_M, pBufferSize_L, pBufferSize_Lt;
    cusparseSafeCall(cusparseDcsric02_bufferSize(handle, N, nnz, descrA, d_A, d_A_RowIndices, d_A_ColIndices, info_A, &pBufferSize_M)); 
    cusparseSafeCall(cusparseDcsrsv2_bufferSize (handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nnz, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_L,  &pBufferSize_L)); 
    cusparseSafeCall(cusparseDcsrsv2_bufferSize (handle, CUSPARSE_OPERATION_TRANSPOSE,     N, nnz, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_Lt, &pBufferSize_Lt)); 

    int pBufferSize = max(pBufferSize_M, max(pBufferSize_L, pBufferSize_Lt)); 
    void *pBuffer = 0;  gpuErrchk(cudaMalloc((void**)&pBuffer, pBufferSize)); 

    /******************************************************************************************************/
    /* STEP 3: ANALYZE THE THREE PROBLEMS: CHOLESKY FACTORIZATION AND THE TWO FOLLOWING SYSTEM INVERSIONS */
    /******************************************************************************************************/
    int structural_zero; 

    cusparseSafeCall(cusparseDcsric02_analysis(handle, N, nnz, descrA, d_A, d_A_RowIndices, d_A_ColIndices, info_A, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer)); 

    cusparseStatus_t status = cusparseXcsric02_zeroPivot(handle, info_A, &structural_zero); 
    if (CUSPARSE_STATUS_ZERO_PIVOT == status){ printf("A(%d,%d) is missing\n", structural_zero, structural_zero); } 

    cusparseSafeCall(cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nnz, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_L,  CUSPARSE_SOLVE_POLICY_NO_LEVEL,  pBuffer)); 
    cusparseSafeCall(cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_TRANSPOSE,     N, nnz, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_Lt, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer)); 

    /*************************************/
    /* STEP 4: FACTORIZATION: A = L * L' */
    /*************************************/
    int numerical_zero; 

    cusparseSafeCall(cusparseDcsric02(handle, N, nnz, descrA, d_A, d_A_RowIndices, d_A_ColIndices, info_A, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer)); 
    status = cusparseXcsric02_zeroPivot(handle, info_A, &numerical_zero); 
    if (CUSPARSE_STATUS_ZERO_PIVOT == status){ printf("L(%d,%d) is zero\n", numerical_zero, numerical_zero); } 

    printf("\nNon-zero elements in Cholesky matrix\n\n");
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(double), cudaMemcpyDeviceToHost));
    for (int k=0; k<nnz; k++) printf("%f\n", h_A[k]);

    cusparseSafeCall(cusparseDcsr2dense(handle, Nrows, Ncols, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_A_dense, Nrows));

    printf("\nCholesky matrix\n\n");
    for(int i = 0; i < Nrows; i++) {
        std::cout << "[ ";
        for(int j = 0; j < Ncols; j++)
            std::cout << h_A_dense[i * Ncols + j] << " ";
        std::cout << "]\n";
    }

    /*********************/
    /* STEP 5: L * z = x */
    /*********************/
    // --- Allocating the intermediate result vector
    double *d_z;        gpuErrchk(cudaMalloc(&d_z, N * sizeof(double))); 

    const double alpha = 1.; 
    cusparseSafeCall(cusparseDcsrsv2_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, nnz, &alpha, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_L, d_x, d_z, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer)); 

    /**********************/
    /* STEP 5: L' * y = z */
    /**********************/
    // --- Allocating the host and device side result vector
    double *h_y = (double *)malloc(Ncols * sizeof(double)); 
    double *d_y;        gpuErrchk(cudaMalloc(&d_y, Ncols * sizeof(double))); 

    cusparseSafeCall(cusparseDcsrsv2_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, N, nnz, &alpha, descr_L, d_A, d_A_RowIndices, d_A_ColIndices, info_Lt, d_z, d_y, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer)); 

    cudaMemcpy(h_x, d_y, N * sizeof(double), cudaMemcpyDeviceToHost);
    printf("\n\nFinal result\n");
    for (int k=0; k<N; k++) printf("x[%i] = %f\n", k, h_x[k]);
}

Answer 4

求解稀疏正定线性系统的另一种可能性是使用 cuSOLVER 库，特别是 cusolverSpDcsrlsvchol 例程。它的工作原理与用于 Solving general sparse linear systems in CUDA 的 cuSOLVER 例程非常相似，但使用 Cholesky 分解 A = G * G^H，其中 G 是 Cholesky 因子，一个下三角矩阵。

至于Solving general sparse linear systems in CUDA和CUDA 10.0之后的例程，目前只有主播频道可用。请注意，reorder 参数无效，如果矩阵 A 是正定矩阵，则 singularity 为 -1。

下面是一个完整的示例：

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#include <cusparse.h>
#include <cusolverSp.h>

//https://www.physicsforums.com/threads/all-the-ways-to-build-positive-definite-matrices.561438/
//https://it.mathworks.com/matlabcentral/answers/101132-how-do-i-determine-if-a-matrix-is-positive-definite-using-matlab

/*******************/
/* iDivUp FUNCTION */
/*******************/
//extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__host__ __device__ int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to 
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}

extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cusolverGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
    case CUSOLVER_STATUS_SUCCESS:
        return "CUSOLVER_SUCCESS";

    case CUSOLVER_STATUS_NOT_INITIALIZED:
        return "CUSOLVER_STATUS_NOT_INITIALIZED";

    case CUSOLVER_STATUS_ALLOC_FAILED:
        return "CUSOLVER_STATUS_ALLOC_FAILED";

    case CUSOLVER_STATUS_INVALID_VALUE:
        return "CUSOLVER_STATUS_INVALID_VALUE";

    case CUSOLVER_STATUS_ARCH_MISMATCH:
        return "CUSOLVER_STATUS_ARCH_MISMATCH";

    case CUSOLVER_STATUS_EXECUTION_FAILED:
        return "CUSOLVER_STATUS_EXECUTION_FAILED";

    case CUSOLVER_STATUS_INTERNAL_ERROR:
        return "CUSOLVER_STATUS_INTERNAL_ERROR";

    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
        return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";

    }

    return "<unknown>";
}

inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSOLVE error in file '%s', line %d, error: %s \nterminating!\n", __FILE__, __LINE__, \
            _cusolverGetErrorEnum(err)); \
            assert(0); \
    }
}

extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }

/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
    switch (error)
    {

    case CUSPARSE_STATUS_SUCCESS:
        return "CUSPARSE_STATUS_SUCCESS";

    case CUSPARSE_STATUS_NOT_INITIALIZED:
        return "CUSPARSE_STATUS_NOT_INITIALIZED";

    case CUSPARSE_STATUS_ALLOC_FAILED:
        return "CUSPARSE_STATUS_ALLOC_FAILED";

    case CUSPARSE_STATUS_INVALID_VALUE:
        return "CUSPARSE_STATUS_INVALID_VALUE";

    case CUSPARSE_STATUS_ARCH_MISMATCH:
        return "CUSPARSE_STATUS_ARCH_MISMATCH";

    case CUSPARSE_STATUS_MAPPING_ERROR:
        return "CUSPARSE_STATUS_MAPPING_ERROR";

    case CUSPARSE_STATUS_EXECUTION_FAILED:
        return "CUSPARSE_STATUS_EXECUTION_FAILED";

    case CUSPARSE_STATUS_INTERNAL_ERROR:
        return "CUSPARSE_STATUS_INTERNAL_ERROR";

    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
        return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";

    case CUSPARSE_STATUS_ZERO_PIVOT:
        return "CUSPARSE_STATUS_ZERO_PIVOT";
    }

    return "<unknown>";
}

inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
    if (CUSPARSE_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs", __FILE__, __LINE__, err, \
            _cusparseGetErrorEnum(err)); \
            cudaDeviceReset(); assert(0); \
    }
}

extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }

/********/
/* MAIN */
/********/
int main()
{
    // --- Initialize cuSPARSE
    cusparseHandle_t handle;    cusparseSafeCall(cusparseCreate(&handle));

    const int Nrows = 4;                        // --- Number of rows
    const int Ncols = 4;                        // --- Number of columns
    const int N = Nrows;

    // --- Host side dense matrix
    double *h_A_dense = (double*)malloc(Nrows*Ncols*sizeof(*h_A_dense));

    // --- Column-major ordering
    h_A_dense[0] =  1.78;   h_A_dense[4] = 0.0;    h_A_dense[8]  =  0.1736; h_A_dense[12] = 0.0;
    h_A_dense[1] =  0.00;   h_A_dense[5] = 3.1;    h_A_dense[9]  =  0.0;    h_A_dense[13] = 0.0;
    h_A_dense[2] =  0.1736; h_A_dense[6] = 0.0;    h_A_dense[10] =  5.0;    h_A_dense[14] = 0.0;
    h_A_dense[3] =  0.00;   h_A_dense[7] = 0.0;    h_A_dense[11] =  0.0;    h_A_dense[15] = 2.349;

    //create device array and copy host to it
    double *d_A_dense;  gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));

    // --- Descriptor for sparse matrix A
    cusparseMatDescr_t descrA;      cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);

    int nnz = 0;                                // --- Number of nonzero elements in dense matrix
    const int lda = Nrows;                      // --- Leading dimension of dense matrix
    // --- Device side number of nonzero elements per row
    int *d_nnzPerVector;    gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));
    // --- Host side number of nonzero elements per row
    int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
    for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
    printf("\n");

    // --- Device side dense matrix
    double *d_A;            gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
    int *d_A_RowIndices;    gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
    int *d_A_ColIndices;    gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));

    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));

    // --- Host side dense matrix
    double *h_A = (double *)malloc(nnz * sizeof(*h_A));
    int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
    int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz*sizeof(*h_A), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));

    for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]); printf("\n");

    for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");

    for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);

    // --- Allocating and defining dense host and device data vectors
    double *h_y = (double *)malloc(Nrows * sizeof(double));
    h_y[0] = 1.0;  h_y[1] = 1.0; h_y[2] = 1.0; h_y[3] = 1.0;

    double *d_y;        gpuErrchk(cudaMalloc(&d_y, Nrows * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_y, h_y, Nrows * sizeof(double), cudaMemcpyHostToDevice));

    // --- Allocating the host and device side result vector
    double *h_x = (double *)malloc(Ncols * sizeof(double));
    double *d_x;        gpuErrchk(cudaMalloc(&d_x, Ncols * sizeof(double)));

    // --- CUDA solver initialization
    cusolverSpHandle_t solver_handle;
    cusolverSpCreate(&solver_handle);

    // --- Using Cholesky factorization
    int singularity;
    cusolveSafeCall(cusolverSpDcsrlsvcholHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));

    printf("Showing the results...\n");
    for (int i = 0; i < N; i++) printf("%f\n", h_x[i]);

}

在 CUDA 中求解稀疏正定线性系统

Solving sparse definite positive linear systems in CUDA

c++

cuda

linear-algebra

solver

cusolver