cublasDgetrfBatched 和 cublasDtrsmBatched 在使用 cuBLAS 求解线性系统数组时的矛盾

Contradiction of cublasDgetrfBatched and cublasDtrsmBatched when to solve array of linear systems using cuBLAS

我有很多密集的线性系统,我想用 cuBLAS Batched 格式求解。所以我的计划是

  1. 使用cublasDgetrfBatched进行批量LU分解

  2. 然后用cublasDtrsmBatched对batched lower triangle和batched upper triangle部分一一进行。

代码为

 #include<stdio.h>
 #include<stdlib.h>
 #include<cuda_runtime.h> 
 #include<device_launch_parameters.h>
 #include<cublas_v2.h>

 const int N = 32;
 const int Nmatrices = N;

 __global__ void initiate(double *d_A, double *d_B)
    {
     int i = threadIdx.x;       int j = blockIdx.x;

     int id = j*N*N + i*N;      int idb = j*N + i;  

     for(int k = 0; k< N ; k++)
         {
        d_A[id + k] = 0.0;

        if(k == i-2)    d_A[id + k] = 1.0; 
        if(k == i-1)    d_A[id + k] = 2.0; 
        if(k == i)      d_A[id + k] = 8.0;
        if(k == i+1)    d_A[id + k] = 2.0;
        if(k == i+2)    d_A[id + k] = 1.0;  
        } 
     d_B[idb] = 8.0;   
    }


int main()
 {
    cublasHandle_t handle;      cublasSafeCall(cublasCreate(&handle));

// Allocate device space for the input matrices
  double *d_A_sys; cudaMalloc((void**)&d_A_sys, N*N*Nmatrices*sizeof(double));
  double *d_B_sys; cudaMalloc((void**)&d_B_sys, N*Nmatrices  *sizeof(double));

// Allocate host space for the solution
  double *h_B_sys = (double *)malloc(N*Nmatrices*sizeof(double));

// kernel for initiat d_A_sys and d_B_sys
  initiate<<<Nmatrices, N>>>(d_A_sys, d_B_sys);

//Creating the array of pointers needed as input/output to the batched getrf
  double **h_A_pointers = (double **)malloc(Nmatrices*sizeof(double *));
  for (int i = 0; i < Nmatrices; i++) h_A_pointers[i] = d_A_sys + i*N*N;

  double **h_b_pointers = (double **)malloc(Nmatrices*sizeof(double *));
  for (int i = 0; i < Nmatrices; i++) h_B_pointers[i] = d_B_sys + i*N;

  double **d_A_pointers;
  cudaMalloc((void**)&d_A_pointers, Nmatrices*sizeof(double *));
  cudaMemcpy(d_A_pointers, h_A_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice);


  double **d_b_pointers;
  cudaMalloc((void**)&d_b_pointers, Nmatrices*sizeof(double *));
  cudaMemcpy(d_b_pointers, h_b_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice);

  int *d_InfoArrays; cudaMalloc((void**)&d_InfoArrays,  Nmatrices*sizeof(int));
  int *h_InfoArrays = (int *)malloc(Nmatrices*sizeof(int));

//Batched LU decomposition
   cublasDgetrfBatched(handle, N, d_A_pointers, N, NULL, d_InfoArrays, Nmatrices));

  //Batched Lower triangular part
  cublasDtrsmBatched(handle, 
                   CUBLAS_SIDE_LEFT, 
                   CUBLAS_FILL_MODE_LOWER,
                   CUBLAS_OP_N,
                   CUBLAS_DIAG_UNIT,
                   N,
                   N,
                   &alpha,
                   d_A_pointers,
                   N,
                   d_b_pointers,
                   N,
                   Nmatrices);

  //Batched Upper triangular part
  cublasDtrsmBatched(handle,
                   CUBLAS_SIDE_LEFT,
                   CUBLAS_FILL_MODE_UPPER,
                   CUBLAS_OP_N,
                   CUBLAS_DIAG_NON_UNIT,
                   N,
                   N,
                   &alpha,
                   d_A_pointers,
                   N,
                   d_b_pointers,
                   N,
                   Nmatrices);

  cudaMemcpy(h_B_sys, d_B_sys, N*Nmatrices*sizeof(double), cudaMemcpyDeviceToHost);
  printf("Print out the solutions \n");

  cublasDestroy(handle);
  gpuErrchk(cudaDeviceReset());
  return 0;
 }

cublasDgetrfBatched and cublasDtrsmBatched demand d_A_pointers should be in double type but when I execute, the later one giving me compiling error like this see the pic:

如何克服这个问题,有帮助吗?

您可以通过执行以下操作来解决 const 正确性问题:

const double **d_A_pointers;
cudaMalloc((void**)&d_A_pointers, Nmatrices*sizeof(double *));

....

//Batched LU decomposition
cublasDgetrfBatched(handle, N, const_cast<double**>(d_A_pointers), N, NULL, d_InfoArrays, Nmatrices);

即抛弃 cublasDgetrfBatched

中的常量

您在发布的代码中显然有误的另一件事是 cublasDtrsmBatched 调用中输入的维度。我相信你想要这样的东西:

//Batched Lower triangular part
cublasDtrsmBatched(handle, 
        CUBLAS_SIDE_LEFT, 
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        CUBLAS_DIAG_UNIT,
        N,
        1,
        &alpha,
        d_A_pointers,
        N,
        d_b_pointers,
        N,
        Nmatrices);

即您的示例的 RHS 矩阵的输入大小是 Nx1,而不是 NxN(您没有为每个分解矩阵解决 N 个 RHS 问题,只有一个)。

您的代码中可能存在其他问题(请注意,CUBLAS 与大多数参考 BLAS 实现一样,默认情况下需要按列主要顺序输入),并且由于其他原因,您在问题中发布的代码实际上并未编译,所以不能肯定地说更多。