Issues with cuBLAS performance on a batched complex linear system solver

I am new to CUDA and cuBLAS, and I have recently been trying to use the batched cuBLAS API to solve several systems of linear equations at once. Here is my code:

Each matrix has size N, and the number of matrices (the batch size) is numOfMat.

#include <stdio.h>
#include <stdlib.h>
#include <cstdio>
#include <iostream>
#include <chrono>
#include <random>
#include <cuda.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
#include <cuComplex.h>      // deal with complex numbers
#include <cuda_profiler_api.h>
    
using namespace std::chrono;

#define N 6
#define numOfMat 500000
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

int main() {
    std::random_device device;
    std::mt19937 generator(device());
    std::uniform_real_distribution<double> distribution(1., 5.);
    high_resolution_clock::time_point t1;
    high_resolution_clock::time_point t2;
    double duration = 0;

    // step 1: initialize the cuBLAS handle
    cublasHandle_t cublas_handle;
    cublasCreate_v2(&cublas_handle);
    cublasStatus_t stat;

    int* PivotArray;
    int* infoArray;
    cudaError_t cudaStatUnified1 = cudaSuccess;
    cudaError_t cudaStatUnified2 = cudaSuccess;
    const cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
    cudaStatUnified1 = cudaMallocManaged(&PivotArray, N * numOfMat * sizeof(int));
    cudaStatUnified2 = cudaMallocManaged(&infoArray, numOfMat * sizeof(int));
    if ((cudaSuccess != cudaStatUnified1) || (cudaSuccess != cudaStatUnified2))
        std::cout << "unified memory allocation failed!" << std::endl;

    //ALLOCATE MEMORY - using unified memory
    cuDoubleComplex** h_A;
    cudaMallocManaged(&h_A, sizeof(cuDoubleComplex*) * numOfMat);
    cudaMallocManaged(&(h_A[0]), sizeof(cuDoubleComplex)*numOfMat*N*N);
    for (int nm = 1; nm < numOfMat; nm++)
        h_A[nm] = h_A[nm-1]+ N * N;

    cuDoubleComplex** h_b;
    cudaMallocManaged(&h_b, sizeof(cuDoubleComplex*) * numOfMat);
    cudaMallocManaged(&(h_b[0]), sizeof(cuDoubleComplex) * numOfMat * N);
    for (int nm = 1; nm < numOfMat; nm++)
        h_b[nm] = h_b[nm-1] + N;

    // FILL MATRICES
    for (int nm = 0; nm < numOfMat; nm++)
      for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
          h_A[nm][j * N + i] = make_cuDoubleComplex(distribution(generator), distribution(generator));

    // FILL COEFFICIENTS
    for (int nm = 0; nm < numOfMat; nm++)
      for (int i = 0; i < N; i++)
        h_b[nm][i] = make_cuDoubleComplex(distribution(generator), distribution(generator));

    t1 = high_resolution_clock::now();

    // step 2: batched LU factorization (cublasZgetrfBatched)
    stat = cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, numOfMat);
    if (stat != CUBLAS_STATUS_SUCCESS) printf ("cublasZgetrfBatched failed\n");
    gpuErrchk( cudaDeviceSynchronize() );
    // check if the input matrix is singular
    /*for (int i = 0; i < numOfMat; i++)
      if (infoArray[i] != 0) {
        fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
      }*/

    // step 3: forward and backward substitution with the LU factors
    // --- Each cublasZtrsmBatched call solves a triangular system for every matrix in the batch
    // --- The right-hand sides in h_b are overwritten with the result
    stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
    if (stat != CUBLAS_STATUS_SUCCESS) printf ("cublasZtrsmBatched (lower) failed\n");
    gpuErrchk( cudaDeviceSynchronize() );

    stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
    if (stat != CUBLAS_STATUS_SUCCESS) printf ("cublasZtrsmBatched (upper) failed\n");
    gpuErrchk( cudaDeviceSynchronize() );

    t2 = high_resolution_clock::now();
    duration = duration_cast<microseconds>(t2 - t1).count();
    std::cout<<duration<<std::endl;
}
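As a side note (not something the post above uses): cuBLAS also provides cublasZgetrsBatched, which performs both triangular solves in a single call and applies the row pivots recorded in PivotArray during the factorization. A minimal sketch of swapping it in for steps 2 and 3, assuming the same handle and arrays as in the code above, could look roughly like this:

    // Sketch only: replace the two Ztrsm calls with a single getrs call.
    // Unlike the Ztrsm path, this also applies the row pivots from PivotArray.
    int info = 0;   // host-side status (reports invalid parameters)
    stat = cublasZgetrsBatched(cublas_handle, CUBLAS_OP_N, N, 1,
                               h_A, N, PivotArray, h_b, N, &info, numOfMat);
    if (stat != CUBLAS_STATUS_SUCCESS || info != 0) printf ("cublasZgetrsBatched failed\n");
    gpuErrchk( cudaDeviceSynchronize() );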

The code runs fine, but when I plot the computation time against the number of matrices, the curve looks like this:

My question is: why does the computation time scale linearly with the number of matrices? Intuitively, I would expect the curve to flatten out once the batch size is reasonably large, yet even at a batch size of 500,000 the time still grows linearly with the batch size.

How is this possible? Is there an explanation for this behavior?

I think you need to look at your data more carefully. If I run a modified version of your code on Google Colab (a Tesla T4), I get this:

That looks a lot like your plot. But look more closely (a log scale helps):
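For reference, a sweep like the one behind that plot can be produced with a loop along the following lines. This is a minimal sketch that assumes the handle, arrays, alpha, and gpuErrchk macro from the question's code are already set up; it simply re-times the same factorize-and-solve sequence for increasing batch sizes (each factorization overwrites h_A, so for precise numbers the matrices should be re-filled before every iteration):

    // Minimal sketch of a batch-size sweep, suitable for a log-log plot of time vs. batch size.
    for (int batch = 1; batch <= numOfMat; batch *= 2) {
        auto start = high_resolution_clock::now();

        cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, batch);
        cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
                           CUBLAS_DIAG_UNIT, N, 1, &alpha, h_A, N, h_b, N, batch);
        cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                           CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, h_A, N, h_b, N, batch);
        gpuErrchk( cudaDeviceSynchronize() );

        auto stop = high_resolution_clock::now();
        std::cout << batch << "\t" << duration_cast<microseconds>(stop - start).count() << std::endl;
    }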

You can clearly see that, up to a certain point (roughly 2^6 = 64 here), the run time is largely independent of the number of matrices, but beyond that the scaling becomes linear. This is the transition from being able to run the whole workload in parallel to hitting the parallel capacity of the device, at which point groups of parallel operations have to be scheduled one after another to get through the workload. You could infer that this particular GPU runs out of parallel capacity somewhere between 64 and 128 concurrent operations (the T4 has 40 SMs, so it is plausibly 80 if each SM can hold 2 of these operations at once), after which the run time scales in multiples of that limiting size.
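If you want to check that limit on your own card, the SM count the estimate above relies on can be queried directly with the CUDA runtime; a small sketch (reusing the gpuErrchk macro from the question):

    // Query the number of streaming multiprocessors on device 0.
    // On a Tesla T4 this prints 40, the figure used in the estimate above.
    cudaDeviceProp prop;
    gpuErrchk( cudaGetDeviceProperties(&prop, 0) );
    std::cout << prop.name << ": " << prop.multiProcessorCount << " SMs" << std::endl;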

This is entirely normal behavior for any parallel computing architecture I am familiar with.