执行大量 4x4 矩阵求逆 - PyCuda

Question

我正在寻找 Python 的解决方案来执行矩阵求逆。我认为应该有一种方法可以使用 CUBLAS 或 MAGMA 以批处理或并发模式执行这些操作，因为每个矩阵都独立于其他矩阵。

所以我正在寻找针对这个特定问题的反馈，看看 CUBLAS 或 MAGMA 是否有执行此批处理或并行执行的解决方案。

我认为这里提出的计算对于 GPU 来说应该是理想的。

我必须找到范围为 (integ_prec,integ_prec) 的 2D 范围内核，其中内核对给定的全局项执行 4x4 矩阵求逆。

如何实现这个内核代码？我已经测试了 NVIDIA 开发人员提供的 batch_solver，但我无法让它工作。

更新 1

为了回答@Robert Crovella，我尝试使用 BatchSolver 来自 NVIDIA 开发人员（版本 BatchedSolver_v1_1）。

你可以在下面看到我在编译过程中收到的警告：

$ make
nvcc -O3  -arch=sm_35 -DKEPLER2  -o example_batch_solver example.c solve.cu inverse.cu
In file included from solve.cu:41:
./operations.h:31:2: warning: 'OPERATIONS_H_' is used as a header guard here, followed by #define of a different macro [-Wheader-guard]
#if !defined(OPERATIONS_H_)
 ^~
./operations.h:32:9: note: 'OPERATIONS_SOLVE_H_' is defined here; did you mean 'OPERATIONS_H_'?
#define OPERATIONS_SOLVE_H_
        ^~~~~~~~~~~~~~~~~~~
        OPERATIONS_H_
1 warning generated.
In file included from solve.cu:41:
./operations.h:31:2: warning: 'OPERATIONS_H_' is used as a header guard here, followed by #define of a different macro [-Wheader-guard]
#if !defined(OPERATIONS_H_)
 ^~
./operations.h:32:9: note: 'OPERATIONS_SOLVE_H_' is defined here; did you mean 'OPERATIONS_H_'?
#define OPERATIONS_SOLVE_H_
        ^~~~~~~~~~~~~~~~~~~
        OPERATIONS_H_
1 warning generated.
In file included from inverse.cu:44:
./operations.h:31:2: warning: 'OPERATIONS_H_' is used as a header guard here, followed by #define of a different macro [-Wheader-guard]
#if !defined(OPERATIONS_H_)
 ^~
./operations.h:32:9: note: 'OPERATIONS_SOLVE_H_' is defined here; did you mean 'OPERATIONS_H_'?
#define OPERATIONS_SOLVE_H_
        ^~~~~~~~~~~~~~~~~~~
        OPERATIONS_H_
1 warning generated.

In file included from inverse.cu:44:
./operations.h:31:2: warning: 'OPERATIONS_H_' is used as a header guard here, followed by #define of a different macro [-Wheader-guard]
#if !defined(OPERATIONS_H_)
 ^~
./operations.h:32:9: note: 'OPERATIONS_SOLVE_H_' is defined here; did you mean 'OPERATIONS_H_'?
#define OPERATIONS_SOLVE_H_
        ^~~~~~~~~~~~~~~~~~~
        OPERATIONS_H_
1 warning generated.

不幸的是，执行给出了错误的结果：

Non-batched matrix inversion

        3.000000   1.000000   1.000000             nan  -19945373249087470322107824313046586886748897396355850773313316907920980812816123986073723926411981165664742747916855157931798956499818437291518879567207778108249202114071816066955302634366146096749274721347289725502062211559628338200162202651585616465674552041292175081655027073691104118308864.000000  -25949369271932562088528097628985580835309378491979298170251656488819244813241392783541154149164125403081303093429316785499097407170772831834462454013755392.000000
etc ...

因此，为了避免这些警告，我将宏 OPERATIONS_SOLVE_H 替换为 OPERATIONS_H_ 到 operations.h 文件中。编译期间不再出现警告，但执行时仍然出现错误结果（与上述相同）。

有人对这个 Batchsolver 有同样的问题吗（在 MacOS 10.13.5 上 NVIDIA driver 387.10.10.10.35.106 和 CUDA-10.0）？

Answer 1

正如评论中提到的，numpy 函数通常不能在 pycuda 内核代码（或 CUDA 内核代码，或 numba cuda 内核）中使用。

CUBLAS 提供 batched matrix inversion function, but it is not currently exposed in either pyculib cublas interface or scikit-cuda cublas interface.

我们可以继续实现我们自己的接口（例如使用 python ctypes），但由于已知要反转的矩阵是 4x4，我认为 talonmies 评论中的建议是一个有趣的。参考答案here，有一个相当简洁的C代码来对4x4矩阵进行直接求逆。

接下来首先是在CUDA中的实现。函数 inv4x4 是对之前代码的改编，为每个矩阵分配 16 个线程（每个矩阵元素一个）并使用该代码作为模型。每个线程负责计算一个结果矩阵元素。首先我们将它与 CUBLAS matinvBatched 进行性能比较：

$ cat t411.cu
#include <iostream>
#include <cublas_v2.h>
#include <cstdlib>
// 4x4 matrix inversion
// 

// assumes warp size is 32
// assumes block size is multiple of warp size
// therefore assumes number of matrices to be inverted (n) is even
// 16 threads per matrix to invert

const unsigned block_size = 256;
typedef float mt;

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

__device__ unsigned pat[3][16];
const unsigned hpat[3][16] = {
{ 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50},
{ 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14},
{ 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618}};

__device__ unsigned getoff(unsigned &off){
  unsigned ret = off & 0x0F;
  off = off >> 4;
  return ret;
}

const unsigned tmsk = 0xFFFFFFFF;
// in-place is acceptable i.e. out == in)
// T = float or double only
template <typename T>
__global__ void inv4x4(const T * __restrict__ in, T * __restrict__ out, const size_t n){

  __shared__ T si[block_size];
  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < n*16){
    si[threadIdx.x] = in[idx];
    unsigned lane = threadIdx.x & 15;
    unsigned sibase = threadIdx.x & 0x03F0;
    __syncwarp();
    unsigned off = pat[0][lane];
    T a,b;
    a  = si[sibase + getoff(off)];
    a *= si[sibase + getoff(off)];
    a *= si[sibase + getoff(off)];
    if (!getoff(off)) a = -a;
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    off = pat[1][lane];
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    off = pat[2][lane];
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    T det = si[sibase + (lane>>2)]*a;
    det += __shfl_down_sync(tmsk, det, 4, 16); // first add
    det += __shfl_down_sync(tmsk, det, 8, 16); // second add
    det =  __shfl_sync(tmsk, det, 0, 16); // broadcast
    out[idx] = a / det;
  }
}

size_t nr = 2048;
int main(int argc, char *argv[]){
  if (argc > 1) nr = atoi(argv[1]);

  const mt m1[] = {1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0};
  const mt i1[] = {-3.0, -0.5, 1.5, 1.0, 1.0, 0.25, -0.25, -0.5, 3.0, 0.25, -1.25, -0.5, -3.0, 0.0, 1.0, 1.0};
  const mt m2[] = {1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0};
  const mt i2[] = {1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0};

  mt *h_d, *d_d;
  h_d = (mt *)malloc(nr*2*16*sizeof(mt));
  cudaMalloc(&d_d, nr*2*16*sizeof(mt));
  cudaMemcpyToSymbol(pat, hpat, 3*16*sizeof(unsigned));
  for (int i = 0; i < nr; i++){
    memcpy(h_d+i*16*2, m1, sizeof(m1));
    memcpy(h_d+i*16*2+16, m2, sizeof(m2));}
  cudaMemcpy(d_d, h_d, nr*2*16*sizeof(mt), cudaMemcpyHostToDevice);
  long long t = dtime_usec(0);
  inv4x4<<<nr*2*16/block_size, block_size>>>(d_d, d_d, nr*2);
  cudaDeviceSynchronize();
  t = dtime_usec(t);
  cudaMemcpy(h_d, d_d, nr*2*16*sizeof(mt), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 2; i++){
    for (int j = 0; j < 16; j++) std::cout << h_d[i*16 + j] << ",";
    std::cout << std::endl;
    for (int j = 0; j < 16; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
    std::cout << std::endl;}
  std::cout << "kernel time: " << t << " microseconds" << std::endl;
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
  //cublas
  for (int i = 0; i < nr; i++){
    memcpy(h_d+i*16*2, m1, sizeof(m1));
    memcpy(h_d+i*16*2+16, m2, sizeof(m2));}
  cudaMemcpy(d_d, h_d, nr*2*16*sizeof(mt), cudaMemcpyHostToDevice);
  cublasHandle_t h;
  cublasStatus_t cs = cublasCreate(&h);
  if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas create error" << std::endl;
  mt **A, **Ai, *Aid, **Ap, **Aip;
  A  = (mt **)malloc(nr*2*sizeof(mt *));
  Ai = (mt **)malloc(nr*2*sizeof(mt *));
  cudaMalloc(&Aid, nr*2*16*sizeof(mt));
  cudaMalloc(&Ap,  nr*2*sizeof(mt *));
  cudaMalloc(&Aip, nr*2*sizeof(mt *));
  for (int i = 0; i < nr*2; i++) A[i]  =  d_d + 16*i;
  for (int i = 0; i < nr*2; i++) Ai[i] =  Aid + 16*i;
  cudaMemcpy(Ap, A, nr*2*sizeof(mt *), cudaMemcpyHostToDevice);
  cudaMemcpy(Aip, Ai, nr*2*sizeof(mt *), cudaMemcpyHostToDevice);
  int *info;
  cudaMalloc(&info, nr*2*sizeof(int));
  t = dtime_usec(0);
  cs = cublasSmatinvBatched(h, 4,  Ap, 4, Aip, 4, info, nr*2);
  if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas matinv error" << std::endl;
  cudaDeviceSynchronize();
  t = dtime_usec(t);
  cudaMemcpy(h_d, Aid, nr*2*16*sizeof(mt), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 2; i++){
    for (int j = 0; j < 16; j++) std::cout << h_d[i*16 + j] << ",";
    std::cout << std::endl;
    for (int j = 0; j < 16; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
    std::cout << std::endl;}
  std::cout << "cublas time: " << t << " microseconds" << std::endl;
  err = cudaGetLastError();
  if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
  return 0;
}
$ nvcc -o t411 t411.cu -lcublas
$ ./t411
-3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,-0,1,1,
-3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,0,1,1,
1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
kernel time: 49 microseconds
-3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,0,1,1,
-3,-0.5,1.5,1,1,0.25,-0.25,-0.5,3,0.25,-1.25,-0.5,-3,0,1,1,
1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,
cublas time: 95 microseconds
$

我们看到代码似乎为 2 个测试矩阵求逆提供了正确的结果，在 Tesla P100 上求逆 4096 个矩阵的总时间约为 50us，比 CUBLAS 快约 2 倍。 请注意，我并未详尽测试此代码。

接下来是类似功能的简单pycuda实现。在这里，为简单起见，我们只是反转 2 个矩阵：

$ cat t10.py
import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel
kernel = SourceModule("""

__device__ unsigned getoff(unsigned &off){
  unsigned ret = off & 0x0F;
  off = off >> 4;
  return ret;
}

const int block_size = 256;
const unsigned tmsk = 0xFFFFFFFF;
// in-place is acceptable i.e. out == in)
// T = float or double only
typedef float T;
__global__ void inv4x4(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){

  __shared__ T si[block_size];
  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < n*16){
    si[threadIdx.x] = in[idx];
    unsigned lane = threadIdx.x & 15;
    unsigned sibase = threadIdx.x & 0x03F0;
    __syncwarp();
    unsigned off = pat[lane];
    T a,b;
    a  = si[sibase + getoff(off)];
    a *= si[sibase + getoff(off)];
    a *= si[sibase + getoff(off)];
    if (!getoff(off)) a = -a;
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    off = pat[lane+16];
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    off = pat[lane+32];
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else a -=b;
    T det = si[sibase + (lane>>2)]*a;
    det += __shfl_down_sync(tmsk, det, 4, 16); // first add
    det += __shfl_down_sync(tmsk, det, 8, 16); // second add
    det =  __shfl_sync(tmsk, det, 0, 16); // broadcast
    out[idx] = a / det;
  }
}

""")
# python function for inverting 4x4 matrices
# n should be an even number
def gpuinv4x4(inp, n):
    # internal constants not to be modified
    hpat = ( 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50, 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14, 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618)
    # Convert parameters into numpy array
    inpd = np.array(inp, dtype=np.float32)
    hpatd = np.array(hpat, dtype=np.uint32)
    output = np.empty((n*16), dtype= np.float32)
    # Get kernel function
    matinv4x4 = kernel.get_function("inv4x4")
    # Define block, grid and compute
    blockDim = (256,1,1) # do not change
    gridDim = ((n/16)+1,1,1)
    # Kernel function
    matinv4x4 (
        cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
        block=blockDim, grid=gridDim)
    return output
#example/test case
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv4x4(inp, n)
print(result)
$ python t10.py
[-3.   -0.5   1.5   1.    1.    0.25 -0.25 -0.5   3.    0.25 -1.25 -0.5  -3.
 -0.    1.    1.    1.    0.    0.    0.    0.    1.    0.    0.    0.    0.
  1.    0.    0.    0.    0.    1.  ]
$

我创建这个 pycuda 测试用例的时间很少，所以请将其视为粗略的演示工具。

我怀疑如果您在 CUDA 中唯一需要做的就是反转这些矩阵，那么这将不是一个有趣或有吸引力的用例。我预计将数据传输到设备和 return 结果返回的成本将超过使用 GPU 与普通 numpy 相比的任何加速优势。但是我还没有测试或基准化一个 numpy 案例。

注意使用__syncwarp()表示此内核代码需要CUDA 9.0或更高版本。

另请注意，代码需要偶数个矩阵进行求逆。如果你没有偶数，用任何值填充你的数组到下一个偶数矩阵。

另请注意，代码只是假设矩阵是可逆的。没有测试来查看它们是否不是，例如，如果计算的行列式为零，则矩阵将不可逆（使用此方法）并且由于被零除，结果通常为 NaN。

不清楚这里的目的是什么，因此不应将此示例解释为暗示一般矩阵求逆是一个好主意或针对特定问题的正确解决方法。

可能更好的 pythonic 在 GPU 上求逆密集矩阵的方法是使用 cupy

执行大量 4x4 矩阵求逆 - PyCuda

Performing high number of 4x4 matrix inversion - PyCuda

python

matrix

matrix-inverse

更新 1