调整现有代码和内核代码以执行大量 3x3 矩阵求逆
Adapt existing code and Kernel code to perform a high number of 3x3 matrix inversion
继上一个问题()之后,考虑到 4x4 矩阵的求逆,我想做同样的事情,但使用 3x3 矩阵。正如@Robert Crovella 所说,此更改意味着完全重写。
鉴于下面显示的代码,我尝试测试一些东西,比如用零代替值,但这种方法似乎不起作用。
这是用于大量 4x4 矩阵求逆的代码:
$ cat t10.py
import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel
kernel = SourceModule("""
__device__ unsigned getoff(unsigned &off){
unsigned ret = off & 0x0F;
off = off >> 4;
return ret;
}
const int block_size = 256;
const unsigned tmsk = 0xFFFFFFFF;
// in-place is acceptable i.e. out == in)
// T = float or double only
typedef float T;
__global__ void inv4x4(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){
__shared__ T si[block_size];
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < n*16){
si[threadIdx.x] = in[idx];
unsigned lane = threadIdx.x & 15;
unsigned sibase = threadIdx.x & 0x03F0;
__syncwarp();
unsigned off = pat[lane];
T a,b;
a = si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
if (!getoff(off)) a = -a;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
off = pat[lane+16];
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
off = pat[lane+32];
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
T det = si[sibase + (lane>>2)]*a;
det += __shfl_down_sync(tmsk, det, 4, 16); // first add
det += __shfl_down_sync(tmsk, det, 8, 16); // second add
det = __shfl_sync(tmsk, det, 0, 16); // broadcast
out[idx] = a / det;
}
}
""")
# python function for inverting 4x4 matrices
# n should be an even number
def gpuinv4x4(inp, n):
# internal constants not to be modified
hpat = ( 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50, 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14, 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618)
# Convert parameters into numpy array
inpd = np.array(inp, dtype=np.float32)
hpatd = np.array(hpat, dtype=np.uint32)
output = np.empty((n*16), dtype= np.float32)
# Get kernel function
matinv4x4 = kernel.get_function("inv4x4")
# Define block, grid and compute
blockDim = (256,1,1) # do not change
gridDim = ((n/16)+1,1,1)
# Kernel function
matinv4x4 (
cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
block=blockDim, grid=gridDim)
return output
#example/test case
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv4x4(inp, n)
print(result.reshape(2,4,4))
$ python t10.py
[[-3. -0.5 1.5 1. ]
[ 1. 0.25 -0.25 -0.5 ]
[ 3. 0.25 -1.25 -0.5 ]
[-3. -0. 1. 1. ]]
[[ 1. 0. 0. 0. ]
[ 0. 1. 0. 0. ]
[ 0. 0. 1. 0. ]
[ 0. 0. 0. 1. ]]
我期待相同的行为,除了我不再使用 4x4 矩阵而是使用 3x3 矩阵。
如何调整上面的代码以使用 3x3 矩阵求逆?
更新 1
这里是我做的修改。
我修改了维度并使用了@Robert Crovella (https://ardoris.wordpress.com/2008/07/18/general-formula-for-the-inverse-of-a-3x3-matrix/) 给出的 link 中的直接公式。下面修改代码:
import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel of 3x3 inversion
kernel_3x3 = SourceModule("""
// in-place is acceptable i.e. out == in)
// T = float or double only
typedef float T;
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){
size_t ix = threadIdx.x;
size_t idx = ix + blockDim.x*blockIdx.x;
if (ix < n*9){
T det = in[0+idx]*(in[4+idx]*in[8+idx]-in[7+idx]*in[5+idx]) - in[1+idx]*(in[3+idx]*in[8+idx]-in[6+idx]*in[5+idx]) + in[2+idx]*(in[3+idx]*in[7+idx]-in[6+idx]*in[4+idx]);
out[0+idx] = (in[4+idx]*in[8+idx]-in[7+idx]*in[5+idx])/det;
out[1+idx] = (in[2+idx]*in[7+idx]-in[1+idx]*in[8+idx])/det;
out[2+idx] = (in[1+idx]*in[5+idx]-in[2+idx]*in[4+idx])/det;
out[3+idx] = (in[6+idx]*in[5+idx]-in[3+idx]*in[8+idx])/det;
out[4+idx] = (in[0+idx]*in[8+idx]-in[2+idx]*in[6+idx])/det;
out[5+idx] = (in[2+idx]*in[3+idx]-in[0+idx]*in[5+idx])/det;
out[6+idx] = (in[3+idx]*in[7+idx]-in[4+idx]*in[6+idx])/det;
out[7+idx] = (in[1+idx]*in[6+idx]-in[0+idx]*in[7+idx])/det;
out[8+idx] = (in[0+idx]*in[4+idx]-in[1+idx]*in[3+idx])/det;
__syncwarp();
}
}
""")
def gpuinv3x3 (inp, n):
# internal constants not to be modified
hpat = ( 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50, 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14, 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618)
# Convert parameters into numpy array
inpd = np.array(inp, dtype=np.float32)
hpatd = np.array(hpat, dtype=np.uint32)
output = np.empty((n*9), dtype= np.float32)
# Get kernel function
matinv3x3 = kernel_3x3.get_function("inv3x3")
# Define block, grid and compute
blockDim = (81,1,1) # do not change
gridDim = ((n/9)+1,1,1)
# Kernel function
matinv3x3 (
cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
block=blockDim, grid=gridDim)
return output
#example/test case
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv3x3(inp, n)
print(result.reshape(2,3,3))
第一个矩阵正确反转但第二个矩阵不正确(具有单位矩阵作为逆的单位矩阵):
[[[ 2. -0. -1. ]
[-1. -0.33333334 1. ]
[-0. 0.33333334 -0. ]]
[[ 1. -0.5 -0. ]
[ -inf 1. -1. ]
[ nan nan 1. ]]]
所以,这个问题似乎不是来自内核代码,而是批量大小或与全局一维数组维度类似的东西(在我的代码中,你可以看到 2 个 3x3 矩阵被格式化为一维数组18 个元素 (inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
)).
这段代码有什么问题?特别是第二个矩阵的逆反问题。最后一点,奇怪的组大小并不意味着 GPU 处理有问题?
此答案将紧跟, both in terms of answer layout and calculation method/kernel design. The formulas are described here。
首先,和以前一样,我们将展示一个 CUDA C++ 版本并与 cublas 进行比较:
$ cat t432.cu
#include <iostream>
#include <cublas_v2.h>
#include <cstdlib>
// 3x3 matrix inversion
//
// https://ardoris.wordpress.com/2008/07/18/general-formula-for-the-inverse-of-a-3x3-matrix/
// 9 threads per matrix to invert
// 32 matrices per 288 thread block
const unsigned block_size = 288;
typedef double mt;
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__device__ unsigned pat[9];
const unsigned hpat[9] = {0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140};
__device__ unsigned getoff(unsigned &off){
unsigned ret = off & 0x0F;
off >>= 4;
return ret;
}
// in-place is acceptable i.e. out == in)
// T = float or double only
template <typename T>
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n){
__shared__ T si[block_size];
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
T det = 1;
if (idx < n*9)
det = in[idx];
unsigned sibase = (threadIdx.x / 9)*9;
unsigned lane = threadIdx.x - sibase; // cheaper modulo
si[threadIdx.x] = det;
__syncthreads();
unsigned off = pat[lane];
T a = si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
T b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
a -= b;
__syncthreads();
if (lane == 0) si[sibase+3] = a;
if (lane == 3) si[sibase+4] = a;
if (lane == 6) si[sibase+5] = a;
__syncthreads();
det = si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
if (idx < n*9)
out[idx] = a / det;
}
size_t nr = 2048;
int main(int argc, char *argv[]){
if (argc > 1) nr = atoi(argv[1]);
const mt m2[] = {1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0};
const mt i2[] = {2.0, 0.0, -1.0, -1.0, -0.33333334, 1.0, 0.0, 0.33333334, 0.0};
const mt m1[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};
const mt i1[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};
mt *h_d, *d_d;
h_d = (mt *)malloc(nr*9*sizeof(mt));
cudaMalloc(&d_d, nr*9*sizeof(mt));
cudaMemcpyToSymbol(pat, hpat, 9*sizeof(unsigned));
for (int i = 0; i < nr/2; i++){
memcpy(h_d+i*2*9, m1, sizeof(m1));
memcpy(h_d+i*2*9+9, m2, sizeof(m2));}
cudaMemcpy(d_d, h_d, nr*9*sizeof(mt), cudaMemcpyHostToDevice);
long long t = dtime_usec(0);
inv3x3<<<((nr*9)/block_size)+1, block_size>>>(d_d, d_d, nr);
cudaDeviceSynchronize();
t = dtime_usec(t);
cudaMemcpy(h_d, d_d, nr*9*sizeof(mt), cudaMemcpyDeviceToHost);
for (int i = 0; i < 2; i++){
for (int j = 0; j < 9; j++) std::cout << h_d[i*9 + j] << ",";
std::cout << std::endl;
for (int j = 0; j < 9; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
std::cout << std::endl;}
std::cout << "kernel time: " << t << " microseconds" << std::endl;
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
//cublas
for (int i = 0; i < nr/2; i++){
memcpy(h_d+i*2*9, m1, sizeof(m1));
memcpy(h_d+i*2*9+9, m2, sizeof(m2));}
cudaMemcpy(d_d, h_d, nr*9*sizeof(mt), cudaMemcpyHostToDevice);
cublasHandle_t h;
cublasStatus_t cs = cublasCreate(&h);
if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas create error" << std::endl;
mt **A, **Ai, *Aid, **Ap, **Aip;
A = (mt **)malloc(nr*sizeof(mt *));
Ai = (mt **)malloc(nr*sizeof(mt *));
cudaMalloc(&Aid, nr*9*sizeof(mt));
cudaMalloc(&Ap, nr*sizeof(mt *));
cudaMalloc(&Aip, nr*sizeof(mt *));
for (int i = 0; i < nr; i++) A[i] = d_d + 9*i;
for (int i = 0; i < nr; i++) Ai[i] = Aid + 9*i;
cudaMemcpy(Ap, A, nr*sizeof(mt *), cudaMemcpyHostToDevice);
cudaMemcpy(Aip, Ai, nr*sizeof(mt *), cudaMemcpyHostToDevice);
int *info;
cudaMalloc(&info, nr*sizeof(int));
t = dtime_usec(0);
cs = cublasDmatinvBatched(h, 3, Ap, 3, Aip, 3, info, nr);
if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas matinv error" << std::endl;
cudaDeviceSynchronize();
t = dtime_usec(t);
cudaMemcpy(h_d, Aid, nr*9*sizeof(mt), cudaMemcpyDeviceToHost);
for (int i = 0; i < 2; i++){
for (int j = 0; j < 9; j++) std::cout << h_d[i*9 + j] << ",";
std::cout << std::endl;
for (int j = 0; j < 9; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
std::cout << std::endl;}
std::cout << "cublas time: " << t << " microseconds" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
return 0;
}
$ nvcc -o t432 t432.cu -lcublas
$ ./t432
1,0,0,0,1,0,0,0,1,
1,0,0,0,1,0,0,0,1,
2,-0,-1,-1,-0.333333,1,-0,0.333333,-0,
2,0,-1,-1,-0.333333,1,0,0.333333,0,
kernel time: 59 microseconds
1,0,0,0,1,0,0,0,1,
1,0,0,0,1,0,0,0,1,
2,0,-1,-1,-0.333333,1,0,0.333333,0,
2,0,-1,-1,-0.333333,1,0,0.333333,0,
cublas time: 68 microseconds
$
因此,对于这个 2048 矩阵测试用例,CUDA 10.0,Tesla P100,这可能比 cublas 稍快但不多,linux。
与之前的答案类似,这里是一个简化的(只有 2 个矩阵)pycuda 测试用例:
$ cat t14.py
import numpy as np
# import matplotlib.pyplot as plt
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel
kernel = SourceModule("""
__device__ unsigned getoff(unsigned &off){
unsigned ret = off & 0x0F;
off >>= 4;
return ret;
}
// in-place is acceptable i.e. out == in)
// T = float or double only
const int block_size = 288;
typedef double T; // *** can set to float or double
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){
__shared__ T si[block_size];
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
T det = 1;
if (idx < n*9)
det = in[idx];
unsigned sibase = (threadIdx.x / 9)*9;
unsigned lane = threadIdx.x - sibase; // cheaper modulo
si[threadIdx.x] = det;
__syncthreads();
unsigned off = pat[lane];
T a = si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
T b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
a -= b;
__syncthreads();
if (lane == 0) si[sibase+3] = a;
if (lane == 3) si[sibase+4] = a;
if (lane == 6) si[sibase+5] = a;
__syncthreads();
det = si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
if (idx < n*9)
out[idx] = a / det;
}
""")
# host code
def gpuinv3x3(inp, n):
# internal constants not to be modified
hpat = (0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140)
# Convert parameters into numpy array
# *** change next line between float32 and float64 to match float or double
inpd = np.array(inp, dtype=np.float64)
hpatd = np.array(hpat, dtype=np.uint32)
# *** change next line between float32 and float64 to match float or double
output = np.empty((n*9), dtype= np.float64)
# Get kernel function
matinv3x3 = kernel.get_function("inv3x3")
# Define block, grid and compute
blockDim = (288,1,1) # do not change
gridDim = ((n/32)+1,1,1)
# Kernel function
matinv3x3 (
cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
block=blockDim, grid=gridDim)
return output
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv3x3(inp, n)
print(result.reshape(2,3,3))
$ python t14.py
[[[ 2. -0. -1. ]
[-1. -0.33333333 1. ]
[-0. 0.33333333 -0. ]]
[[ 1. 0. 0. ]
[ 0. 1. 0. ]
[ 0. 0. 1. ]]]
$
上面恰好在pycuda中使用了double
即float64
。将其更改为 float
即 pycuda 中的 float32
涉及更改 .
中描述的相同 3 行
继上一个问题(
鉴于下面显示的代码,我尝试测试一些东西,比如用零代替值,但这种方法似乎不起作用。
这是用于大量 4x4 矩阵求逆的代码:
$ cat t10.py
import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel
kernel = SourceModule("""
__device__ unsigned getoff(unsigned &off){
unsigned ret = off & 0x0F;
off = off >> 4;
return ret;
}
const int block_size = 256;
const unsigned tmsk = 0xFFFFFFFF;
// in-place is acceptable i.e. out == in)
// T = float or double only
typedef float T;
__global__ void inv4x4(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){
__shared__ T si[block_size];
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < n*16){
si[threadIdx.x] = in[idx];
unsigned lane = threadIdx.x & 15;
unsigned sibase = threadIdx.x & 0x03F0;
__syncwarp();
unsigned off = pat[lane];
T a,b;
a = si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
if (!getoff(off)) a = -a;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
off = pat[lane+16];
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
off = pat[lane+32];
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
if (getoff(off)) a += b;
else a -=b;
T det = si[sibase + (lane>>2)]*a;
det += __shfl_down_sync(tmsk, det, 4, 16); // first add
det += __shfl_down_sync(tmsk, det, 8, 16); // second add
det = __shfl_sync(tmsk, det, 0, 16); // broadcast
out[idx] = a / det;
}
}
""")
# python function for inverting 4x4 matrices
# n should be an even number
def gpuinv4x4(inp, n):
# internal constants not to be modified
hpat = ( 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50, 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14, 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618)
# Convert parameters into numpy array
inpd = np.array(inp, dtype=np.float32)
hpatd = np.array(hpat, dtype=np.uint32)
output = np.empty((n*16), dtype= np.float32)
# Get kernel function
matinv4x4 = kernel.get_function("inv4x4")
# Define block, grid and compute
blockDim = (256,1,1) # do not change
gridDim = ((n/16)+1,1,1)
# Kernel function
matinv4x4 (
cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
block=blockDim, grid=gridDim)
return output
#example/test case
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv4x4(inp, n)
print(result.reshape(2,4,4))
$ python t10.py
[[-3. -0.5 1.5 1. ]
[ 1. 0.25 -0.25 -0.5 ]
[ 3. 0.25 -1.25 -0.5 ]
[-3. -0. 1. 1. ]]
[[ 1. 0. 0. 0. ]
[ 0. 1. 0. 0. ]
[ 0. 0. 1. 0. ]
[ 0. 0. 0. 1. ]]
我期待相同的行为,除了我不再使用 4x4 矩阵而是使用 3x3 矩阵。
如何调整上面的代码以使用 3x3 矩阵求逆?
更新 1
这里是我做的修改。
我修改了维度并使用了@Robert Crovella (https://ardoris.wordpress.com/2008/07/18/general-formula-for-the-inverse-of-a-3x3-matrix/) 给出的 link 中的直接公式。下面修改代码:
import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel of 3x3 inversion
kernel_3x3 = SourceModule("""
// in-place is acceptable i.e. out == in)
// T = float or double only
typedef float T;
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){
size_t ix = threadIdx.x;
size_t idx = ix + blockDim.x*blockIdx.x;
if (ix < n*9){
T det = in[0+idx]*(in[4+idx]*in[8+idx]-in[7+idx]*in[5+idx]) - in[1+idx]*(in[3+idx]*in[8+idx]-in[6+idx]*in[5+idx]) + in[2+idx]*(in[3+idx]*in[7+idx]-in[6+idx]*in[4+idx]);
out[0+idx] = (in[4+idx]*in[8+idx]-in[7+idx]*in[5+idx])/det;
out[1+idx] = (in[2+idx]*in[7+idx]-in[1+idx]*in[8+idx])/det;
out[2+idx] = (in[1+idx]*in[5+idx]-in[2+idx]*in[4+idx])/det;
out[3+idx] = (in[6+idx]*in[5+idx]-in[3+idx]*in[8+idx])/det;
out[4+idx] = (in[0+idx]*in[8+idx]-in[2+idx]*in[6+idx])/det;
out[5+idx] = (in[2+idx]*in[3+idx]-in[0+idx]*in[5+idx])/det;
out[6+idx] = (in[3+idx]*in[7+idx]-in[4+idx]*in[6+idx])/det;
out[7+idx] = (in[1+idx]*in[6+idx]-in[0+idx]*in[7+idx])/det;
out[8+idx] = (in[0+idx]*in[4+idx]-in[1+idx]*in[3+idx])/det;
__syncwarp();
}
}
""")
def gpuinv3x3 (inp, n):
# internal constants not to be modified
hpat = ( 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50, 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14, 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618)
# Convert parameters into numpy array
inpd = np.array(inp, dtype=np.float32)
hpatd = np.array(hpat, dtype=np.uint32)
output = np.empty((n*9), dtype= np.float32)
# Get kernel function
matinv3x3 = kernel_3x3.get_function("inv3x3")
# Define block, grid and compute
blockDim = (81,1,1) # do not change
gridDim = ((n/9)+1,1,1)
# Kernel function
matinv3x3 (
cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
block=blockDim, grid=gridDim)
return output
#example/test case
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv3x3(inp, n)
print(result.reshape(2,3,3))
第一个矩阵正确反转但第二个矩阵不正确(具有单位矩阵作为逆的单位矩阵):
[[[ 2. -0. -1. ]
[-1. -0.33333334 1. ]
[-0. 0.33333334 -0. ]]
[[ 1. -0.5 -0. ]
[ -inf 1. -1. ]
[ nan nan 1. ]]]
所以,这个问题似乎不是来自内核代码,而是批量大小或与全局一维数组维度类似的东西(在我的代码中,你可以看到 2 个 3x3 矩阵被格式化为一维数组18 个元素 (inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
)).
这段代码有什么问题?特别是第二个矩阵的逆反问题。最后一点,奇怪的组大小并不意味着 GPU 处理有问题?
此答案将紧跟
首先,和以前一样,我们将展示一个 CUDA C++ 版本并与 cublas 进行比较:
$ cat t432.cu
#include <iostream>
#include <cublas_v2.h>
#include <cstdlib>
// 3x3 matrix inversion
//
// https://ardoris.wordpress.com/2008/07/18/general-formula-for-the-inverse-of-a-3x3-matrix/
// 9 threads per matrix to invert
// 32 matrices per 288 thread block
const unsigned block_size = 288;
typedef double mt;
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__device__ unsigned pat[9];
const unsigned hpat[9] = {0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140};
__device__ unsigned getoff(unsigned &off){
unsigned ret = off & 0x0F;
off >>= 4;
return ret;
}
// in-place is acceptable i.e. out == in)
// T = float or double only
template <typename T>
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n){
__shared__ T si[block_size];
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
T det = 1;
if (idx < n*9)
det = in[idx];
unsigned sibase = (threadIdx.x / 9)*9;
unsigned lane = threadIdx.x - sibase; // cheaper modulo
si[threadIdx.x] = det;
__syncthreads();
unsigned off = pat[lane];
T a = si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
T b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
a -= b;
__syncthreads();
if (lane == 0) si[sibase+3] = a;
if (lane == 3) si[sibase+4] = a;
if (lane == 6) si[sibase+5] = a;
__syncthreads();
det = si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
if (idx < n*9)
out[idx] = a / det;
}
size_t nr = 2048;
int main(int argc, char *argv[]){
if (argc > 1) nr = atoi(argv[1]);
const mt m2[] = {1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0};
const mt i2[] = {2.0, 0.0, -1.0, -1.0, -0.33333334, 1.0, 0.0, 0.33333334, 0.0};
const mt m1[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};
const mt i1[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};
mt *h_d, *d_d;
h_d = (mt *)malloc(nr*9*sizeof(mt));
cudaMalloc(&d_d, nr*9*sizeof(mt));
cudaMemcpyToSymbol(pat, hpat, 9*sizeof(unsigned));
for (int i = 0; i < nr/2; i++){
memcpy(h_d+i*2*9, m1, sizeof(m1));
memcpy(h_d+i*2*9+9, m2, sizeof(m2));}
cudaMemcpy(d_d, h_d, nr*9*sizeof(mt), cudaMemcpyHostToDevice);
long long t = dtime_usec(0);
inv3x3<<<((nr*9)/block_size)+1, block_size>>>(d_d, d_d, nr);
cudaDeviceSynchronize();
t = dtime_usec(t);
cudaMemcpy(h_d, d_d, nr*9*sizeof(mt), cudaMemcpyDeviceToHost);
for (int i = 0; i < 2; i++){
for (int j = 0; j < 9; j++) std::cout << h_d[i*9 + j] << ",";
std::cout << std::endl;
for (int j = 0; j < 9; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
std::cout << std::endl;}
std::cout << "kernel time: " << t << " microseconds" << std::endl;
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
//cublas
for (int i = 0; i < nr/2; i++){
memcpy(h_d+i*2*9, m1, sizeof(m1));
memcpy(h_d+i*2*9+9, m2, sizeof(m2));}
cudaMemcpy(d_d, h_d, nr*9*sizeof(mt), cudaMemcpyHostToDevice);
cublasHandle_t h;
cublasStatus_t cs = cublasCreate(&h);
if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas create error" << std::endl;
mt **A, **Ai, *Aid, **Ap, **Aip;
A = (mt **)malloc(nr*sizeof(mt *));
Ai = (mt **)malloc(nr*sizeof(mt *));
cudaMalloc(&Aid, nr*9*sizeof(mt));
cudaMalloc(&Ap, nr*sizeof(mt *));
cudaMalloc(&Aip, nr*sizeof(mt *));
for (int i = 0; i < nr; i++) A[i] = d_d + 9*i;
for (int i = 0; i < nr; i++) Ai[i] = Aid + 9*i;
cudaMemcpy(Ap, A, nr*sizeof(mt *), cudaMemcpyHostToDevice);
cudaMemcpy(Aip, Ai, nr*sizeof(mt *), cudaMemcpyHostToDevice);
int *info;
cudaMalloc(&info, nr*sizeof(int));
t = dtime_usec(0);
cs = cublasDmatinvBatched(h, 3, Ap, 3, Aip, 3, info, nr);
if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas matinv error" << std::endl;
cudaDeviceSynchronize();
t = dtime_usec(t);
cudaMemcpy(h_d, Aid, nr*9*sizeof(mt), cudaMemcpyDeviceToHost);
for (int i = 0; i < 2; i++){
for (int j = 0; j < 9; j++) std::cout << h_d[i*9 + j] << ",";
std::cout << std::endl;
for (int j = 0; j < 9; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
std::cout << std::endl;}
std::cout << "cublas time: " << t << " microseconds" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess) std::cout << cudaGetErrorString(err) << std::endl;
return 0;
}
$ nvcc -o t432 t432.cu -lcublas
$ ./t432
1,0,0,0,1,0,0,0,1,
1,0,0,0,1,0,0,0,1,
2,-0,-1,-1,-0.333333,1,-0,0.333333,-0,
2,0,-1,-1,-0.333333,1,0,0.333333,0,
kernel time: 59 microseconds
1,0,0,0,1,0,0,0,1,
1,0,0,0,1,0,0,0,1,
2,0,-1,-1,-0.333333,1,0,0.333333,0,
2,0,-1,-1,-0.333333,1,0,0.333333,0,
cublas time: 68 microseconds
$
因此,对于这个 2048 矩阵测试用例,CUDA 10.0,Tesla P100,这可能比 cublas 稍快但不多,linux。
与之前的答案类似,这里是一个简化的(只有 2 个矩阵)pycuda 测试用例:
$ cat t14.py
import numpy as np
# import matplotlib.pyplot as plt
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel
kernel = SourceModule("""
__device__ unsigned getoff(unsigned &off){
unsigned ret = off & 0x0F;
off >>= 4;
return ret;
}
// in-place is acceptable i.e. out == in)
// T = float or double only
const int block_size = 288;
typedef double T; // *** can set to float or double
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){
__shared__ T si[block_size];
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
T det = 1;
if (idx < n*9)
det = in[idx];
unsigned sibase = (threadIdx.x / 9)*9;
unsigned lane = threadIdx.x - sibase; // cheaper modulo
si[threadIdx.x] = det;
__syncthreads();
unsigned off = pat[lane];
T a = si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
T b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
a -= b;
__syncthreads();
if (lane == 0) si[sibase+3] = a;
if (lane == 3) si[sibase+4] = a;
if (lane == 6) si[sibase+5] = a;
__syncthreads();
det = si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
if (idx < n*9)
out[idx] = a / det;
}
""")
# host code
def gpuinv3x3(inp, n):
# internal constants not to be modified
hpat = (0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140)
# Convert parameters into numpy array
# *** change next line between float32 and float64 to match float or double
inpd = np.array(inp, dtype=np.float64)
hpatd = np.array(hpat, dtype=np.uint32)
# *** change next line between float32 and float64 to match float or double
output = np.empty((n*9), dtype= np.float64)
# Get kernel function
matinv3x3 = kernel.get_function("inv3x3")
# Define block, grid and compute
blockDim = (288,1,1) # do not change
gridDim = ((n/32)+1,1,1)
# Kernel function
matinv3x3 (
cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
block=blockDim, grid=gridDim)
return output
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv3x3(inp, n)
print(result.reshape(2,3,3))
$ python t14.py
[[[ 2. -0. -1. ]
[-1. -0.33333333 1. ]
[-0. 0.33333333 -0. ]]
[[ 1. 0. 0. ]
[ 0. 1. 0. ]
[ 0. 0. 1. ]]]
$
上面恰好在pycuda中使用了double
即float64
。将其更改为 float
即 pycuda 中的 float32
涉及更改