Cupy indexing in 2D Cuda Grid Kernels?
I'm trying to get started with CuPy for some CUDA programming, and I need to write my own kernels. However, I'm struggling with 2D kernels: CuPy doesn't seem to behave the way I expect. Here is a very simple example of a 2D kernel in Numba CUDA:
import cupy as cp
from numba import cuda

@cuda.jit
def nb_add_arrs(x1, x2, y):
    i, j = cuda.grid(2)
    if i < y.shape[0] and j < y.shape[1]:
        y[i, j] = x1[i, j] + x2[i, j]

x1 = cp.ones(25, dtype=cp.int32).reshape(5, 5)
x2 = cp.ones(25, dtype=cp.int32).reshape(5, 5)
y = cp.zeros((5, 5), dtype=cp.int32)
# Grid and block sizes
tpb = (16, 16)
bpg = (x1.shape[0] // tpb[0] + 1, x1.shape[1] // tpb[1] + 1)
# Call kernel
nb_add_arrs[bpg, tpb](x1, x2, y)
The result is as expected:
y
[[2 2 2 2 2]
 [2 2 2 2 2]
 [2 2 2 2 2]
 [2 2 2 2 2]
 [2 2 2 2 2]]
However, when I try the same simple kernel in CuPy, I don't get the same result.
cp_add_arrs = cp.RawKernel(r'''
extern "C" __global__
void add_arrs(const float* x1, const float* x2, float* y, int N){
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;
    if(i < N && j < N){
        y[i, j] = x1[i, j] + x2[i, j];
    }
}
''', 'add_arrs')

x1 = cp.ones(25, dtype=cp.int32).reshape(5, 5)
x2 = cp.ones(25, dtype=cp.int32).reshape(5, 5)
y = cp.zeros((5, 5), dtype=cp.int32)
N = x1.shape[0]
# Grid and block sizes
tpb = (16, 16)
bpg = (x1.shape[0] // tpb[0] + 1, x1.shape[1] // tpb[1] + 1)
# Call kernel
cp_add_arrs(bpg, tpb, (x1, x2, y, cp.int32(N)))
y
[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]
Can anyone help me figure out why?
Memory in C is stored in row-major order, so we need to index into the flat buffer ourselves. Note that in C, y[i, j] is not 2D indexing: the comma operator evaluates i and discards it, so the kernel above was really reading and writing y[j]. Also, since I'm passing int arrays, I changed the kernel's parameter types. Here is the code:
cp_add_arrs = cp.RawKernel(r'''
extern "C" __global__
void add_arrs(int* x1, int* x2, int* y, int N){
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;
    if(i < N && j < N){
        y[j + i*N] = x1[j + i*N] + x2[j + i*N];
    }
}
''', 'add_arrs')

x1 = cp.ones(25, dtype=cp.int32).reshape(5, 5)
x2 = cp.ones(25, dtype=cp.int32).reshape(5, 5)
y = cp.zeros((5, 5), dtype=cp.int32)
N = x1.shape[0]
# Grid and block sizes
tpb = (16, 16)
bpg = (x1.shape[0] // tpb[0] + 1, x1.shape[1] // tpb[1] + 1)
# Call kernel
cp_add_arrs(bpg, tpb, (x1, x2, y, cp.int32(N)))
y
array([[2, 2, 2, 2, 2],
       [2, 2, 2, 2, 2],
       [2, 2, 2, 2, 2],
       [2, 2, 2, 2, 2],
       [2, 2, 2, 2, 2]], dtype=int32)
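To generalize beyond square arrays, the bounds check and the row stride have to use the two shape values separately. Below is a minimal sketch (the kernel name add_arrs_2d and the 4x7 shape are just for illustration, not from the original) applying the same row-major indexing to an H x W array:

cp_add_arrs_2d = cp.RawKernel(r'''
extern "C" __global__
void add_arrs_2d(const int* x1, const int* x2, int* y, int H, int W){
    int i = blockDim.x * blockIdx.x + threadIdx.x;  // row index
    int j = blockDim.y * blockIdx.y + threadIdx.y;  // column index
    if(i < H && j < W){
        y[j + i*W] = x1[j + i*W] + x2[j + i*W];  // row-major: the row stride is W
    }
}
''', 'add_arrs_2d')

x1 = cp.ones((4, 7), dtype=cp.int32)
x2 = cp.ones((4, 7), dtype=cp.int32)
y = cp.zeros((4, 7), dtype=cp.int32)
H, W = y.shape
tpb = (16, 16)
bpg = (H // tpb[0] + 1, W // tpb[1] + 1)
cp_add_arrs_2d(bpg, tpb, (x1, x2, y, cp.int32(H), cp.int32(W)))

y should come back as a 4x7 array of twos. The guard i < H && j < W now differs per axis, something the square N x N case hides.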
What can be confusing is that a launch only specifies a number of blocks and a number of threads.
Grid dimensions (bx, by, bz) --> blocks [bx, by, bz]
Block dimensions (tx, ty, tz) --> threads [tx, ty, tz]
So a single thread is identified by its block [a, b, c] and its thread [d, e, f] within that block, and that pair can be mapped to a single counter K, as in the example below. Fiddle with it if you like:
matmul = cp.RawKernel(r'''
extern "C" __global__
void matmul_kernel(float *A, float *B, float *C) {
    // Flatten the whole (grid, block) coordinate into one linear counter K.
    int K = threadIdx.x
          + blockDim.x * threadIdx.y
          + blockDim.x * blockDim.y * threadIdx.z
          + blockDim.x * blockDim.y * blockDim.z * blockIdx.x
          + blockDim.x * blockDim.y * blockDim.z * gridDim.x * blockIdx.y
          + blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y * blockIdx.z;
    // blockDim and threadIdx each have x, y and z components, but they all
    // collapse into this single iterator.
    if (K < 10000) {
        if (K%9==0) {C[K]=gridDim.x;}
        if (K%9==1) {C[K]=gridDim.y;}
        if (K%9==2) {C[K]=gridDim.z;}
        if (K%9==3) {C[K]=blockDim.x;}
        if (K%9==4) {C[K]=blockDim.y;}
        if (K%9==5) {C[K]=blockDim.z;}
        if (K%9==6) {C[K]=threadIdx.x;}
        if (K%9==7) {C[K]=threadIdx.y;}
        if (K%9==8) {C[K]=threadIdx.z;}
        C[K]=K;  // overwrites the lines above; remove this to inspect the dims instead
    }
}
''', 'matmul_kernel')

x1 = cp.random.random(10000, dtype=cp.float32)
x2 = cp.random.random(10000, dtype=cp.float32)
y = cp.zeros(10000, dtype=cp.float32)
# grid, block and arguments; the grid can be huge, but a block is limited to
# 1024 threads in total, hence (10, 10, 10)
matmul((32, 32, 32), (10, 10, 10), (x1, x2, y))
y.max()
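As a sanity check on that mapping, here is a minimal sketch (the kernel name iota is made up for this example) that writes K into an output array and compares it against cp.arange; with the same launch shape, every slot 0..9999 should be hit exactly once:

iota = cp.RawKernel(r'''
extern "C" __global__
void iota(float *out, int n) {
    // Same linear mapping as above, written in nested (Horner) form.
    int K = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * (threadIdx.z
          + blockDim.z * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z))));
    if (K < n) {
        out[K] = (float)K;
    }
}
''', 'iota')

out = cp.zeros(10000, dtype=cp.float32)
iota((32, 32, 32), (10, 10, 10), (out, cp.int32(out.size)))
print(bool((out == cp.arange(10000, dtype=cp.float32)).all()))  # True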