PyCUDA Kernel returns 具体计算的除法结果不一致
PyCUDA Kernel returns inconsistent division result for specific calculations
我正在尝试实现一个计算百分比平均值的内核。
示例-取 3D 数组(在下面的代码中)片 [[2,4],[3,6],[4,8]]
并计算 (4+6+8)/((4+6+8)+(2+3+4))
这是一个 colab notebook,可以快速 运行 以下代码:https://colab.research.google.com/drive/1k_XfOVOYWOTnNQFA9Vo_H93D9l-xWO8K?usp=sharing
# -*- coding: utf-8 -*-
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
# set dimentions
ROWS = 3
COLS = 2
h_perms = np.array([[
[ 1,1],
[ 1,1],
[ 1,1]
],[
[ 2,7],
[ 3,11],
[ 4,13]
],[
[ 2,4],
[ 3,6],
[ 4,8]
],[
[ 2,7],
[ 3,11],
[ 4,13]
],[
[ 2,4],
[ 3,6],
[ 4,8]
],[
[ 1,1],
[ 1,1],
[ 1,1]
]
], dtype=np.float32).flatten()
# send to device
d_perms = gpuarray.to_gpu(h_perms)
kernel = SourceModule("""
__global__
void calc(float *permutations, int *permutationShape, float *results)
{
__shared__ float c;
__shared__ float b;
int bIdx = blockIdx.y * gridDim.x + blockIdx.x;
int tIdx = threadIdx.y * blockDim.x + threadIdx.x;
int rowCount = permutationShape[0];
int colCount = permutationShape[1];
int i = (bIdx * rowCount * colCount) + (tIdx * colCount);
c += permutations[i];
b += permutations[i+1];
__syncthreads();
results[bIdx] = b / (b + c);
}
""")
calc = kernel.get_function('calc')
# prepare results array
d_results = gpuarray.zeros((6,1), np.float32)
d_results = gpuarray.to_gpu(d_results)
h_perms_shape = np.array([ROWS,COLS], np.int32);
d_perms_shape = gpuarray.to_gpu(h_perms_shape);
start = cuda.Event()
end = cuda.Event()
start.record()
calc(d_perms, d_perms_shape, d_results, block=(ROWS,1,1), grid=(ROWS*COLS,1,1))
end.record()
secs = start.time_till(end)*1e-3
print(secs)
print(d_results)
我希望得到这个-
array([[0.5 ],
[0.775],
[0.6666667],
[0.775],
[0.6666667],
[0.5 ]], dtype=float32)
但是我明白了-
array([[0.5 ],
[0.7777778],
[0.6666667],
[0.7777778],
[0.6666667],
[0.5 ]], dtype=float32)
我试图理解为什么 (7+11+13)/((7+11+13)+(2+3+4))
的特定计算结果不是 0.775
您发布的代码在此处包含内存竞争:
int i = (bIdx * rowCount * colCount) + (tIdx * colCount);
c += permutations[i];
b += permutations[i+1];
因为 b
和 c
在共享内存中,您将有多个线程尝试同时读写 from/to 相同的内存位置,这在 CUDA 中是未定义的行为(除非在此处不适用的极其特殊的条件下)。
如果我把它写成一个玩具示例,我可能会这样做:
__global__
void calc(float *permutations, int *permutationShape, float *results)
{
__shared__ float c;
__shared__ float b;
int bIdx = blockIdx.y * gridDim.x + blockIdx.x;
int tIdx = threadIdx.y * blockDim.x + threadIdx.x;
int rowCount = permutationShape[0];
int colCount = permutationShape[1];
int i = (bIdx * rowCount * colCount) + (tIdx * colCount);
atomicAdd(&c, permutations[i]);
atomicAdd(&b, permutations[i+1]);
__syncthreads();
if (tIdx == 0) {
results[bIdx] = b / (b + c);;
}
}
在此代码中,atomicAdd
确保加法和内存事务按顺序发生,从而避免内存竞争。从性能的角度来看,这对于不太琐碎的示例来说不是一个好的解决方案(查看共享内存减少技术),但它应该按预期工作。
我正在尝试实现一个计算百分比平均值的内核。
示例-取 3D 数组(在下面的代码中)片 [[2,4],[3,6],[4,8]]
并计算 (4+6+8)/((4+6+8)+(2+3+4))
这是一个 colab notebook,可以快速 运行 以下代码:https://colab.research.google.com/drive/1k_XfOVOYWOTnNQFA9Vo_H93D9l-xWO8K?usp=sharing
# -*- coding: utf-8 -*-
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
# set dimentions
ROWS = 3
COLS = 2
h_perms = np.array([[
[ 1,1],
[ 1,1],
[ 1,1]
],[
[ 2,7],
[ 3,11],
[ 4,13]
],[
[ 2,4],
[ 3,6],
[ 4,8]
],[
[ 2,7],
[ 3,11],
[ 4,13]
],[
[ 2,4],
[ 3,6],
[ 4,8]
],[
[ 1,1],
[ 1,1],
[ 1,1]
]
], dtype=np.float32).flatten()
# send to device
d_perms = gpuarray.to_gpu(h_perms)
kernel = SourceModule("""
__global__
void calc(float *permutations, int *permutationShape, float *results)
{
__shared__ float c;
__shared__ float b;
int bIdx = blockIdx.y * gridDim.x + blockIdx.x;
int tIdx = threadIdx.y * blockDim.x + threadIdx.x;
int rowCount = permutationShape[0];
int colCount = permutationShape[1];
int i = (bIdx * rowCount * colCount) + (tIdx * colCount);
c += permutations[i];
b += permutations[i+1];
__syncthreads();
results[bIdx] = b / (b + c);
}
""")
calc = kernel.get_function('calc')
# prepare results array
d_results = gpuarray.zeros((6,1), np.float32)
d_results = gpuarray.to_gpu(d_results)
h_perms_shape = np.array([ROWS,COLS], np.int32);
d_perms_shape = gpuarray.to_gpu(h_perms_shape);
start = cuda.Event()
end = cuda.Event()
start.record()
calc(d_perms, d_perms_shape, d_results, block=(ROWS,1,1), grid=(ROWS*COLS,1,1))
end.record()
secs = start.time_till(end)*1e-3
print(secs)
print(d_results)
我希望得到这个-
array([[0.5 ],
[0.775],
[0.6666667],
[0.775],
[0.6666667],
[0.5 ]], dtype=float32)
但是我明白了-
array([[0.5 ],
[0.7777778],
[0.6666667],
[0.7777778],
[0.6666667],
[0.5 ]], dtype=float32)
我试图理解为什么 (7+11+13)/((7+11+13)+(2+3+4))
的特定计算结果不是 0.775
您发布的代码在此处包含内存竞争:
int i = (bIdx * rowCount * colCount) + (tIdx * colCount);
c += permutations[i];
b += permutations[i+1];
因为 b
和 c
在共享内存中,您将有多个线程尝试同时读写 from/to 相同的内存位置,这在 CUDA 中是未定义的行为(除非在此处不适用的极其特殊的条件下)。
如果我把它写成一个玩具示例,我可能会这样做:
__global__
void calc(float *permutations, int *permutationShape, float *results)
{
__shared__ float c;
__shared__ float b;
int bIdx = blockIdx.y * gridDim.x + blockIdx.x;
int tIdx = threadIdx.y * blockDim.x + threadIdx.x;
int rowCount = permutationShape[0];
int colCount = permutationShape[1];
int i = (bIdx * rowCount * colCount) + (tIdx * colCount);
atomicAdd(&c, permutations[i]);
atomicAdd(&b, permutations[i+1]);
__syncthreads();
if (tIdx == 0) {
results[bIdx] = b / (b + c);;
}
}
在此代码中,atomicAdd
确保加法和内存事务按顺序发生,从而避免内存竞争。从性能的角度来看,这对于不太琐碎的示例来说不是一个好的解决方案(查看共享内存减少技术),但它应该按预期工作。