OpenCL bincount
OpenCL bincount
我正在尝试在 OpenCL 中实现 bincount 操作:分配一个输出缓冲区,并以 x 中的值作为下标,把对应的权重累加到输出缓冲区的该位置(假设 num_bins == max(x) + 1)。这等效于以下 Python 代码:
def bincount(x, weight, num_bins):
    """Sequential reference implementation of a weighted bincount.

    Args:
        x: 1-D sequence of non-negative integer bin indices.
        weight: 1-D sequence of weights, one per entry of ``x``.
        num_bins: Number of output bins; must be greater than ``max(x)``.

    Returns:
        A NumPy array of length ``num_bins`` whose element ``b`` is the sum
        of ``weight[i]`` over every ``i`` with ``x[i] == b``.
    """
    # np.zeros (not np.zeros_like): an integer num_bins must produce a 1-D
    # array of that length, not a 0-d scalar array that cannot be indexed.
    out = np.zeros(num_bins, dtype=np.asarray(weight).dtype)
    for index, w in zip(x, weight):
        out[index] += w
    return out
import pyopencl as cl
import numpy as np
# Create an OpenCL context and a command queue on it.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Kernel: each work-item adds one weight into the bin selected by x_g[gid].
# NOTE: this is the version under discussion. The "+=" on res_g is a plain
# (non-atomic) read-modify-write, and res_g is created below without being
# zero-filled.
prg = cl.Program(ctx, """
__kernel void bincount(__global int *res_g, __global const int* x_g, __global const int* weight_g)
{
  int gid = get_global_id(0);
  res_g[x_g[gid]] += weight_g[gid];
}
""").build()

# test
x = np.arange(5, dtype=np.int32).repeat(2) # [0, 0, 1, 1, 2, 2, 3, 3, 4, 4]
x_g = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x)
weight = np.arange(10, dtype=np.int32) # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
weight_g = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=weight)
# 4 bytes per int32 * 5 bins; contents are whatever the device memory held.
res_g = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * 5)
# One work-item per input element (10 in total).
prg.bincount(queue, [10], None, res_g, x_g, weight_g)
# transfer back to cpu
res_np = np.empty(5).astype(np.int32)
cl.enqueue_copy(queue, res_np, res_g)
实际得到的结果:array([1, 3, 5, 7, 9], dtype=int32)
期望的结果:array([1, 5, 9, 13, 17], dtype=int32)
以上是一个人为构造的例子;在我的实际应用中,x 是由滑动窗口算法生成的索引:
# Bin indices for the real use case: 108 int32 values in the range 0..39,
# stored flat (108 == 2 * 3 * 2 * 3 * 3).
x = np.array([ 0, 1, 2, 4, 5, 6, 8, 9, 10, 1, 2, 3, 5, 6, 7, 9, 10,
11, 4, 5, 6, 8, 9, 10, 12, 13, 14, 5, 6, 7, 9, 10, 11, 13,
14, 15, 8, 9, 10, 12, 13, 14, 16, 17, 18, 9, 10, 11, 13, 14, 15,
17, 18, 19, 20, 21, 22, 24, 25, 26, 28, 29, 30, 21, 22, 23, 25, 26,
27, 29, 30, 31, 24, 25, 26, 28, 29, 30, 32, 33, 34, 25, 26, 27, 29,
30, 31, 33, 34, 35, 28, 29, 30, 32, 33, 34, 36, 37, 38, 29, 30, 31,
33, 34, 35, 37, 38, 39], dtype=np.int32)
# One int32 weight per entry of x (108 values): the pattern 1, 0, 0 repeated,
# so only every third index of x contributes to its bin.
weight = np.array([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0], dtype=np.int32)
把 x 重塑为 (2,3,2,3,3) 后,其中的规律会更加明显。但我很难弄清楚如何把 @doqtor 给出的方法应用到这里,尤其是如何让它足够通用。对于上面的输入,期望的输出是:
array([1, 1, 0, 0, 2, 2, 0, 0, 3, 3, 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1,
0, 0, 2, 2, 0, 0, 3, 3, 0, 0, 2, 2, 0, 0, 1, 1, 0, 0], dtype=int32)
问题是累积权重的 OpenCL 缓冲区未初始化(归零)。修复:
# Zero-initialise the accumulator on the host and copy it to the device so
# that every bin starts at 0 before the kernel adds into it.
res_np = np.zeros(5, dtype=np.int32)
# READ_WRITE (not WRITE_ONLY): the kernel's "+=" reads res_g before writing
# it, and reading a write-only buffer inside a kernel is undefined in OpenCL.
res_g = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=res_np)
prg.bincount(queue, [10], None, res_g, x_g, weight_g)
# transfer back to cpu
cl.enqueue_copy(queue, res_np, res_g)
返回正确的结果:[ 1 5 9 13 17]
正如 @Kevin 所指出的,这里还存在竞争条件。如果索引存在某种规律,就可以在不使用同步的情况下解决这个问题,例如让 1 个工作项处理每 2 个元素:
/*
 * Weighted bincount in which each work-item processes two consecutive input
 * elements (indices 2*gid and 2*gid + 1) one after the other.
 *
 * res_g    - output bins, accumulated into with "+="
 * x_g      - bin index of each input element
 * weight_g - weight of each input element
 *
 * NOTE(review): the "+=" is not atomic, so this avoids conflicting writes
 * only when no two work-items touch the same bin (e.g. x = [0,0,1,1,...]).
 */
__kernel void bincount(__global int *res_g, __global const int* x_g, __global const int* weight_g)
{
  int gid = get_global_id(0);
  for(int x = gid*2; x < gid*2+2; ++x)
      res_g[x_g[x]] += weight_g[x];
}
然后调度 5 个工作项:
# Global size is 5: each work-item handles 2 of the 10 input elements.
prg.bincount(queue, [5], None, res_g, x_g, weight_g)
我正在尝试在 OpenCL 中实现 bincount 操作:分配一个输出缓冲区,并以 x 中的值作为下标,把对应的权重累加到输出缓冲区的该位置(假设 num_bins == max(x) + 1)。这等效于以下 Python 代码:
def bincount(x, weight, num_bins):
    """Sequential reference implementation of a weighted bincount.

    Args:
        x: 1-D sequence of non-negative integer bin indices.
        weight: 1-D sequence of weights, one per entry of ``x``.
        num_bins: Number of output bins; must be greater than ``max(x)``.

    Returns:
        A NumPy array of length ``num_bins`` whose element ``b`` is the sum
        of ``weight[i]`` over every ``i`` with ``x[i] == b``.
    """
    # np.zeros (not np.zeros_like): an integer num_bins must produce a 1-D
    # array of that length, not a 0-d scalar array that cannot be indexed.
    out = np.zeros(num_bins, dtype=np.asarray(weight).dtype)
    for index, w in zip(x, weight):
        out[index] += w
    return out
import pyopencl as cl
import numpy as np
# Create an OpenCL context and a command queue on it.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Kernel: each work-item adds one weight into the bin selected by x_g[gid].
# NOTE: this is the version under discussion. The "+=" on res_g is a plain
# (non-atomic) read-modify-write, and res_g is created below without being
# zero-filled.
prg = cl.Program(ctx, """
__kernel void bincount(__global int *res_g, __global const int* x_g, __global const int* weight_g)
{
  int gid = get_global_id(0);
  res_g[x_g[gid]] += weight_g[gid];
}
""").build()

# test
x = np.arange(5, dtype=np.int32).repeat(2) # [0, 0, 1, 1, 2, 2, 3, 3, 4, 4]
x_g = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x)
weight = np.arange(10, dtype=np.int32) # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
weight_g = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=weight)
# 4 bytes per int32 * 5 bins; contents are whatever the device memory held.
res_g = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * 5)
# One work-item per input element (10 in total).
prg.bincount(queue, [10], None, res_g, x_g, weight_g)
# transfer back to cpu
res_np = np.empty(5).astype(np.int32)
cl.enqueue_copy(queue, res_np, res_g)
实际得到的结果:array([1, 3, 5, 7, 9], dtype=int32)
期望的结果:array([1, 5, 9, 13, 17], dtype=int32)
以上是一个人为构造的例子;在我的实际应用中,x 是由滑动窗口算法生成的索引:
# Bin indices for the real use case: 108 int32 values in the range 0..39,
# stored flat (108 == 2 * 3 * 2 * 3 * 3).
x = np.array([ 0, 1, 2, 4, 5, 6, 8, 9, 10, 1, 2, 3, 5, 6, 7, 9, 10,
11, 4, 5, 6, 8, 9, 10, 12, 13, 14, 5, 6, 7, 9, 10, 11, 13,
14, 15, 8, 9, 10, 12, 13, 14, 16, 17, 18, 9, 10, 11, 13, 14, 15,
17, 18, 19, 20, 21, 22, 24, 25, 26, 28, 29, 30, 21, 22, 23, 25, 26,
27, 29, 30, 31, 24, 25, 26, 28, 29, 30, 32, 33, 34, 25, 26, 27, 29,
30, 31, 33, 34, 35, 28, 29, 30, 32, 33, 34, 36, 37, 38, 29, 30, 31,
33, 34, 35, 37, 38, 39], dtype=np.int32)
# One int32 weight per entry of x (108 values): the pattern 1, 0, 0 repeated,
# so only every third index of x contributes to its bin.
weight = np.array([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0], dtype=np.int32)
把 x 重塑为 (2,3,2,3,3) 后,其中的规律会更加明显。但我很难弄清楚如何把 @doqtor 给出的方法应用到这里,尤其是如何让它足够通用。对于上面的输入,期望的输出是:
array([1, 1, 0, 0, 2, 2, 0, 0, 3, 3, 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1,
0, 0, 2, 2, 0, 0, 3, 3, 0, 0, 2, 2, 0, 0, 1, 1, 0, 0], dtype=int32)
问题是累积权重的 OpenCL 缓冲区未初始化(归零)。修复:
# Zero-initialise the accumulator on the host and copy it to the device so
# that every bin starts at 0 before the kernel adds into it.
res_np = np.zeros(5, dtype=np.int32)
# READ_WRITE (not WRITE_ONLY): the kernel's "+=" reads res_g before writing
# it, and reading a write-only buffer inside a kernel is undefined in OpenCL.
res_g = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=res_np)
prg.bincount(queue, [10], None, res_g, x_g, weight_g)
# transfer back to cpu
cl.enqueue_copy(queue, res_np, res_g)
返回正确的结果:[ 1 5 9 13 17]
正如 @Kevin 所指出的,这里还存在竞争条件。如果索引存在某种规律,就可以在不使用同步的情况下解决这个问题,例如让 1 个工作项处理每 2 个元素:
/*
 * Weighted bincount in which each work-item processes two consecutive input
 * elements (indices 2*gid and 2*gid + 1) one after the other.
 *
 * res_g    - output bins, accumulated into with "+="
 * x_g      - bin index of each input element
 * weight_g - weight of each input element
 *
 * NOTE(review): the "+=" is not atomic, so this avoids conflicting writes
 * only when no two work-items touch the same bin (e.g. x = [0,0,1,1,...]).
 */
__kernel void bincount(__global int *res_g, __global const int* x_g, __global const int* weight_g)
{
  int gid = get_global_id(0);
  for(int x = gid*2; x < gid*2+2; ++x)
      res_g[x_g[x]] += weight_g[x];
}
然后调度 5 个工作项:
# Global size is 5: each work-item handles 2 of the 10 input elements.
prg.bincount(queue, [5], None, res_g, x_g, weight_g)