pyopencl.LogicError: clEnqueueNDRangeKernel failed: invalid work item size
I'm trying to implement the dot_persist_kernel() shown here in Python with pyopencl, and I've been working through a bunch of errors. But I've stumbled onto an issue that I can't crack:
self.program = cl.Program(self.ctx, code).build()
# code is a string with the code from the link given
a = cl_array.to_device(self.queue, np.random.rand(2**20).astype(np.float32))
b = cl_array.to_device(self.queue, np.random.rand(2**20).astype(np.float32))
c = 0.
mf = cl.mem_flags
c_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, 4)
MAX_COMPUTE_UNITS = cl.get_platforms()[0].get_devices()[0].max_compute_units
WORK_GROUPS_PER_CU = MAX_COMPUTE_UNITS * 4
ELEMENTS_PER_GROUP = a.size / WORK_GROUPS_PER_CU
ELEMENTS_PER_WORK_ITEM = ELEMENTS_PER_GROUP / 256
self.program.DotProduct(self.queue, a.shape, a.shape,
                        a.data, b.data, c_buf,
                        np.uint32(ELEMENTS_PER_GROUP),
                        np.uint32(ELEMENTS_PER_WORK_ITEM),
                        np.uint32(1028 * MAX_COMPUTE_UNITS))
Assuming an array of size 2^26, the values of the constants would be:
MAX_COMPUTE_UNITS = 32 // from get_device()[0].max_compute_units
WORK_GROUPS_PER_CU = 128 // MAX_COMPUTE_UNITS * 4
ELEMENTS_PER_GROUP = 524288 // 2^19
ELEMENTS_PER_WORK_ITEM = 2048 // 2^11
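For reference, here is the same arithmetic as a standalone sanity check (my own sketch, not from the linked code; it uses floor division so the sizes stay integers):

# Work-size arithmetic for a 2**26-element vector (illustrative only).
size = 2 ** 26
MAX_COMPUTE_UNITS = 32
WORK_GROUPS_PER_CU = MAX_COMPUTE_UNITS * 4          # 128
ELEMENTS_PER_GROUP = size // WORK_GROUPS_PER_CU     # 524288 == 2**19
ELEMENTS_PER_WORK_ITEM = ELEMENTS_PER_GROUP // 256  # 2048   == 2**11
assert WORK_GROUPS_PER_CU * ELEMENTS_PER_GROUP == size
assert 256 * ELEMENTS_PER_WORK_ITEM == ELEMENTS_PER_GROUP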
The kernel header is as follows:
#define LOCAL_GROUP_XDIM 256

// Kernel for part 1 of dot product, version 3.
__kernel __attribute__((reqd_work_group_size(LOCAL_GROUP_XDIM, 1, 1)))
void dot_persist_kernel(
    __global const double * x,   // input vector
    __global const double * y,   // input vector
    __global double * r,         // result vector
    uint n_per_group,            // elements processed per group
    uint n_per_work_item,        // elements processed per work item
    uint n                       // input vector size
    )
The error it gives is:
Traceback (most recent call last):
  File "GPUCompute.py", line 102, in <module>
    gpu = GPUCompute()
  File "GPUCompute.py", line 87, in __init__
    np.uint32(1028 * MAX_COMPUTE_UNITS))
  File "C:\Miniconda2\lib\site-packages\pyopencl\__init__.py", line 512, in kernel_call
    global_offset, wait_for, g_times_l=g_times_l)
pyopencl.LogicError: clEnqueueNDRangeKernel failed: invalid work item size
I've tried changing the numbers around many times, but to no avail. Any ideas?
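For what it's worth, the device limits that the enqueue is validated against can be read off the device itself (a minimal sketch using standard pyopencl device properties):

import pyopencl as cl

dev = cl.get_platforms()[0].get_devices()[0]
print(dev.max_work_group_size)   # max total work items per work group
print(dev.max_work_item_sizes)   # per-dimension limits on the local size
print(dev.max_compute_units)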
There were a few issues with the previous implementation, but this one works:
WORK_GROUPS = cl.get_platforms()[0].get_devices()[0].max_compute_units * 4
ELEMENTS_PER_GROUP = np_a.size / WORK_GROUPS
LOCAL_GROUP_XDIM = 256
ELEMENTS_PER_WORK_ITEM = ELEMENTS_PER_GROUP / LOCAL_GROUP_XDIM
self.program = cl.Program(self.ctx, kernel).build()
self.program.DotProduct(
    self.queue, np_a.shape, (LOCAL_GROUP_XDIM,),  # kernel information
    cl_a, cl_b, cl_c,                             # data
    np.uint32(ELEMENTS_PER_GROUP),                # elements processed per group
    np.uint32(ELEMENTS_PER_WORK_ITEM),            # elements processed per work item
    np.uint32(np_a.size)                          # input vector size
)
It was the culmination of a few things, but the biggest factor was that the second and third arguments passed to DotProduct() needed to be tuples, not integers as I had thought. :)
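To verify the output, the per-group partial sums can be copied back and reduced on the host. A minimal sketch, assuming cl_c is a float32 buffer holding one partial result per work group (which is what part 1 of the kernel produces):

partials = np.empty(WORK_GROUPS, dtype=np.float32)
cl.enqueue_copy(self.queue, partials, cl_c)  # blocking read by default
print(np.allclose(partials.sum(), np.dot(np_a, np_b), rtol=1e-4))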