OpenCL:不支持 64 位 global_id() 吗?
OpenCL: Is 64 bit global_id() not supported?
我是 OpenCL 新手,我不能 return 来自已编译内核的 64 位值。我做错了什么?
我有一个 Intel(R) HD Graphics 520
显卡,我想编写一个处理 64 位值的算法。但是当全局 id 超过 4e12(更准确地说是 2^32-1)时,它似乎溢出了。似乎一切都是在 x64 中构建的。我用 Visual Studio 2019 编译,目标:x64。我已经安装了最新的 Intel 图形设备驱动程序 (30.0.101.1660
)。它可以构建内核并且可以正常工作,除了它使用的是 32 位而不是 64 位!
谁能帮我看看我做错了什么?
这是我的代码。抱歉,有点长……我已经尽量短了。我知道,它有一些小问题(例如,没有使用原子写入)但这只是一个 POC 代码,并不像我预期的那样有效。 :(
#include <cstdio>
#include <cassert>
#include <iostream>
using namespace std;
#include <CL/opencl.h>
int runCL(const cl_ulong n) {
cl_int err = 0;
cl_uint num_platforms;
cl_platform_id platforms[16]; // Can be on stack!
err = clGetPlatformIDs(16, platforms, &num_platforms);
assert(err == 0);
assert(num_platforms);
cl_uint num_devices;
cl_device_id devices[16]; // Can be on stack!
err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 16, devices, &num_devices);
assert(err == 0);
assert(num_devices);
#define PR_DEV_INFO(name, type) invoke([devices]()->type { type wrk; \
cl_uint err = clGetDeviceInfo(devices[0], name, sizeof(wrk), (void*)&wrk, NULL);\
assert(err == 0); cout << #name << ": " << wrk << endl; return wrk;})
#define PR_DEV_INFO_CHAR(name) invoke([devices]()->string { size_t size; \
cl_uint err = clGetDeviceInfo(devices[0], name, 0, NULL, &size);\
assert(err == 0); char* wrk = new char[size];\
err = clGetDeviceInfo(devices[0], name, size, (void*)wrk, NULL);\
assert(err == 0); string s(wrk); delete[] wrk;\
cout << #name << " [" << size << "]: " << s << endl; return s;})
#define PR_DEV_INFO_ARR(name, type, len) invoke([devices](size_t arr_len)->void { \
type *wrk = new type[arr_len]; \
cl_uint err = clGetDeviceInfo(devices[0], name, sizeof(type)*arr_len, (void*)wrk, NULL);\
assert(err == 0); cout << #name << ":";\
for(int i=0; i<arr_len;++i) cout << ' ' << wrk[i]; cout << endl; delete[] wrk;}, len)
PR_DEV_INFO_CHAR(CL_DEVICE_NAME);
PR_DEV_INFO_CHAR(CL_DEVICE_VERSION);
PR_DEV_INFO_CHAR(CL_DRIVER_VERSION);
PR_DEV_INFO_CHAR(CL_DEVICE_EXTENSIONS);
PR_DEV_INFO(CL_DEVICE_ADDRESS_BITS, cl_uint);
PR_DEV_INFO(CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint);
const size_t max_item_dim =
PR_DEV_INFO(CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t);
cl_uint dims = PR_DEV_INFO(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint);
PR_DEV_INFO_ARR(CL_DEVICE_MAX_WORK_ITEM_SIZES, size_t, dims);
cl_context context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &err);
assert(err == 0);
string kernel_txt(
"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cles_khr_int64 : enable\n"
"__kernel void render(__global ulong * out) {\n"
" size_t gid = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" ulong val = out[lid];\n"
" out[lid] = val < gid ? gid : val;\n" // Not atomic!
" if (lid == 255) out[lid] = sizeof(size_t) * 1000 + sizeof(ulong);\n"
"}\n");
const char* kernel_mem = kernel_txt.c_str();
// kernel_mem cannot be on stack
cl_program program = clCreateProgramWithSource(context, 1, &kernel_mem, NULL, &err);
assert(err == 0);
//https://www.khronos.org/registry/OpenCL/specs/2.2/html/OpenCL_API.html#compiler-options
const char* options = "-w -Werror -cl-std=CL3.0";
err = clBuildProgram(program, num_devices, devices, options, NULL, NULL);
if (err) {
cerr << "Build error: " << err << endl;
size_t size = 0;
// Just get log size first, then read it again to the proper log
cl_int err2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &size);
char* plog = new char[size];
err2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, size, plog, &size);
cerr << "Build log (size: " << size << "): '" << plog << "' [err:" << err2 << "d]" << endl;
delete[] plog;
exit(1);
}
cl_kernel kernel = clCreateKernel(program, "render", &err);
assert(err == 0);
cl_ulong* host_image = new cl_ulong[max_item_dim](); // cannot be on stack!
size_t buffer_size = sizeof(cl_ulong) * max_item_dim;
cl_mem image = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, &err);
assert(err == 0);
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &image);
assert(err == 0);
cl_command_queue cmd_queue = clCreateCommandQueueWithProperties(context, devices[0], NULL, &err);
assert(err == 0);
size_t dev_wrk_size[1] = { n };
size_t dev_wrk_offs[1] = { 0 };
size_t loc_wrk_size[1] = { (size_t)max_item_dim };
// https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, dev_wrk_offs, dev_wrk_size, loc_wrk_size, 0, NULL, NULL);
assert(err == 0);
// Non-blocking read, so we can continue queuing up more kernels
err = clEnqueueReadBuffer(cmd_queue, image, CL_FALSE, 0, buffer_size, host_image, 0, NULL, NULL);
assert(err == 0);
err = clFinish(cmd_queue);
assert(err == 0);
for (int i = 0; i < 256; ++i) cout << '[' << i << ':' << host_image[i] << "]";
cout << '{' << n << '}' << endl;
for (int i = 0; i < 256; ++i) printf("[%d:%zd]", i, host_image[i]);
printf("{%zd}\nsize_t:%zd, cl_ulong:%zd\n", n, sizeof(size_t), sizeof(cl_ulong));
clReleaseMemObject(image);
clReleaseKernel(kernel);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(context);
delete[] host_image;
return CL_SUCCESS;
}
int main() {
runCL(10'000'000'000ULL);
return 0;
}
在内核中,最后一个 returned 项 (out[255]
) 包含 ulong
和 size_t
的大小组合为 8008,这似乎是可以的都是 8 字节长。
以及输出(我把重复的行删掉了):
CL_DEVICE_NAME [25]: Intel(R) HD Graphics 520
CL_DEVICE_VERSION [16]: OpenCL 3.0 NEO
CL_DRIVER_VERSION [14]: 30.0.101.1660
CL_DEVICE_EXTENSIONS [1654]: cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_command_queue_families cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_driver_diagnostics cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_intel_subgroups_char cl_intel_subgroups_long cl_khr_il_program cl_intel_mem_force_host_memory cl_khr_subgroup_extended_types cl_khr_subgroup_non_uniform_vote cl_khr_subgroup_ballot cl_khr_subgroup_non_uniform_arithmetic cl_khr_subgroup_shuffle cl_khr_subgroup_shuffle_relative cl_khr_subgroup_clustered_reduce cl_intel_device_attribute_query cl_khr_suggested_local_work_size cl_khr_fp64 cl_khr_subgroups cl_intel_spirv_device_side_avc_motion_estimation cl_intel_spirv_media_block_io cl_intel_spirv_subgroups cl_khr_spirv_no_integer_wrap_decoration cl_intel_unified_shared_memory_preview cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_advanced_motion_estimation cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_image2d_from_buffer cl_khr_depth_images cl_khr_3d_image_writes cl_intel_media_block_io cl_khr_gl_sharing cl_khr_gl_depth_images cl_khr_gl_event cl_khr_gl_msaa_sharing cl_intel_dx9_media_sharing cl_khr_dx9_media_sharing cl_khr_d3d10_sharing cl_khr_d3d11_sharing cl_intel_d3d11_nv12_media_sharing cl_intel_sharing_format_query cl_khr_pci_bus_info cl_intel_simultaneous_sharing
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 24
CL_DEVICE_MAX_WORK_GROUP_SIZE: 256
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 256 256 256
[0:4294965760][1:4294965761]<...>[253:4294966781][254:4294966782][255:8008]{10000000000}
[0:4294965760][1:4294965761]<...>[253:4294966781][254:4294966782][255:8008]{10000000000}
size_t:8, cl_ulong:8
我用 cout
和 printf
(%zd) 打印了结果,以确保不是 cout
导致问题。 :)
我是否应该在 clBuildProgram
或内核代码 (#pragma
) 中启用某些功能才能在内核端使用 64 位?
更新
我在内核代码中稍微修改了一下,统计了global_id(0)
的位数,好像一直是32,不是上面!
" int i = 0; for(; i<64 && gid;++i, gid>>=1);"
" out[lid] = val < i ? i : val;\n" // Not atomic!
所以,global_id(0) return似乎是一个 32 位值!
更新2
我将内核代码修改为size_t gid = get_local_id(0)+get_local_size(0)*get_group_id(0);
而不是size_t gid = get_global_id(0);
。
结果变为:
[0:9999999744][1:9999999745]<...>[253:9999998973][254:9999998974][255:8008]{10000000000}
size_t:8, cl_ulong:8
哪个看起来好多了!
我还做了一个测试来避免竞争条件,使用原子比较和交换更迂腐:
" size_t gid = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" //out[lid] = gid;\n" // Not atomic!
" ulong val_new, val_org = out[lid];\n"
" do {\n"
" val_new = val_org > gid ? val_org : gid;\n"
" } while (!atomic_compare_exchange_strong(out + lid, &val_org, val_new));\n"
结果相同(差):
[0:4294967040][1:4294967041]<...>[253:4294967293][254:4294967294][255:8008]{10000000000}
简而言之:支持64位寻址,如CL_DEVICE_ADDRESS_BITS: 64
所示。通常所有 OpenCL 设备都支持 64 位整数(C++ 中的unsigned long long int
,OpenCL C 中的ulong
)。 Intel HD 520甚至支持FP64双精度。
问题是您的内核中存在竞争条件,因为您没有使用原子。许多线程同时尝试写入 out[lid]
,哪个线程获胜是完全随机的。
这是 Nvidia GPU 和 Intel GPU 的输出。对于 Intel GPU,每次执行的行为都是随机的,但我偶尔会得到大于 4294966784
.
的值
CL_DEVICE_NAME [24]: NVIDIA GeForce GTX 960M
CL_DEVICE_VERSION [16]: OpenCL 3.0 CUDA
CL_DRIVER_VERSION [7]: 511.79
CL_DEVICE_EXTENSIONS [606]: cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_d3d10_sharing cl_khr_d3d10_sharing cl_nv_d3d11_sharing cl_nv_copy_opts cl_nv_create_buffer cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_device_uuid cl_khr_pci_bus_info cl_khr_external_semaphore cl_khr_external_memory cl_khr_external_semaphore_win32 cl_khr_external_memory_win32
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 5
CL_DEVICE_MAX_WORK_GROUP_SIZE: 1024
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 1024 1024 64
[0:18446744073709546496][1:18446744073709546497]...[254:18446744073709549822][255:8008]{10000000000}
[0:-5120][1:-5119]...[254:-1794][255:8008]{10000000000}
size_t:8, cl_ulong:8
CL_DEVICE_NAME [26]: Intel(R) HD Graphics 4600
CL_DEVICE_VERSION [12]: OpenCL 1.2
CL_DRIVER_VERSION [14]: 20.19.15.4624
CL_DEVICE_EXTENSIONS [616]: cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_ctz cl_intel_d3d11_nv12_media_sharing cl_intel_dx9_media_sharing cl_intel_motion_estimation cl_intel_simultaneous_sharing cl_intel_subgroups cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_d3d10_sharing cl_khr_d3d11_sharing cl_khr_depth_images cl_khr_dx9_media_sharing cl_khr_gl_depth_images cl_khr_gl_event cl_khr_gl_msaa_sharing cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_gl_sharing cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_spir
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 20
CL_DEVICE_MAX_WORK_GROUP_SIZE: 512
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 512 512 512
[0:4294966784][1:2154343490416]...[254:4294966526][255:8008]{10000000000}
[0:4294966784][1:2154343490416]...[254:4294966526][255:8008]{10000000000}
size_t:8, cl_ulong:8
为了简化 OpenCL 开发,考虑这个 OpenCL-Wrapper。有了这个,您的代码(不修复竞争条件错误)明显更短且更具可读性:
int main() {
const ulong N = 10000000000ull;
Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device
Memory<ulong> image(device, 64u); // allocate memory on both host and device
Kernel kernel(device, N, "render", image); // kernel that runs on the device
kernel.run(); // run add_kernel on the device
image.read_from_device(); // copy data from device memory to host memory
for(int i=0; i<256; i++) print("["+to_string(i)+":"+to_string(image[i])+"]");
println("{"+to_string(N)+"}");
println("size_t:"+to_string(sizeof(size_t))+", cl_ulong:"+to_string(sizeof(cl_ulong)));
wait();
return 0;
}
#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R(
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################
__kernel void render(__global ulong * out) {
size_t gid = get_global_id(0);
size_t lid = get_local_id(0);
ulong val = out[lid];
out[lid] = val < gid ? gid : val; // RACE CONDITION here
if (lid == 255) out[lid] = sizeof(size_t) * 1000 + sizeof(ulong); // another race condition here
}
);} // ############################################################### end of OpenCL C code #####################################################################
我已将此问题报告给英特尔。他们回答>here<。简而言之答案:
some of our hardware counters that feed into the global ID calculation are limited to 32 bits, specifically the work-group ID.
...
解决方法,如果全局大小可以被局部大小整除。
// If you need the global offset:
size_t global_id_0 = get_group_id(0) * get_local_size(0) +
get_global_offset(0) + get_local_id(0);
// If you do not need the global offset:
size_t global_id_0 = get_group_id(0) * get_local_size(0) +
get_local_id(0);
如果不可整除,则代替get_local_size use get_enqueued_local_size。
我是 OpenCL 新手,我不能 return 来自已编译内核的 64 位值。我做错了什么?
我有一个 Intel(R) HD Graphics 520
显卡,我想编写一个处理 64 位值的算法。但是当全局 id 超过 4e12(更准确地说是 2^32-1)时,它似乎溢出了。似乎一切都是在 x64 中构建的。我用 Visual Studio 2019 编译,目标:x64。我已经安装了最新的 Intel 图形设备驱动程序 (30.0.101.1660
)。它可以构建内核并且可以正常工作,除了它使用的是 32 位而不是 64 位!
谁能帮我看看我做错了什么?
这是我的代码。抱歉,有点长……我已经尽量短了。我知道,它有一些小问题(例如,没有使用原子写入)但这只是一个 POC 代码,并不像我预期的那样有效。 :(
#include <cstdio>
#include <cassert>
#include <iostream>
using namespace std;
#include <CL/opencl.h>
int runCL(const cl_ulong n) {
cl_int err = 0;
cl_uint num_platforms;
cl_platform_id platforms[16]; // Can be on stack!
err = clGetPlatformIDs(16, platforms, &num_platforms);
assert(err == 0);
assert(num_platforms);
cl_uint num_devices;
cl_device_id devices[16]; // Can be on stack!
err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 16, devices, &num_devices);
assert(err == 0);
assert(num_devices);
#define PR_DEV_INFO(name, type) invoke([devices]()->type { type wrk; \
cl_uint err = clGetDeviceInfo(devices[0], name, sizeof(wrk), (void*)&wrk, NULL);\
assert(err == 0); cout << #name << ": " << wrk << endl; return wrk;})
#define PR_DEV_INFO_CHAR(name) invoke([devices]()->string { size_t size; \
cl_uint err = clGetDeviceInfo(devices[0], name, 0, NULL, &size);\
assert(err == 0); char* wrk = new char[size];\
err = clGetDeviceInfo(devices[0], name, size, (void*)wrk, NULL);\
assert(err == 0); string s(wrk); delete[] wrk;\
cout << #name << " [" << size << "]: " << s << endl; return s;})
#define PR_DEV_INFO_ARR(name, type, len) invoke([devices](size_t arr_len)->void { \
type *wrk = new type[arr_len]; \
cl_uint err = clGetDeviceInfo(devices[0], name, sizeof(type)*arr_len, (void*)wrk, NULL);\
assert(err == 0); cout << #name << ":";\
for(int i=0; i<arr_len;++i) cout << ' ' << wrk[i]; cout << endl; delete[] wrk;}, len)
PR_DEV_INFO_CHAR(CL_DEVICE_NAME);
PR_DEV_INFO_CHAR(CL_DEVICE_VERSION);
PR_DEV_INFO_CHAR(CL_DRIVER_VERSION);
PR_DEV_INFO_CHAR(CL_DEVICE_EXTENSIONS);
PR_DEV_INFO(CL_DEVICE_ADDRESS_BITS, cl_uint);
PR_DEV_INFO(CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint);
const size_t max_item_dim =
PR_DEV_INFO(CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t);
cl_uint dims = PR_DEV_INFO(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint);
PR_DEV_INFO_ARR(CL_DEVICE_MAX_WORK_ITEM_SIZES, size_t, dims);
cl_context context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &err);
assert(err == 0);
string kernel_txt(
"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cles_khr_int64 : enable\n"
"__kernel void render(__global ulong * out) {\n"
" size_t gid = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" ulong val = out[lid];\n"
" out[lid] = val < gid ? gid : val;\n" // Not atomic!
" if (lid == 255) out[lid] = sizeof(size_t) * 1000 + sizeof(ulong);\n"
"}\n");
const char* kernel_mem = kernel_txt.c_str();
// kernel_mem cannot be on stack
cl_program program = clCreateProgramWithSource(context, 1, &kernel_mem, NULL, &err);
assert(err == 0);
//https://www.khronos.org/registry/OpenCL/specs/2.2/html/OpenCL_API.html#compiler-options
const char* options = "-w -Werror -cl-std=CL3.0";
err = clBuildProgram(program, num_devices, devices, options, NULL, NULL);
if (err) {
cerr << "Build error: " << err << endl;
size_t size = 0;
// Just get log size first, then read it again to the proper log
cl_int err2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &size);
char* plog = new char[size];
err2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, size, plog, &size);
cerr << "Build log (size: " << size << "): '" << plog << "' [err:" << err2 << "d]" << endl;
delete[] plog;
exit(1);
}
cl_kernel kernel = clCreateKernel(program, "render", &err);
assert(err == 0);
cl_ulong* host_image = new cl_ulong[max_item_dim](); // cannot be on stack!
size_t buffer_size = sizeof(cl_ulong) * max_item_dim;
cl_mem image = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, &err);
assert(err == 0);
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &image);
assert(err == 0);
cl_command_queue cmd_queue = clCreateCommandQueueWithProperties(context, devices[0], NULL, &err);
assert(err == 0);
size_t dev_wrk_size[1] = { n };
size_t dev_wrk_offs[1] = { 0 };
size_t loc_wrk_size[1] = { (size_t)max_item_dim };
// https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, dev_wrk_offs, dev_wrk_size, loc_wrk_size, 0, NULL, NULL);
assert(err == 0);
// Non-blocking read, so we can continue queuing up more kernels
err = clEnqueueReadBuffer(cmd_queue, image, CL_FALSE, 0, buffer_size, host_image, 0, NULL, NULL);
assert(err == 0);
err = clFinish(cmd_queue);
assert(err == 0);
for (int i = 0; i < 256; ++i) cout << '[' << i << ':' << host_image[i] << "]";
cout << '{' << n << '}' << endl;
for (int i = 0; i < 256; ++i) printf("[%d:%zd]", i, host_image[i]);
printf("{%zd}\nsize_t:%zd, cl_ulong:%zd\n", n, sizeof(size_t), sizeof(cl_ulong));
clReleaseMemObject(image);
clReleaseKernel(kernel);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(context);
delete[] host_image;
return CL_SUCCESS;
}
int main() {
runCL(10'000'000'000ULL);
return 0;
}
在内核中,最后一个 returned 项 (out[255]
) 包含 ulong
和 size_t
的大小组合为 8008,这似乎是可以的都是 8 字节长。
以及输出(我把重复的行删掉了):
CL_DEVICE_NAME [25]: Intel(R) HD Graphics 520
CL_DEVICE_VERSION [16]: OpenCL 3.0 NEO
CL_DRIVER_VERSION [14]: 30.0.101.1660
CL_DEVICE_EXTENSIONS [1654]: cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_command_queue_families cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_driver_diagnostics cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_intel_subgroups_char cl_intel_subgroups_long cl_khr_il_program cl_intel_mem_force_host_memory cl_khr_subgroup_extended_types cl_khr_subgroup_non_uniform_vote cl_khr_subgroup_ballot cl_khr_subgroup_non_uniform_arithmetic cl_khr_subgroup_shuffle cl_khr_subgroup_shuffle_relative cl_khr_subgroup_clustered_reduce cl_intel_device_attribute_query cl_khr_suggested_local_work_size cl_khr_fp64 cl_khr_subgroups cl_intel_spirv_device_side_avc_motion_estimation cl_intel_spirv_media_block_io cl_intel_spirv_subgroups cl_khr_spirv_no_integer_wrap_decoration cl_intel_unified_shared_memory_preview cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_advanced_motion_estimation cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_image2d_from_buffer cl_khr_depth_images cl_khr_3d_image_writes cl_intel_media_block_io cl_khr_gl_sharing cl_khr_gl_depth_images cl_khr_gl_event cl_khr_gl_msaa_sharing cl_intel_dx9_media_sharing cl_khr_dx9_media_sharing cl_khr_d3d10_sharing cl_khr_d3d11_sharing cl_intel_d3d11_nv12_media_sharing cl_intel_sharing_format_query cl_khr_pci_bus_info cl_intel_simultaneous_sharing
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 24
CL_DEVICE_MAX_WORK_GROUP_SIZE: 256
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 256 256 256
[0:4294965760][1:4294965761]<...>[253:4294966781][254:4294966782][255:8008]{10000000000}
[0:4294965760][1:4294965761]<...>[253:4294966781][254:4294966782][255:8008]{10000000000}
size_t:8, cl_ulong:8
我用 cout
和 printf
(%zd) 打印了结果,以确保不是 cout
导致问题。 :)
我是否应该在 clBuildProgram
或内核代码 (#pragma
) 中启用某些功能才能在内核端使用 64 位?
更新
我在内核代码中稍微修改了一下,统计了global_id(0)
的位数,好像一直是32,不是上面!
" int i = 0; for(; i<64 && gid;++i, gid>>=1);"
" out[lid] = val < i ? i : val;\n" // Not atomic!
所以,global_id(0) return似乎是一个 32 位值!
更新2
我将内核代码修改为size_t gid = get_local_id(0)+get_local_size(0)*get_group_id(0);
而不是size_t gid = get_global_id(0);
。
结果变为:
[0:9999999744][1:9999999745]<...>[253:9999998973][254:9999998974][255:8008]{10000000000}
size_t:8, cl_ulong:8
哪个看起来好多了!
我还做了一个测试来避免竞争条件,使用原子比较和交换更迂腐:
" size_t gid = get_global_id(0);\n"
" size_t lid = get_local_id(0);\n"
" //out[lid] = gid;\n" // Not atomic!
" ulong val_new, val_org = out[lid];\n"
" do {\n"
" val_new = val_org > gid ? val_org : gid;\n"
" } while (!atomic_compare_exchange_strong(out + lid, &val_org, val_new));\n"
结果相同(差):
[0:4294967040][1:4294967041]<...>[253:4294967293][254:4294967294][255:8008]{10000000000}
简而言之:支持64位寻址,如CL_DEVICE_ADDRESS_BITS: 64
所示。通常所有 OpenCL 设备都支持 64 位整数(C++ 中的unsigned long long int
,OpenCL C 中的ulong
)。 Intel HD 520甚至支持FP64双精度。
问题是您的内核中存在竞争条件,因为您没有使用原子。许多线程同时尝试写入 out[lid]
,哪个线程获胜是完全随机的。
这是 Nvidia GPU 和 Intel GPU 的输出。对于 Intel GPU,每次执行的行为都是随机的,但我偶尔会得到大于 4294966784
.
CL_DEVICE_NAME [24]: NVIDIA GeForce GTX 960M
CL_DEVICE_VERSION [16]: OpenCL 3.0 CUDA
CL_DRIVER_VERSION [7]: 511.79
CL_DEVICE_EXTENSIONS [606]: cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_d3d10_sharing cl_khr_d3d10_sharing cl_nv_d3d11_sharing cl_nv_copy_opts cl_nv_create_buffer cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_device_uuid cl_khr_pci_bus_info cl_khr_external_semaphore cl_khr_external_memory cl_khr_external_semaphore_win32 cl_khr_external_memory_win32
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 5
CL_DEVICE_MAX_WORK_GROUP_SIZE: 1024
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 1024 1024 64
[0:18446744073709546496][1:18446744073709546497]...[254:18446744073709549822][255:8008]{10000000000}
[0:-5120][1:-5119]...[254:-1794][255:8008]{10000000000}
size_t:8, cl_ulong:8
CL_DEVICE_NAME [26]: Intel(R) HD Graphics 4600
CL_DEVICE_VERSION [12]: OpenCL 1.2
CL_DRIVER_VERSION [14]: 20.19.15.4624
CL_DEVICE_EXTENSIONS [616]: cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_ctz cl_intel_d3d11_nv12_media_sharing cl_intel_dx9_media_sharing cl_intel_motion_estimation cl_intel_simultaneous_sharing cl_intel_subgroups cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_d3d10_sharing cl_khr_d3d11_sharing cl_khr_depth_images cl_khr_dx9_media_sharing cl_khr_gl_depth_images cl_khr_gl_event cl_khr_gl_msaa_sharing cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_gl_sharing cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_spir
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 20
CL_DEVICE_MAX_WORK_GROUP_SIZE: 512
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 512 512 512
[0:4294966784][1:2154343490416]...[254:4294966526][255:8008]{10000000000}
[0:4294966784][1:2154343490416]...[254:4294966526][255:8008]{10000000000}
size_t:8, cl_ulong:8
为了简化 OpenCL 开发,考虑这个 OpenCL-Wrapper。有了这个,您的代码(不修复竞争条件错误)明显更短且更具可读性:
int main() {
const ulong N = 10000000000ull;
Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device
Memory<ulong> image(device, 64u); // allocate memory on both host and device
Kernel kernel(device, N, "render", image); // kernel that runs on the device
kernel.run(); // run add_kernel on the device
image.read_from_device(); // copy data from device memory to host memory
for(int i=0; i<256; i++) print("["+to_string(i)+":"+to_string(image[i])+"]");
println("{"+to_string(N)+"}");
println("size_t:"+to_string(sizeof(size_t))+", cl_ulong:"+to_string(sizeof(cl_ulong)));
wait();
return 0;
}
#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R(
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################
__kernel void render(__global ulong * out) {
size_t gid = get_global_id(0);
size_t lid = get_local_id(0);
ulong val = out[lid];
out[lid] = val < gid ? gid : val; // RACE CONDITION here
if (lid == 255) out[lid] = sizeof(size_t) * 1000 + sizeof(ulong); // another race condition here
}
);} // ############################################################### end of OpenCL C code #####################################################################
我已将此问题报告给英特尔。他们回答>here<。简而言之答案:
some of our hardware counters that feed into the global ID calculation are limited to 32 bits, specifically the work-group ID. ...
解决方法,如果全局大小可以被局部大小整除。
// If you need the global offset:
size_t global_id_0 = get_group_id(0) * get_local_size(0) +
get_global_offset(0) + get_local_id(0);
// If you do not need the global offset:
size_t global_id_0 = get_group_id(0) * get_local_size(0) +
get_local_id(0);
如果不可整除,则代替get_local_size use get_enqueued_local_size。