OpenCL:不支持 64 位 global_id() 吗?

OpenCL: Is 64 bit global_id() not supported?

我是 OpenCL 新手,我不能 return 来自已编译内核的 64 位值。我做错了什么?

我有一个 Intel(R) HD Graphics 520 显卡,我想编写一个处理 64 位值的算法。但是当全局 id 超过 4e12(更准确地说是 2^32-1)时,它似乎溢出了。似乎一切都是在 x64 中构建的。我用 Visual Studio 2019 编译,目标:x64。我已经安装了最新的 Intel 图形设备驱动程序 (30.0.101.1660)。它可以构建内核并且可以正常工作,除了它使用的是 32 位而不是 64 位!

谁能帮我看看我做错了什么?

这是我的代码。抱歉,有点长……我已经尽量短了。我知道,它有一些小问题(例如,没有使用原子写入)但这只是一个 POC 代码,并不像我预期的那样有效。 :(

#include <cstdio>
#include <cassert>
#include <iostream>
using namespace std;

#include <CL/opencl.h>

int runCL(const cl_ulong n) {
    cl_int err = 0;
    cl_uint num_platforms;
    cl_platform_id platforms[16]; // Can be on stack!
    err = clGetPlatformIDs(16, platforms, &num_platforms);
    assert(err == 0);
    assert(num_platforms);

    cl_uint num_devices;
    cl_device_id devices[16]; // Can be on stack!
    err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 16, devices, &num_devices);
    assert(err == 0);
    assert(num_devices);

#define PR_DEV_INFO(name, type) invoke([devices]()->type { type wrk; \
    cl_uint err = clGetDeviceInfo(devices[0], name, sizeof(wrk), (void*)&wrk, NULL);\
    assert(err == 0); cout << #name << ": " << wrk << endl; return wrk;})
#define PR_DEV_INFO_CHAR(name) invoke([devices]()->string { size_t size; \
    cl_uint err = clGetDeviceInfo(devices[0], name, 0, NULL, &size);\
    assert(err == 0); char* wrk = new char[size];\
    err = clGetDeviceInfo(devices[0], name, size, (void*)wrk, NULL);\
    assert(err == 0); string s(wrk); delete[] wrk;\
    cout << #name << " [" << size << "]: " << s << endl; return s;})
#define PR_DEV_INFO_ARR(name, type, len) invoke([devices](size_t arr_len)->void { \
    type *wrk = new type[arr_len]; \
    cl_uint err = clGetDeviceInfo(devices[0], name, sizeof(type)*arr_len, (void*)wrk, NULL);\
    assert(err == 0); cout << #name << ":";\
    for(int i=0; i<arr_len;++i) cout << ' ' << wrk[i]; cout << endl; delete[] wrk;}, len)

    PR_DEV_INFO_CHAR(CL_DEVICE_NAME);
    PR_DEV_INFO_CHAR(CL_DEVICE_VERSION);
    PR_DEV_INFO_CHAR(CL_DRIVER_VERSION);
    PR_DEV_INFO_CHAR(CL_DEVICE_EXTENSIONS);
    PR_DEV_INFO(CL_DEVICE_ADDRESS_BITS, cl_uint);
    PR_DEV_INFO(CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint);
    const size_t max_item_dim = 
        PR_DEV_INFO(CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t);
    cl_uint dims = PR_DEV_INFO(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint);
    PR_DEV_INFO_ARR(CL_DEVICE_MAX_WORK_ITEM_SIZES, size_t, dims);

    cl_context context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &err);
    assert(err == 0);

    string kernel_txt(
        "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
        "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
        "#pragma OPENCL EXTENSION cles_khr_int64 : enable\n"
        "__kernel void render(__global ulong * out) {\n"
        "  size_t gid = get_global_id(0);\n"
        "  size_t lid = get_local_id(0);\n"
        "  ulong val = out[lid];\n"
        "  out[lid] = val < gid ? gid : val;\n" // Not atomic!
        "  if (lid == 255) out[lid] = sizeof(size_t) * 1000 + sizeof(ulong);\n"
        "}\n");
    const char* kernel_mem = kernel_txt.c_str();
    // kernel_mem cannot be on stack
    cl_program program = clCreateProgramWithSource(context, 1, &kernel_mem, NULL, &err);
    assert(err == 0);

    //https://www.khronos.org/registry/OpenCL/specs/2.2/html/OpenCL_API.html#compiler-options
    const char* options = "-w -Werror -cl-std=CL3.0";
    err = clBuildProgram(program, num_devices, devices, options, NULL, NULL);
    if (err) {
        cerr << "Build error: " << err << endl;
        size_t size = 0;
        // Just get log size first, then read it again to the proper log
        cl_int err2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &size);
        char* plog = new char[size];
        err2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, size, plog, &size);
        cerr << "Build log (size: " << size << "): '" << plog << "' [err:" << err2 << "d]" << endl;
        delete[] plog;
        exit(1);
    }
    cl_kernel kernel = clCreateKernel(program, "render", &err);
    assert(err == 0);

    cl_ulong* host_image = new cl_ulong[max_item_dim](); // cannot be on stack!
    size_t buffer_size = sizeof(cl_ulong) * max_item_dim;

    cl_mem image = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, &err);
    assert(err == 0);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &image);
    assert(err == 0);

    cl_command_queue cmd_queue = clCreateCommandQueueWithProperties(context, devices[0], NULL, &err);
    assert(err == 0);

    size_t dev_wrk_size[1] = { n };
    size_t dev_wrk_offs[1] = { 0 };
    size_t loc_wrk_size[1] = { (size_t)max_item_dim };
    // https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
    err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, dev_wrk_offs, dev_wrk_size, loc_wrk_size, 0, NULL, NULL);
    assert(err == 0);

    // Non-blocking read, so we can continue queuing up more kernels
    err = clEnqueueReadBuffer(cmd_queue, image, CL_FALSE, 0, buffer_size, host_image, 0, NULL, NULL);
    assert(err == 0);

    err = clFinish(cmd_queue);
    assert(err == 0);

    for (int i = 0; i < 256; ++i) cout << '[' << i << ':' << host_image[i] << "]";
    cout << '{' << n << '}' << endl;
    for (int i = 0; i < 256; ++i) printf("[%d:%zd]", i, host_image[i]);
    printf("{%zd}\nsize_t:%zd, cl_ulong:%zd\n", n, sizeof(size_t), sizeof(cl_ulong));

    clReleaseMemObject(image);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(cmd_queue);
    clReleaseContext(context);
    delete[] host_image;

    return CL_SUCCESS;
}

int main() {
    runCL(10'000'000'000ULL);
    return 0;
}

在内核中,最后一个 returned 项 (out[255]) 包含 ulongsize_t 的大小组合为 8008,这似乎是可以的都是 8 字节长。

以及输出(我把重复的行删掉了):

CL_DEVICE_NAME [25]: Intel(R) HD Graphics 520
CL_DEVICE_VERSION [16]: OpenCL 3.0 NEO
CL_DRIVER_VERSION [14]: 30.0.101.1660
CL_DEVICE_EXTENSIONS [1654]: cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_command_queue_families cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_driver_diagnostics cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_intel_subgroups_char cl_intel_subgroups_long cl_khr_il_program cl_intel_mem_force_host_memory cl_khr_subgroup_extended_types cl_khr_subgroup_non_uniform_vote cl_khr_subgroup_ballot cl_khr_subgroup_non_uniform_arithmetic cl_khr_subgroup_shuffle cl_khr_subgroup_shuffle_relative cl_khr_subgroup_clustered_reduce cl_intel_device_attribute_query cl_khr_suggested_local_work_size cl_khr_fp64 cl_khr_subgroups cl_intel_spirv_device_side_avc_motion_estimation cl_intel_spirv_media_block_io cl_intel_spirv_subgroups cl_khr_spirv_no_integer_wrap_decoration cl_intel_unified_shared_memory_preview cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_advanced_motion_estimation cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_image2d_from_buffer cl_khr_depth_images cl_khr_3d_image_writes cl_intel_media_block_io cl_khr_gl_sharing cl_khr_gl_depth_images cl_khr_gl_event cl_khr_gl_msaa_sharing cl_intel_dx9_media_sharing cl_khr_dx9_media_sharing cl_khr_d3d10_sharing cl_khr_d3d11_sharing cl_intel_d3d11_nv12_media_sharing cl_intel_sharing_format_query cl_khr_pci_bus_info cl_intel_simultaneous_sharing
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 24
CL_DEVICE_MAX_WORK_GROUP_SIZE: 256
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 256 256 256
[0:4294965760][1:4294965761]<...>[253:4294966781][254:4294966782][255:8008]{10000000000}
[0:4294965760][1:4294965761]<...>[253:4294966781][254:4294966782][255:8008]{10000000000}
size_t:8, cl_ulong:8

我用 coutprintf (%zd) 打印了结果,以确保不是 cout 导致问题。 :)

我是否应该在 clBuildProgram 或内核代码 (#pragma) 中启用某些功能才能在内核端使用 64 位?

更新

我在内核代码中稍微修改了一下,统计了global_id(0)的位数,好像一直是32,不是上面!

"  int i = 0; for(; i<64 && gid;++i, gid>>=1);"
"  out[lid] = val < i ? i : val;\n" // Not atomic!

所以,global_id(0) return似乎是一个 32 位值!

更新2

我将内核代码修改为size_t gid = get_local_id(0)+get_local_size(0)*get_group_id(0);而不是size_t gid = get_global_id(0);

结果变为:

[0:9999999744][1:9999999745]<...>[253:9999998973][254:9999998974][255:8008]{10000000000}
size_t:8, cl_ulong:8

哪个看起来好多了!

我还做了一个测试来避免竞争条件,使用原子比较和交换更迂腐:

"  size_t gid = get_global_id(0);\n"
"  size_t lid = get_local_id(0);\n"
"  //out[lid] = gid;\n" // Not atomic!
"  ulong val_new, val_org = out[lid];\n"
"  do {\n"
"    val_new = val_org > gid ? val_org : gid;\n"
"  } while (!atomic_compare_exchange_strong(out + lid, &val_org, val_new));\n"

结果相同(差):

[0:4294967040][1:4294967041]<...>[253:4294967293][254:4294967294][255:8008]{10000000000}

简而言之:支持64位寻址,如CL_DEVICE_ADDRESS_BITS: 64所示。通常所有 OpenCL 设备都支持 64 位整数(C++ 中的unsigned long long int,OpenCL C 中的ulong)。 Intel HD 520甚至支持FP64双精度。

问题是您的内核中存在竞争条件,因为您没有使用原子。许多线程同时尝试写入 out[lid],哪个线程获胜是完全随机的。

这是 Nvidia GPU 和 Intel GPU 的输出。对于 Intel GPU,每次执行的行为都是随机的,但我偶尔会得到大于 4294966784.

的值
CL_DEVICE_NAME [24]: NVIDIA GeForce GTX 960M
CL_DEVICE_VERSION [16]: OpenCL 3.0 CUDA
CL_DRIVER_VERSION [7]: 511.79
CL_DEVICE_EXTENSIONS [606]: cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_d3d10_sharing cl_khr_d3d10_sharing cl_nv_d3d11_sharing cl_nv_copy_opts cl_nv_create_buffer cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_device_uuid cl_khr_pci_bus_info cl_khr_external_semaphore cl_khr_external_memory cl_khr_external_semaphore_win32 cl_khr_external_memory_win32
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 5
CL_DEVICE_MAX_WORK_GROUP_SIZE: 1024
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 1024 1024 64
[0:18446744073709546496][1:18446744073709546497]...[254:18446744073709549822][255:8008]{10000000000}
[0:-5120][1:-5119]...[254:-1794][255:8008]{10000000000}
size_t:8, cl_ulong:8
CL_DEVICE_NAME [26]: Intel(R) HD Graphics 4600
CL_DEVICE_VERSION [12]: OpenCL 1.2
CL_DRIVER_VERSION [14]: 20.19.15.4624
CL_DEVICE_EXTENSIONS [616]: cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_ctz cl_intel_d3d11_nv12_media_sharing cl_intel_dx9_media_sharing cl_intel_motion_estimation cl_intel_simultaneous_sharing cl_intel_subgroups cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_d3d10_sharing cl_khr_d3d11_sharing cl_khr_depth_images cl_khr_dx9_media_sharing cl_khr_gl_depth_images cl_khr_gl_event cl_khr_gl_msaa_sharing cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_gl_sharing cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_spir
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 20
CL_DEVICE_MAX_WORK_GROUP_SIZE: 512
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 512 512 512
[0:4294966784][1:2154343490416]...[254:4294966526][255:8008]{10000000000}
[0:4294966784][1:2154343490416]...[254:4294966526][255:8008]{10000000000}
size_t:8, cl_ulong:8

为了简化 OpenCL 开发,考虑这个 OpenCL-Wrapper。有了这个,您的代码(不修复竞争条件错误)明显更短且更具可读性:

int main() {
    const ulong N = 10000000000ull;

    Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device
    Memory<ulong> image(device, 64u); // allocate memory on both host and device
    Kernel kernel(device, N, "render", image); // kernel that runs on the device

    kernel.run(); // run add_kernel on the device
    image.read_from_device(); // copy data from device memory to host memory

    for(int i=0; i<256; i++) print("["+to_string(i)+":"+to_string(image[i])+"]");
    println("{"+to_string(N)+"}");
    println("size_t:"+to_string(sizeof(size_t))+", cl_ulong:"+to_string(sizeof(cl_ulong)));

    wait();
    return 0;
}
#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R(
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################

__kernel void render(__global ulong * out) {
  size_t gid = get_global_id(0);
  size_t lid = get_local_id(0);
  ulong val = out[lid];
  out[lid] = val < gid ? gid : val; // RACE CONDITION here
  if (lid == 255) out[lid] = sizeof(size_t) * 1000 + sizeof(ulong); // another race condition here
}

);} // ############################################################### end of OpenCL C code #####################################################################

我已将此问题报告给英特尔。他们回答>here<。简而言之答案:

some of our hardware counters that feed into the global ID calculation are limited to 32 bits, specifically the work-group ID. ...

解决方法,如果全局大小可以被局部大小整除。

// If you need the global offset:
size_t global_id_0 = get_group_id(0) * get_local_size(0) + 
get_global_offset(0) + get_local_id(0);

// If you do not need the global offset:
size_t global_id_0 = get_group_id(0) * get_local_size(0) + 
get_local_id(0);

如果不可整除,则代替get_local_size use get_enqueued_local_size