OpenCL CL_INVALID_VALUE clEnqueueWriteBuffer 错误

Question

我正在尝试在 OpenCL 上制作算法运行。我正在使用 this 存储库 (Source.cpp) 作为模板。我现在想将整个程序转换为 long 算法类型，而不是 float。但是我总是在 CL_INVALID_VALUE (-30) 处遇到异常第二个clEnqueueWriteBuffer。我已经浪费了几个小时没有发现错误，所以也许我已经监督了一些明显的事情（我还没有对 opencl 做太多......）？

我的代码（不工作）

#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>
#include <CL/cl.h>

//#define DATA_SIZE 1024
#define DATA_SIZE 1024

using namespace std;


//$ /f/Tools/OCL_SDK_Light/lib/x86_64/opencl.lib blelloch_scan.cpp
const char* ProgramSource =
"__kernel void add(__global long *input, __global long *output, __global long *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"printf('%d',thid); \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid+1] = input[2*thid+1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid + 1)-1; \n"\
"int bi = offset*(2*thid + 2)-1; \n"\
"temp[bi] += temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; \n"\
"long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; }  \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid+1] = temp[2*thid+1]; \n"\
"}\n"\
"\n";
/*
*/




int main(void)
{
    cl_context context;
    cl_context_properties properties[3];

    cl_command_queue command_queue;
    cl_kernel kernel;
    cl_program program;
    cl_int err;
    cl_uint num_platforms = 0;
    cl_platform_id* platforms;
    cl_device_id device_id;
    cl_uint num_of_devices = 0;
    cl_mem inputA, inputB, output;
    size_t global, loc;
    std::cout << "Setup \n";

    long arr[DATA_SIZE];
    long inputDataA[DATA_SIZE];
    long results[2 * DATA_SIZE];
    long  i;
    for (i = 1; i < DATA_SIZE - 1;i++)
    {
        inputDataA[i-1] = (long)i;
        arr[i-1] = (long)i;
    }
    clock_t ends;

    /* --------------------- Get platform ---------------------*/
    cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
    assert(clResult == CL_SUCCESS);

    platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
    clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
    assert(clResult == CL_SUCCESS);
    /* --------------------- ------------ ---------------------*/


    /* --------------------- Get devices ---------------------*/
    cl_device_id* devices = NULL;
    cl_uint num_devices;

    clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    assert(clResult == CL_SUCCESS);

    devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);

    if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
    {
        printf("could not find device id");
    }
    assert(clResult == CL_SUCCESS);
    /* --------------------- ----------- ---------------------*/
    properties[0] = CL_CONTEXT_PLATFORM;
    properties[1] = 0;
    cl_int contextResult;
    context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
    assert(contextResult == CL_SUCCESS);
    // create command queue using the context and device
    command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);

    // create a program from the kernel source code
    program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);

    // compile the program
    if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
    {
        printf("Error building program\n");
        return 1;
    }

    // specify which kernel from the program to execute
    kernel = clCreateKernel(program, "add", &err);

    // create buffers for the input and ouput
    cl_int result;
    inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
    inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);

    // load data into the input buffer
    clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS);
    clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE , 0, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS); // ERROR HERE 
    clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, 0, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS);


    int temp = DATA_SIZE;

    clock_t start = clock();

    // set the argument list for the kernel command
    clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
    assert(clResult == CL_SUCCESS);


    clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    assert(clResult == CL_SUCCESS);


    clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
    assert(clResult == CL_SUCCESS);


    clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
    assert(clResult == CL_SUCCESS);


    global = DATA_SIZE; // num of processors
    loc = 256;

    printf("\n>> start parallel ---------- \n");
    // enqueue the kernel command for execution
    clResult = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS);

    // copy the results from out of the output buffer
    clResult = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, results, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS);


    clFinish(command_queue);
    ends = clock();

    // print the results
    int k = 1;
    for (k = 1;k < 8; k++)
    {
        printf("%d - ", k);
        printf("%d \n", results[k]);
    }
    double time_taken = ((double)(ends - start)) / CLK_TCK;
    printf("\n>>finished parallel in %lf seconds\n", time_taken);

    // cleanup - release OpenCL resources

    printf("\n-------------------------------------\n");


    /* -------sequential ------- */
    printf("\n>> start sequential ---------- \n");
    long prefixSum[DATA_SIZE] = { 0 };

    const clock_t startSequential = clock();

    prefixSum[0] = arr[0];
    long idx = 1;

    for (idx = 1; idx < DATA_SIZE; idx++) {
        prefixSum[idx] = prefixSum[idx - 1] + arr[idx];
    }
    const clock_t endSequential = clock();




    double seqTime = ((double)(endSequential - startSequential)) / CLK_TCK;

    printf("\n>> finished sequential in %lf\n", seqTime);
    for (int j = 0;j < 8; j++)
    {
        printf("%d - ", j);
        printf("%d \n", prefixSum[j]);
    }


    clReleaseMemObject(inputA);
    clReleaseMemObject(inputB);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);

    return 0;
}

存储库代码（有效）：

#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>

#include <CL/cl.h>

#define DATA_SIZE 1024
using namespace std;
ofstream outfile;


const char* ProgramSource =
"__kernel void add(__global float *input, __global float *output, __global float *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid+1] = input[2*thid+1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid + 1)-1; \n"\
"int bi = offset*(2*thid + 2)-1; \n"\
"temp[bi] += temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; \n"\
"float t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; }  \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid+1] = temp[2*thid+1]; \n"\
"}\n"\
"\n";
/*
*/



int main(void)
{
    cl_uint num_platforms = 0;

    cl_context context;
    cl_context_properties properties[3];
    cl_kernel kernel;
    cl_platform_id* platforms;
    cl_command_queue command_queue;
    cl_program program;
    cl_int err;
    cl_uint num_of_platforms = 0;
    cl_platform_id platform_id;
    cl_device_id device_id;
    cl_uint num_of_devices = 0;
    cl_mem inputA, inputB, output;
    outfile.open("shubham.txt");
    size_t global, loc;

    float inputDataA[DATA_SIZE];
    float results[2 * DATA_SIZE] = { 0 };

    int i;
    for (i = 0; i < DATA_SIZE;i++)
    {
        inputDataA[i] = (float)i;
    }
    clock_t start, ends;

    /* --------------------- Get platform ---------------------*/
    cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
    assert(clResult == CL_SUCCESS);

    platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
    clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
    assert(clResult == CL_SUCCESS);
    /* --------------------- ------------ ---------------------*/


    /* --------------------- Get devices ---------------------*/
    cl_device_id* devices = NULL;
    cl_uint num_devices;

    clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    assert(clResult == CL_SUCCESS);

    devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);

    if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
    {
        printf("could not find device id");
    }
    assert(clResult == CL_SUCCESS);
    /* --------------------- ----------- ---------------------*/
    properties[0] = CL_CONTEXT_PLATFORM;
    properties[1] = 0;
    cl_int contextResult;
    context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
    assert(contextResult == CL_SUCCESS);
    // create command queue using the context and device

    // create command queue using the context and device
    command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);

    // create a program from the kernel source code
    program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);

    // compile the program
    if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
    {
        printf("Error building program\n");
        return 1;
    }

    // specify which kernel from the program to execute
    kernel = clCreateKernel(program, "add", &err);

    // create buffers for the input and ouput

    inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
    inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);

    // load data into the input buffer
    clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
    clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
    clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);

    int temp = DATA_SIZE;

    start = clock();

    // set the argument list for the kernel command
    clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
    assert(clResult == CL_SUCCESS);

    clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    assert(clResult == CL_SUCCESS);

    clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
    assert(clResult == CL_SUCCESS);
    clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
    assert(clResult == CL_SUCCESS);

    global = DATA_SIZE;
    loc = 256;
    // enqueue the kernel command for execution
    clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
    clFinish(command_queue);

    // copy the results from out of the output buffer
    clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
    //clEnqueueReadBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) *16, shubh, 0, NULL, NULL);

    // print the results
    printf("output: ");

    for (i = 0;i < 5; i++)
    {
        printf("%f \n", results[i]);
        outfile << results[i] << " ";
    }
    ends = clock();
    double time_taken = ((double)(ends - start)) / CLK_TCK;
    outfile << endl << "Time taken is : " << time_taken << endl;

    clReleaseMemObject(inputA);
    clReleaseMemObject(inputB);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
    return 0;
}

提前致谢

Answer 1

我发现了你的错误。对我来说，它首先不会编译 OpenCL C 代码，所以我使用

进行调试

char info[1024];
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 1024*sizeof(char), (void*)info, NULL); // print build log
printf(info);

获取构建日志：

<kernel>:4:8: warning: multi-character character constant
printf('0',thid);
       ^
<kernel>:4:8: warning: incompatible integer to pointer conversion passing 'int' to parameter of type '__constant char *'
printf('190',thid);
       ^~~~
cl_kernel.h:4694:32: note: passing argument to parameter here
printf(constant char * restrict, ...) __asm("llvm.nvvm.internal.printf.cl");

问题似乎是 ' 而不是 \"。更改此行：

"printf(\"%d\",thid); \n"

然后 OpenCL C 代码编译，我可以重现 CL_INVALID_VALUE 错误。

问题出在这里：您使用 clEnqueueWriteBuffer 将数据从 inputB 复制到 0。您需要添加 C++ 数组以将数据复制到：

long inputDataA[DATA_SIZE];
long inputDataB[DATA_SIZE];
long outputData[DATA_SIZE];

和

clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataB, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // WORKS NOW
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, outputData, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);

然后它工作了，我得到了这个输出：

Setup
0

>> start parallel ----------
25625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035101234567891011121314151617181920212223242526272829303138438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441564656667686970717273747576777879808182838485868788899091929394953523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823839697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612741641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863932333435363738394041424344454647484950515253545556575859606162634484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784798328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628635445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745751921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222234804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105118648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948955125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425431281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581598008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308315765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066072242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542559609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909916406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706711601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901917687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987997367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667679929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210236726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027039289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589597047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347358968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269271 - 0
2 - 1
3 - 2
4 - 4
5 - 6
6 - 9
7 - 12

>>finished parallel in 0.023000 seconds

-------------------------------------

>> start sequential ----------

>> finished sequential in 0.000000
0 - 1
1 - 3
2 - 6
3 - 10
4 - 15
5 - 21
6 - 28
7 - 36

另请注意，在 OpenCL C 中，long = 64 位整数，但在 C++ 中，long =“至少”32 位整数，无论出于何种愚蠢的原因。在 C++ 中，您应该使用 long long int，因为这始终是 64 位整数。例如，您可以使用 typedef int64_t slong;，其中 int64_t 本身是 long long int.

的类型定义

另一个问题是程序不是确定性的。多次执行时，每次都会得到不同的结果。必须存在一些竞争条件。我想您错误地认为 barrier(CLK_GLOBAL_MEM_FENCE); 提供了所有线程的全局同步，但事实并非如此。唯一的全局同步就是在需要的同步点把内核拆分成多个内核，一个接一个地执行。

最后，为了使 C++ 中的 OpenCL 开发更容易，并防止在这种简单的错误上浪费时间，我创建了一个轻量级 OpenCL-Wrapper 来消除所有 OpenCL 代码开销。有了这个，你的代码就短了 4 倍并且更容易理解：

#include "opencl.hpp"

#define DATA_SIZE 1024

int main() {
    Clock clock;
    clock.start();

    Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device

    Memory<slong> arr(device, DATA_SIZE, 1u, true, false);
    Memory<slong> inputA(device, DATA_SIZE);
    Memory<slong> inputB(device, DATA_SIZE);
    Memory<slong> output(device, DATA_SIZE);

    for(int i=1; i<DATA_SIZE-1; i++) {
        inputA[i-1] = (slong)i;
        arr[i-1] = (slong)i;
    }
    inputA.write_to_device();
    
    Kernel kernel(device, DATA_SIZE, "add", inputA, output, inputB);
    kernel.add_constants(DATA_SIZE);

    kernel.run();
    output.read_from_device();

    double time_taken = clock.stop();

    // print the results
    for(int k=1; k<8; k++) {
        printf("%d - ", k);
        printf("%d \n", output[k]);
    }
    printf("\n>>finished parallel in %lf seconds\n", time_taken);
    printf("\n-------------------------------------\n");
    printf("\n>> start sequential ---------- \n");
    long prefixSum[DATA_SIZE] = { 0 };

    clock.start();
    prefixSum[0] = arr[0];
    for(long idx=1; idx<DATA_SIZE; idx++) {
        prefixSum[idx] = prefixSum[idx-1]+arr[idx];
    }
    double seqTime = clock.stop();

    printf("\n>> finished sequential in %lf\n", seqTime);
    for(int j=0; j<8; j++) {
        printf("%d - ", j);
        printf("%d \n", prefixSum[j]);
    }

    wait();
    return 0;
}

#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R(
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################

kernel void add(__global long* input, __global long* output, __global long* temp, int size) {
    int thid = get_global_id(0);
    int offset = 1;
    printf("%d",thid);
    temp[2*thid] = input[2*thid];
    temp[2*thid+1] = input[2*thid+1];
    for(int d= size>>1; d>0; d >>= 1) {
        barrier(CLK_GLOBAL_MEM_FENCE);
        if(thid < d) {
            int ai = offset*(2*thid + 1)-1;
            int bi = offset*(2*thid + 2)-1;
            temp[bi] += temp[ai];
        }
        offset = offset*2;
    }
    temp[size-1] = 0;
    barrier(CLK_GLOBAL_MEM_FENCE);
    for(int d = 1; d<size; d *= 2) {
        offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE);
        if(thid < d) {
            int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1;
            long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t;
        }
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    output[2*thid] = temp[2*thid];
    output[2*thid+1] = temp[2*thid+1];
}

);} // ############################################################### end of OpenCL C code #####################################################################

OpenCL CL_INVALID_VALUE clEnqueueWriteBuffer 错误

OpenCL CL_INVALID_VALUE Error on clEnqueueWriteBuffer

c++

algorithm

opencl