OpenCL:clCreateProgramWithSource 处的分段错误

OpenCL: Segmentation fault at clCreateProgramWithSource

我是 OpenCL 的新手。我创建了一个简单的工作代码来执行 A = AB + C,并用它来完成我的实际任务,即读取原始图像文件并对其进行去拜耳处理。但是修改后的代码有时会在 clCreateProgramWithSource 处失败,错误代码为 -6。变化很小,我与差异进行了比较,但在我的一生中,我无法弄清楚为什么我在失败的代码中出错。所以这是工作代码:

#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <string.h>
#include <CL/cl.h>

#define LIST_SIZE   1024

int main()
{
    // We need an object to hold the program
    cl_program prog = NULL;
    // First step is to load the kernel in a local memory
    FILE *kfile;
    char *kfilename = "./src/mad_kernel.cl";

    kfile = fopen(kfilename, "r");
    fseek(kfile, 0L, SEEK_END);
    int kfilesize = ftell(kfile) + 1;
    rewind(kfile);
    char *kernel = malloc(kfilesize);
    memset(kernel, 0, kfilesize);
    fread(kernel, 1, kfilesize, kfile);
    fclose(kfile);
    // printf("%s", kernel); // Test passed
    
    // We want to get the platform ID
    cl_platform_id pid = NULL;
    // We need some variables for holding the returned values
    cl_uint num_platforms;
    cl_int ret = clGetPlatformIDs(1, &pid, &num_platforms);
    printf("Line %d err = %d\n", __LINE__, ret);
    
    // We need a cl device ID
    cl_device_id dev_id = NULL;
    cl_uint num_devices;
    ret = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 1, &dev_id, &num_devices);
    printf("Line %d err = %d\n", __LINE__, ret);

    // Creating a queue to pass our tasks
    // ------------------------------------------------------------------------
    // We need a context for the cl application to run in
    cl_context context = clCreateContext(NULL, 1, &dev_id, NULL, NULL, &ret);
    printf("Line %d err = %d\n", __LINE__, ret);

    // We need a command queue to send our commands to
    cl_command_queue cmd_queue = clCreateCommandQueue(context, dev_id, 0, &ret);
    printf("Line %d err = %d\n", __LINE__, ret);
    // ------------------------------------------------------------------------

    // We need objects to hold the data
    // We are going to perform A = A*B + C
    int *A = malloc(LIST_SIZE * sizeof(int));
    int *B = malloc(LIST_SIZE * sizeof(int));
    int *C = malloc(LIST_SIZE * sizeof(int));

    for (int i = 0; i < LIST_SIZE; i++) {
        A[i] = 2;
        B[i] = 3;
        C[i] = 5;
    }
    
    cl_mem vec_A = clCreateBuffer(context, CL_MEM_READ_WRITE, LIST_SIZE*sizeof(int), 
                        NULL, &ret);
    printf("Line %d err = %d\n", __LINE__, ret);
    cl_mem vec_B = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE*sizeof(int), 
                        NULL, &ret); 
    printf("Line %d err = %d\n", __LINE__, ret);
    cl_mem vec_C = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE*sizeof(int), 
                        NULL, &ret);
    printf("Line %d err = %d\n", __LINE__, ret);


    // Pass A, B and C to the CL memory
    ret = clEnqueueWriteBuffer(cmd_queue, vec_A, CL_TRUE,
                0, LIST_SIZE*sizeof(int), A, 0, NULL, NULL);
    printf("Line %d err = %d\n", __LINE__, ret);
    ret = clEnqueueWriteBuffer(cmd_queue, vec_B, CL_TRUE,
                0, LIST_SIZE*sizeof(int), B, 0, NULL, NULL);
    printf("Line %d err = %d\n", __LINE__, ret);
    ret = clEnqueueWriteBuffer(cmd_queue, vec_C, CL_TRUE,
                0, LIST_SIZE*sizeof(int), C, 0, NULL, NULL);
    printf("Line %d err = %d\n", __LINE__, ret);

    // We're ready to create the kernel now
    prog = clCreateProgramWithSource(context, 1, (const char **)&kernel, 
                (const size_t *)&kfilesize, &ret);
    printf("Line %d err = %d\n", __LINE__, ret);
    ret = clBuildProgram(prog, 1, &dev_id, NULL, NULL, NULL);
    printf("Line %d err = %d\n", __LINE__, ret);
    cl_kernel kern = clCreateKernel(prog, "vector_mad", &ret);
    printf("Line %d err = %d\n", __LINE__, ret);

    // Set the created memory as arguments to the kernel
    ret = clSetKernelArg(kern, 0, sizeof(cl_mem), &vec_A);
    printf("Line %d err = %d\n", __LINE__, ret);
    ret = clSetKernelArg(kern, 1, sizeof(cl_mem), &vec_B);
    printf("Line %d err = %d\n", __LINE__, ret);
    ret = clSetKernelArg(kern, 2, sizeof(cl_mem), &vec_C);
    printf("Line %d err = %d\n", __LINE__, ret);

    // Tell the program size of the entire object
    size_t global_item_size = LIST_SIZE;
    // Each chunk size
    size_t local_item_size = 64;

    ret = clEnqueueNDRangeKernel(cmd_queue, kern, 1, NULL, &global_item_size, 
                &local_item_size, 0, NULL, NULL);
    printf("Line %d err = %d\n", __LINE__, ret);
    
    // Read memory back from CL
    ret = clEnqueueReadBuffer(cmd_queue, vec_A, CL_TRUE, 0, LIST_SIZE * sizeof(int), A, 
                0, NULL, NULL);
    printf("Line %d err = %d\n", __LINE__, ret);

    int flag = 1;
    for (int i = 0; i < LIST_SIZE; i++) {
        // printf("%d\n", A[i]);
        flag &= A[i] == 11;
    }

    if (flag) {
        printf("Pass\n");
    } else {
        printf("Fail\n");
    }

    // Clean up
    ret = clFlush(cmd_queue);
    ret = clFinish(cmd_queue);
    ret = clReleaseKernel(kern);
    ret = clReleaseProgram(prog);
    ret = clReleaseMemObject(vec_A);
    ret = clReleaseMemObject(vec_B);
    ret = clReleaseMemObject(vec_C);
    ret = clReleaseCommandQueue(cmd_queue);
    ret = clReleaseContext(context);
    free(A);
    free(B);
    free(C);
    return 0;
}

内核:

__kernel void vector_mad(__global int *A, __global int *B, __global int *C)
{
    int i = get_global_id(0);

    A[i] = A[i] * B[i] + C[i];
}

现在,我想读入其中一个缓冲区中的原始图像并进行初始测试,只需向输出 RGBA 右侧输出一个常量值,因此我将此程序修改为:

#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <string.h>
#include <CL/cl.h>

int main()
{
    // We need an object to hold the program
    cl_program prog = NULL;
    // First step is to load the kernel in a local memory
    FILE *kfile;
    char *kfilename = "./src/mad_kernel.cl";

    kfile = fopen(kfilename, "r");
    fseek(kfile, 0L, SEEK_END);
    int kfilesize = ftell(kfile) + 1;
    rewind(kfile);
    char *kernel = malloc(kfilesize);
    memset(kernel, 0, kfilesize);
    fread(kernel, 1, kfilesize, kfile);
    fclose(kfile);
    // printf("%s", kernel); // Test passed
    
    // We want to get the platform ID
    cl_platform_id pid = NULL;
    // We need some variables for holding the returned values
    cl_uint num_platforms;
    cl_int ret = clGetPlatformIDs(1, &pid, &num_platforms);
    printf("Line %d err = %d\n", __LINE__, ret);
    
    // We need a cl device ID
    cl_device_id dev_id = NULL;
    cl_uint num_devices;
    ret = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 1, &dev_id, &num_devices);
    printf("Line %d err = %d\n", __LINE__, ret);

    // Creating a queue to pass our tasks
    // ------------------------------------------------------------------------
    // We need a context for the cl application to run in
    cl_context context = clCreateContext(NULL, 1, &dev_id, NULL, NULL, &ret);
    printf("Line %d err = %d\n", __LINE__, ret);

    // We need a command queue to send our commands to
    cl_command_queue cmd_queue = clCreateCommandQueue(context, dev_id, 0, &ret);
    printf("Line %d err = %d\n", __LINE__, ret);
    // ------------------------------------------------------------------------

    size_t imgsize = 16*16;
    // We need objects to hold the data
    // We are going to perform A = A*B + C
    int *A = malloc(imgsize * sizeof(int));
    int *B = malloc(imgsize * sizeof(int));

    //FILE *img = fopen("./data/testimage.raw", "rb");
    //fread(A, sizeof(cl_ushort), imgsize, img);
    
    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++) {
            printf("%x ", *(A + (16*i) + j));
        }
        printf("\n");
    }    
    
    cl_mem vec_A = clCreateBuffer(context, CL_MEM_READ_ONLY, imgsize*sizeof(int), 
                        NULL, &ret);
    printf("Line %d err = %d\n", __LINE__, ret);
    cl_mem vec_B = clCreateBuffer(context, CL_MEM_WRITE_ONLY, imgsize*sizeof(int), 
                        NULL, &ret); 
    printf("Line %d err = %d\n", __LINE__, ret);

    // Pass A, B and C to the CL memory
    ret = clEnqueueWriteBuffer(cmd_queue, vec_A, CL_TRUE,
                0, imgsize * sizeof(int), A, 0, NULL, NULL);
    printf("Line %d err = %d\n", __LINE__, ret);
    ret = clEnqueueWriteBuffer(cmd_queue, vec_B, CL_TRUE,
                0, imgsize * sizeof(int), B, 0, NULL, NULL);
    printf("Line %d err = %d\n", __LINE__, ret);

    // We're ready to create the kernel now
    prog = clCreateProgramWithSource(context, 1, (const char **)&kernel, 
                (const size_t *)&kfilesize, &ret);
    printf("Line %d err = %d\n", __LINE__, ret);
    ret = clBuildProgram(prog, 1, &dev_id, NULL, NULL, NULL);
    printf("Line %d err = %d\n", __LINE__, ret);
    cl_kernel kern = clCreateKernel(prog, "vector_mad", &ret);
    printf("Line %d err = %d\n", __LINE__, ret);

    // Set the created memory as arguments to the kernel
    ret = clSetKernelArg(kern, 0, sizeof(cl_mem), &vec_A);
    printf("Line %d err = %d\n", __LINE__, ret);
    ret = clSetKernelArg(kern, 1, sizeof(cl_mem), &vec_B);
    printf("Line %d err = %d\n", __LINE__, ret);

    // Tell the program size of the entire object
    size_t global_item_size = imgsize;
    // Each chunk size
    size_t local_item_size = 64;

    ret = clEnqueueNDRangeKernel(cmd_queue, kern, 1, NULL, &global_item_size, 
                &local_item_size, 0, NULL, NULL);
    printf("Line %d err = %d\n", __LINE__, ret);
    
    // Read memory back from CL
    cl_uint *C = malloc(imgsize * sizeof(int));
    ret = clEnqueueReadBuffer(cmd_queue, vec_B, CL_TRUE, 0, imgsize * sizeof(int), C, 
                0, NULL, NULL);
    printf("Line %d err = %d\n", __LINE__, ret);

    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++) {
            printf("%x", *(C + (16*i) + j));
        }
        printf("\n");
    }

    // Clean up
    ret = clFlush(cmd_queue);
    ret = clFinish(cmd_queue);
    ret = clReleaseKernel(kern);
    ret = clReleaseProgram(prog);
    ret = clReleaseMemObject(vec_A);
    ret = clReleaseMemObject(vec_B);
    ret = clReleaseCommandQueue(cmd_queue);
    ret = clReleaseContext(context);
    free(A);
    free(B);
    free(C);
    return 0;
}

但它没有用,所以我什至注释掉了读取的文件,向量 A 现在只包含垃圾。但是代码在以下行失败:

    prog = clCreateProgramWithSource(context, 1, (const char **)&kernel, 
                (const size_t *)&kfilesize, &ret);

我有一个 Intel CPU 和 NVidia GPU,所以我尝试在两个平台上编译: 在英特尔:

PS E:\OpenCL_SoftISP\image_proc\ocldemo> gcc -g2 -I"C:\Program Files (x86)\IntelSWTools\system_studio_2020\OpenCL\sdk\include" -L"C:\Program Files (x86)\IntelSWTools\system_studio_2020\OpenCL\sdk\lib\x64" -DCL_TARGET_OPENCL_VERSION=120 src/improc.c -o improc -lOpenCL
PS E:\OpenCL_SoftISP\image_proc\ocldemo> .\improc.exe

当我 运行 使用 gdb 时,它在以下位置失败:

Thread 1 hit Breakpoint 1, main () at src\improc.c:81
81          prog = clCreateProgramWithSource(context, 1, (const char **)&kernel,
(gdb) c
Continuing.

Thread 1 received signal SIGSEGV, Segmentation fault.
0x00007ffcd81699ca in nvopencl64!cuGetExportTable () from C:\Windows\System32\DriverStore\FileRepository\nvamui.inf_amd64_5f5d5675b52c555b\nvopencl64.dll

在 Nvidia 上:

PS E:\OpenCL_SoftISP\image_proc\ocldemo> gcc -g2 -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6\include" -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6\lib\x64" -DCL_TARGET_OPENCL_VERSION=120 src/improc.c -o improc -lOpenCL
PS E:\OpenCL_SoftISP\image_proc\ocldemo> .\improc.exe

当我 运行 使用 GDB 时,它在以下位置中断:

Thread 1 hit Breakpoint 2, main () at src\improc.c:81
81          prog = clCreateProgramWithSource(context, 1, (const char **)&kernel,
(gdb) s
[New Thread 16688.0x4918]

Thread 1 received signal SIGSEGV, Segmentation fault.
0x00007ffcd81699ca in nvopencl64!cuGetExportTable () from C:\Windows\System32\DriverStore\FileRepository\nvamui.inf_amd64_5f5d5675b52c555b\nvopencl64.dll

现在有趣的部分是,如果我真的读取文件:

    FILE *img = fopen("./data/testimage.raw", "rb");
    fread(A, sizeof(cl_ushort), imgsize, img);

我实际上得到了不同的输出

Line 67 err = 0
Line 70 err = 0
Line 75 err = 0
Line 78 err = 0
Line 83 err = -6
Line 85 err = -44
Line 87 err = -44
Line 91 err = -48
Line 93 err = -48
Line 102 err = -48
Line 108 err = 0

我看到的错误-6是:

CL_OUT_OF_HOST_MEMORY                       -6

这仍然表明我一定在进行一些非法的内存访问。但我要疯了,因为它发生在哪里?

问题是您给它的地址 int kfilesizeclCreateProgramWithSource() 期望地址 size_t。它最终从一个 4 字节整数的地址读取一个 8 字节整数。 IOW 它从堆栈中读取一些垃圾,并认为您正在提供一些(可能)billion-bytes 长源代码。