OpenCL:clCreateProgramWithSource 处的分段错误
OpenCL: Segmentation fault at clCreateProgramWithSource
我是 OpenCL 的新手。我创建了一个简单的工作代码来执行 A = AB + C,并用它来完成我的实际任务,即读取原始图像文件并对其进行去拜耳处理。但是修改后的代码有时会在 clCreateProgramWithSource
处失败,错误代码为 -6。变化很小,我与差异进行了比较,但在我的一生中,我无法弄清楚为什么我在失败的代码中出错。所以这是工作代码:
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <string.h>
#include <CL/cl.h>
#define LIST_SIZE 1024
int main()
{
// We need an object to hold the program
cl_program prog = NULL;
// First step is to load the kernel in a local memory
FILE *kfile;
char *kfilename = "./src/mad_kernel.cl";
kfile = fopen(kfilename, "r");
fseek(kfile, 0L, SEEK_END);
int kfilesize = ftell(kfile) + 1;
rewind(kfile);
char *kernel = malloc(kfilesize);
memset(kernel, 0, kfilesize);
fread(kernel, 1, kfilesize, kfile);
fclose(kfile);
// printf("%s", kernel); // Test passed
// We want to get the platform ID
cl_platform_id pid = NULL;
// We need some variables for holding the returned values
cl_uint num_platforms;
cl_int ret = clGetPlatformIDs(1, &pid, &num_platforms);
printf("Line %d err = %d\n", __LINE__, ret);
// We need a cl device ID
cl_device_id dev_id = NULL;
cl_uint num_devices;
ret = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 1, &dev_id, &num_devices);
printf("Line %d err = %d\n", __LINE__, ret);
// Creating a queue to pass our tasks
// ------------------------------------------------------------------------
// We need a context for the cl application to run in
cl_context context = clCreateContext(NULL, 1, &dev_id, NULL, NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// We need a command queue to send our commands to
cl_command_queue cmd_queue = clCreateCommandQueue(context, dev_id, 0, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// ------------------------------------------------------------------------
// We need objects to hold the data
// We are going to perform A = A*B + C
int *A = malloc(LIST_SIZE * sizeof(int));
int *B = malloc(LIST_SIZE * sizeof(int));
int *C = malloc(LIST_SIZE * sizeof(int));
for (int i = 0; i < LIST_SIZE; i++) {
A[i] = 2;
B[i] = 3;
C[i] = 5;
}
cl_mem vec_A = clCreateBuffer(context, CL_MEM_READ_WRITE, LIST_SIZE*sizeof(int),
NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
cl_mem vec_B = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE*sizeof(int),
NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
cl_mem vec_C = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE*sizeof(int),
NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// Pass A, B and C to the CL memory
ret = clEnqueueWriteBuffer(cmd_queue, vec_A, CL_TRUE,
0, LIST_SIZE*sizeof(int), A, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clEnqueueWriteBuffer(cmd_queue, vec_B, CL_TRUE,
0, LIST_SIZE*sizeof(int), B, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clEnqueueWriteBuffer(cmd_queue, vec_C, CL_TRUE,
0, LIST_SIZE*sizeof(int), C, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
// We're ready to create the kernel now
prog = clCreateProgramWithSource(context, 1, (const char **)&kernel,
(const size_t *)&kfilesize, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clBuildProgram(prog, 1, &dev_id, NULL, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
cl_kernel kern = clCreateKernel(prog, "vector_mad", &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// Set the created memory as arguments to the kernel
ret = clSetKernelArg(kern, 0, sizeof(cl_mem), &vec_A);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clSetKernelArg(kern, 1, sizeof(cl_mem), &vec_B);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clSetKernelArg(kern, 2, sizeof(cl_mem), &vec_C);
printf("Line %d err = %d\n", __LINE__, ret);
// Tell the program size of the entire object
size_t global_item_size = LIST_SIZE;
// Each chunk size
size_t local_item_size = 64;
ret = clEnqueueNDRangeKernel(cmd_queue, kern, 1, NULL, &global_item_size,
&local_item_size, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
// Read memory back from CL
ret = clEnqueueReadBuffer(cmd_queue, vec_A, CL_TRUE, 0, LIST_SIZE * sizeof(int), A,
0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
int flag = 1;
for (int i = 0; i < LIST_SIZE; i++) {
// printf("%d\n", A[i]);
flag &= A[i] == 11;
}
if (flag) {
printf("Pass\n");
} else {
printf("Fail\n");
}
// Clean up
ret = clFlush(cmd_queue);
ret = clFinish(cmd_queue);
ret = clReleaseKernel(kern);
ret = clReleaseProgram(prog);
ret = clReleaseMemObject(vec_A);
ret = clReleaseMemObject(vec_B);
ret = clReleaseMemObject(vec_C);
ret = clReleaseCommandQueue(cmd_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
内核:
__kernel void vector_mad(__global int *A, __global int *B, __global int *C)
{
int i = get_global_id(0);
A[i] = A[i] * B[i] + C[i];
}
现在,我想读入其中一个缓冲区中的原始图像并进行初始测试,只需向输出 RGBA 右侧输出一个常量值,因此我将此程序修改为:
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <string.h>
#include <CL/cl.h>
int main()
{
// We need an object to hold the program
cl_program prog = NULL;
// First step is to load the kernel in a local memory
FILE *kfile;
char *kfilename = "./src/mad_kernel.cl";
kfile = fopen(kfilename, "r");
fseek(kfile, 0L, SEEK_END);
int kfilesize = ftell(kfile) + 1;
rewind(kfile);
char *kernel = malloc(kfilesize);
memset(kernel, 0, kfilesize);
fread(kernel, 1, kfilesize, kfile);
fclose(kfile);
// printf("%s", kernel); // Test passed
// We want to get the platform ID
cl_platform_id pid = NULL;
// We need some variables for holding the returned values
cl_uint num_platforms;
cl_int ret = clGetPlatformIDs(1, &pid, &num_platforms);
printf("Line %d err = %d\n", __LINE__, ret);
// We need a cl device ID
cl_device_id dev_id = NULL;
cl_uint num_devices;
ret = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 1, &dev_id, &num_devices);
printf("Line %d err = %d\n", __LINE__, ret);
// Creating a queue to pass our tasks
// ------------------------------------------------------------------------
// We need a context for the cl application to run in
cl_context context = clCreateContext(NULL, 1, &dev_id, NULL, NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// We need a command queue to send our commands to
cl_command_queue cmd_queue = clCreateCommandQueue(context, dev_id, 0, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// ------------------------------------------------------------------------
size_t imgsize = 16*16;
// We need objects to hold the data
// We are going to perform A = A*B + C
int *A = malloc(imgsize * sizeof(int));
int *B = malloc(imgsize * sizeof(int));
//FILE *img = fopen("./data/testimage.raw", "rb");
//fread(A, sizeof(cl_ushort), imgsize, img);
for (int i = 0; i < 16; i++) {
for (int j = 0; j < 16; j++) {
printf("%x ", *(A + (16*i) + j));
}
printf("\n");
}
cl_mem vec_A = clCreateBuffer(context, CL_MEM_READ_ONLY, imgsize*sizeof(int),
NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
cl_mem vec_B = clCreateBuffer(context, CL_MEM_WRITE_ONLY, imgsize*sizeof(int),
NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// Pass A, B and C to the CL memory
ret = clEnqueueWriteBuffer(cmd_queue, vec_A, CL_TRUE,
0, imgsize * sizeof(int), A, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clEnqueueWriteBuffer(cmd_queue, vec_B, CL_TRUE,
0, imgsize * sizeof(int), B, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
// We're ready to create the kernel now
prog = clCreateProgramWithSource(context, 1, (const char **)&kernel,
(const size_t *)&kfilesize, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clBuildProgram(prog, 1, &dev_id, NULL, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
cl_kernel kern = clCreateKernel(prog, "vector_mad", &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// Set the created memory as arguments to the kernel
ret = clSetKernelArg(kern, 0, sizeof(cl_mem), &vec_A);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clSetKernelArg(kern, 1, sizeof(cl_mem), &vec_B);
printf("Line %d err = %d\n", __LINE__, ret);
// Tell the program size of the entire object
size_t global_item_size = imgsize;
// Each chunk size
size_t local_item_size = 64;
ret = clEnqueueNDRangeKernel(cmd_queue, kern, 1, NULL, &global_item_size,
&local_item_size, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
// Read memory back from CL
cl_uint *C = malloc(imgsize * sizeof(int));
ret = clEnqueueReadBuffer(cmd_queue, vec_B, CL_TRUE, 0, imgsize * sizeof(int), C,
0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
for (int i = 0; i < 16; i++) {
for (int j = 0; j < 16; j++) {
printf("%x", *(C + (16*i) + j));
}
printf("\n");
}
// Clean up
ret = clFlush(cmd_queue);
ret = clFinish(cmd_queue);
ret = clReleaseKernel(kern);
ret = clReleaseProgram(prog);
ret = clReleaseMemObject(vec_A);
ret = clReleaseMemObject(vec_B);
ret = clReleaseCommandQueue(cmd_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
但它没有用,所以我什至注释掉了读取的文件,向量 A 现在只包含垃圾。但是代码在以下行失败:
prog = clCreateProgramWithSource(context, 1, (const char **)&kernel,
(const size_t *)&kfilesize, &ret);
我有一个 Intel CPU 和 NVidia GPU,所以我尝试在两个平台上编译:
在英特尔:
PS E:\OpenCL_SoftISP\image_proc\ocldemo> gcc -g2 -I"C:\Program Files (x86)\IntelSWTools\system_studio_2020\OpenCL\sdk\include" -L"C:\Program Files (x86)\IntelSWTools\system_studio_2020\OpenCL\sdk\lib\x64" -DCL_TARGET_OPENCL_VERSION=120 src/improc.c -o improc -lOpenCL
PS E:\OpenCL_SoftISP\image_proc\ocldemo> .\improc.exe
当我 运行 使用 gdb 时,它在以下位置失败:
Thread 1 hit Breakpoint 1, main () at src\improc.c:81
81 prog = clCreateProgramWithSource(context, 1, (const char **)&kernel,
(gdb) c
Continuing.
Thread 1 received signal SIGSEGV, Segmentation fault.
0x00007ffcd81699ca in nvopencl64!cuGetExportTable () from C:\Windows\System32\DriverStore\FileRepository\nvamui.inf_amd64_5f5d5675b52c555b\nvopencl64.dll
在 Nvidia 上:
PS E:\OpenCL_SoftISP\image_proc\ocldemo> gcc -g2 -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6\include" -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6\lib\x64" -DCL_TARGET_OPENCL_VERSION=120 src/improc.c -o improc -lOpenCL
PS E:\OpenCL_SoftISP\image_proc\ocldemo> .\improc.exe
当我 运行 使用 GDB 时,它在以下位置中断:
Thread 1 hit Breakpoint 2, main () at src\improc.c:81
81 prog = clCreateProgramWithSource(context, 1, (const char **)&kernel,
(gdb) s
[New Thread 16688.0x4918]
Thread 1 received signal SIGSEGV, Segmentation fault.
0x00007ffcd81699ca in nvopencl64!cuGetExportTable () from C:\Windows\System32\DriverStore\FileRepository\nvamui.inf_amd64_5f5d5675b52c555b\nvopencl64.dll
现在有趣的部分是,如果我真的读取文件:
FILE *img = fopen("./data/testimage.raw", "rb");
fread(A, sizeof(cl_ushort), imgsize, img);
我实际上得到了不同的输出
Line 67 err = 0
Line 70 err = 0
Line 75 err = 0
Line 78 err = 0
Line 83 err = -6
Line 85 err = -44
Line 87 err = -44
Line 91 err = -48
Line 93 err = -48
Line 102 err = -48
Line 108 err = 0
我看到的错误-6是:
CL_OUT_OF_HOST_MEMORY -6
这仍然表明我一定在进行一些非法的内存访问。但我要疯了,因为它发生在哪里?
问题是您给它的地址 int kfilesize
而 clCreateProgramWithSource()
期望地址 size_t
。它最终从一个 4 字节整数的地址读取一个 8 字节整数。 IOW 它从堆栈中读取一些垃圾,并认为您正在提供一些(可能)billion-bytes 长源代码。
我是 OpenCL 的新手。我创建了一个简单的工作代码来执行 A = AB + C,并用它来完成我的实际任务,即读取原始图像文件并对其进行去拜耳处理。但是修改后的代码有时会在 clCreateProgramWithSource
处失败,错误代码为 -6。变化很小,我与差异进行了比较,但在我的一生中,我无法弄清楚为什么我在失败的代码中出错。所以这是工作代码:
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <string.h>
#include <CL/cl.h>
#define LIST_SIZE 1024
int main()
{
// We need an object to hold the program
cl_program prog = NULL;
// First step is to load the kernel in a local memory
FILE *kfile;
char *kfilename = "./src/mad_kernel.cl";
kfile = fopen(kfilename, "r");
fseek(kfile, 0L, SEEK_END);
int kfilesize = ftell(kfile) + 1;
rewind(kfile);
char *kernel = malloc(kfilesize);
memset(kernel, 0, kfilesize);
fread(kernel, 1, kfilesize, kfile);
fclose(kfile);
// printf("%s", kernel); // Test passed
// We want to get the platform ID
cl_platform_id pid = NULL;
// We need some variables for holding the returned values
cl_uint num_platforms;
cl_int ret = clGetPlatformIDs(1, &pid, &num_platforms);
printf("Line %d err = %d\n", __LINE__, ret);
// We need a cl device ID
cl_device_id dev_id = NULL;
cl_uint num_devices;
ret = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 1, &dev_id, &num_devices);
printf("Line %d err = %d\n", __LINE__, ret);
// Creating a queue to pass our tasks
// ------------------------------------------------------------------------
// We need a context for the cl application to run in
cl_context context = clCreateContext(NULL, 1, &dev_id, NULL, NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// We need a command queue to send our commands to
cl_command_queue cmd_queue = clCreateCommandQueue(context, dev_id, 0, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// ------------------------------------------------------------------------
// We need objects to hold the data
// We are going to perform A = A*B + C
int *A = malloc(LIST_SIZE * sizeof(int));
int *B = malloc(LIST_SIZE * sizeof(int));
int *C = malloc(LIST_SIZE * sizeof(int));
for (int i = 0; i < LIST_SIZE; i++) {
A[i] = 2;
B[i] = 3;
C[i] = 5;
}
cl_mem vec_A = clCreateBuffer(context, CL_MEM_READ_WRITE, LIST_SIZE*sizeof(int),
NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
cl_mem vec_B = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE*sizeof(int),
NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
cl_mem vec_C = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE*sizeof(int),
NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// Pass A, B and C to the CL memory
ret = clEnqueueWriteBuffer(cmd_queue, vec_A, CL_TRUE,
0, LIST_SIZE*sizeof(int), A, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clEnqueueWriteBuffer(cmd_queue, vec_B, CL_TRUE,
0, LIST_SIZE*sizeof(int), B, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clEnqueueWriteBuffer(cmd_queue, vec_C, CL_TRUE,
0, LIST_SIZE*sizeof(int), C, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
// We're ready to create the kernel now
prog = clCreateProgramWithSource(context, 1, (const char **)&kernel,
(const size_t *)&kfilesize, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clBuildProgram(prog, 1, &dev_id, NULL, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
cl_kernel kern = clCreateKernel(prog, "vector_mad", &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// Set the created memory as arguments to the kernel
ret = clSetKernelArg(kern, 0, sizeof(cl_mem), &vec_A);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clSetKernelArg(kern, 1, sizeof(cl_mem), &vec_B);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clSetKernelArg(kern, 2, sizeof(cl_mem), &vec_C);
printf("Line %d err = %d\n", __LINE__, ret);
// Tell the program size of the entire object
size_t global_item_size = LIST_SIZE;
// Each chunk size
size_t local_item_size = 64;
ret = clEnqueueNDRangeKernel(cmd_queue, kern, 1, NULL, &global_item_size,
&local_item_size, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
// Read memory back from CL
ret = clEnqueueReadBuffer(cmd_queue, vec_A, CL_TRUE, 0, LIST_SIZE * sizeof(int), A,
0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
int flag = 1;
for (int i = 0; i < LIST_SIZE; i++) {
// printf("%d\n", A[i]);
flag &= A[i] == 11;
}
if (flag) {
printf("Pass\n");
} else {
printf("Fail\n");
}
// Clean up
ret = clFlush(cmd_queue);
ret = clFinish(cmd_queue);
ret = clReleaseKernel(kern);
ret = clReleaseProgram(prog);
ret = clReleaseMemObject(vec_A);
ret = clReleaseMemObject(vec_B);
ret = clReleaseMemObject(vec_C);
ret = clReleaseCommandQueue(cmd_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
内核:
__kernel void vector_mad(__global int *A, __global int *B, __global int *C)
{
int i = get_global_id(0);
A[i] = A[i] * B[i] + C[i];
}
现在,我想读入其中一个缓冲区中的原始图像并进行初始测试,只需向输出 RGBA 右侧输出一个常量值,因此我将此程序修改为:
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <string.h>
#include <CL/cl.h>
int main()
{
// We need an object to hold the program
cl_program prog = NULL;
// First step is to load the kernel in a local memory
FILE *kfile;
char *kfilename = "./src/mad_kernel.cl";
kfile = fopen(kfilename, "r");
fseek(kfile, 0L, SEEK_END);
int kfilesize = ftell(kfile) + 1;
rewind(kfile);
char *kernel = malloc(kfilesize);
memset(kernel, 0, kfilesize);
fread(kernel, 1, kfilesize, kfile);
fclose(kfile);
// printf("%s", kernel); // Test passed
// We want to get the platform ID
cl_platform_id pid = NULL;
// We need some variables for holding the returned values
cl_uint num_platforms;
cl_int ret = clGetPlatformIDs(1, &pid, &num_platforms);
printf("Line %d err = %d\n", __LINE__, ret);
// We need a cl device ID
cl_device_id dev_id = NULL;
cl_uint num_devices;
ret = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 1, &dev_id, &num_devices);
printf("Line %d err = %d\n", __LINE__, ret);
// Creating a queue to pass our tasks
// ------------------------------------------------------------------------
// We need a context for the cl application to run in
cl_context context = clCreateContext(NULL, 1, &dev_id, NULL, NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// We need a command queue to send our commands to
cl_command_queue cmd_queue = clCreateCommandQueue(context, dev_id, 0, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// ------------------------------------------------------------------------
size_t imgsize = 16*16;
// We need objects to hold the data
// We are going to perform A = A*B + C
int *A = malloc(imgsize * sizeof(int));
int *B = malloc(imgsize * sizeof(int));
//FILE *img = fopen("./data/testimage.raw", "rb");
//fread(A, sizeof(cl_ushort), imgsize, img);
for (int i = 0; i < 16; i++) {
for (int j = 0; j < 16; j++) {
printf("%x ", *(A + (16*i) + j));
}
printf("\n");
}
cl_mem vec_A = clCreateBuffer(context, CL_MEM_READ_ONLY, imgsize*sizeof(int),
NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
cl_mem vec_B = clCreateBuffer(context, CL_MEM_WRITE_ONLY, imgsize*sizeof(int),
NULL, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// Pass A, B and C to the CL memory
ret = clEnqueueWriteBuffer(cmd_queue, vec_A, CL_TRUE,
0, imgsize * sizeof(int), A, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clEnqueueWriteBuffer(cmd_queue, vec_B, CL_TRUE,
0, imgsize * sizeof(int), B, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
// We're ready to create the kernel now
prog = clCreateProgramWithSource(context, 1, (const char **)&kernel,
(const size_t *)&kfilesize, &ret);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clBuildProgram(prog, 1, &dev_id, NULL, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
cl_kernel kern = clCreateKernel(prog, "vector_mad", &ret);
printf("Line %d err = %d\n", __LINE__, ret);
// Set the created memory as arguments to the kernel
ret = clSetKernelArg(kern, 0, sizeof(cl_mem), &vec_A);
printf("Line %d err = %d\n", __LINE__, ret);
ret = clSetKernelArg(kern, 1, sizeof(cl_mem), &vec_B);
printf("Line %d err = %d\n", __LINE__, ret);
// Tell the program size of the entire object
size_t global_item_size = imgsize;
// Each chunk size
size_t local_item_size = 64;
ret = clEnqueueNDRangeKernel(cmd_queue, kern, 1, NULL, &global_item_size,
&local_item_size, 0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
// Read memory back from CL
cl_uint *C = malloc(imgsize * sizeof(int));
ret = clEnqueueReadBuffer(cmd_queue, vec_B, CL_TRUE, 0, imgsize * sizeof(int), C,
0, NULL, NULL);
printf("Line %d err = %d\n", __LINE__, ret);
for (int i = 0; i < 16; i++) {
for (int j = 0; j < 16; j++) {
printf("%x", *(C + (16*i) + j));
}
printf("\n");
}
// Clean up
ret = clFlush(cmd_queue);
ret = clFinish(cmd_queue);
ret = clReleaseKernel(kern);
ret = clReleaseProgram(prog);
ret = clReleaseMemObject(vec_A);
ret = clReleaseMemObject(vec_B);
ret = clReleaseCommandQueue(cmd_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
但它没有用,所以我什至注释掉了读取的文件,向量 A 现在只包含垃圾。但是代码在以下行失败:
prog = clCreateProgramWithSource(context, 1, (const char **)&kernel,
(const size_t *)&kfilesize, &ret);
我有一个 Intel CPU 和 NVidia GPU,所以我尝试在两个平台上编译: 在英特尔:
PS E:\OpenCL_SoftISP\image_proc\ocldemo> gcc -g2 -I"C:\Program Files (x86)\IntelSWTools\system_studio_2020\OpenCL\sdk\include" -L"C:\Program Files (x86)\IntelSWTools\system_studio_2020\OpenCL\sdk\lib\x64" -DCL_TARGET_OPENCL_VERSION=120 src/improc.c -o improc -lOpenCL
PS E:\OpenCL_SoftISP\image_proc\ocldemo> .\improc.exe
当我 运行 使用 gdb 时,它在以下位置失败:
Thread 1 hit Breakpoint 1, main () at src\improc.c:81
81 prog = clCreateProgramWithSource(context, 1, (const char **)&kernel,
(gdb) c
Continuing.
Thread 1 received signal SIGSEGV, Segmentation fault.
0x00007ffcd81699ca in nvopencl64!cuGetExportTable () from C:\Windows\System32\DriverStore\FileRepository\nvamui.inf_amd64_5f5d5675b52c555b\nvopencl64.dll
在 Nvidia 上:
PS E:\OpenCL_SoftISP\image_proc\ocldemo> gcc -g2 -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6\include" -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6\lib\x64" -DCL_TARGET_OPENCL_VERSION=120 src/improc.c -o improc -lOpenCL
PS E:\OpenCL_SoftISP\image_proc\ocldemo> .\improc.exe
当我 运行 使用 GDB 时,它在以下位置中断:
Thread 1 hit Breakpoint 2, main () at src\improc.c:81
81 prog = clCreateProgramWithSource(context, 1, (const char **)&kernel,
(gdb) s
[New Thread 16688.0x4918]
Thread 1 received signal SIGSEGV, Segmentation fault.
0x00007ffcd81699ca in nvopencl64!cuGetExportTable () from C:\Windows\System32\DriverStore\FileRepository\nvamui.inf_amd64_5f5d5675b52c555b\nvopencl64.dll
现在有趣的部分是,如果我真的读取文件:
FILE *img = fopen("./data/testimage.raw", "rb");
fread(A, sizeof(cl_ushort), imgsize, img);
我实际上得到了不同的输出
Line 67 err = 0
Line 70 err = 0
Line 75 err = 0
Line 78 err = 0
Line 83 err = -6
Line 85 err = -44
Line 87 err = -44
Line 91 err = -48
Line 93 err = -48
Line 102 err = -48
Line 108 err = 0
我看到的错误-6是:
CL_OUT_OF_HOST_MEMORY -6
这仍然表明我一定在进行一些非法的内存访问。但我要疯了,因为它发生在哪里?
问题是您给它的地址 int kfilesize
而 clCreateProgramWithSource()
期望地址 size_t
。它最终从一个 4 字节整数的地址读取一个 8 字节整数。 IOW 它从堆栈中读取一些垃圾,并认为您正在提供一些(可能)billion-bytes 长源代码。