OpenCL CL_INVALID_VALUE clEnqueueWriteBuffer 错误
OpenCL CL_INVALID_VALUE Error on clEnqueueWriteBuffer
我正在尝试让一个算法在 OpenCL 上运行。我使用这个存储库(Source.cpp)作为模板。现在我想把整个程序从 float 类型改为 long 类型。但是第二个 clEnqueueWriteBuffer 调用总是返回 CL_INVALID_VALUE(-30)。我已经花了几个小时也没有找到错误,所以也许我忽略了一些很明显的东西(我接触 OpenCL 的时间还不长……)?
我的代码(不工作)
#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>
#include <CL/cl.h>
//#define DATA_SIZE 1024
#define DATA_SIZE 1024
using namespace std;
//$ /f/Tools/OCL_SDK_Light/lib/x86_64/opencl.lib blelloch_scan.cpp
// OpenCL C source of the "add" kernel, kept as a single string literal that main() hands to
// clCreateProgramWithSource. Per work-item (thid = get_global_id(0)) the kernel:
//   1. copies input[2*thid] and input[2*thid+1] into temp,
//   2. runs a halving loop (d = size/2 .. 1) that adds temp[ai] into temp[bi],
//   3. sets temp[size-1] to 0,
//   4. runs a doubling loop (d = 1 .. size/2) that swaps/adds temp[ai] and temp[bi],
//   5. copies its two temp elements into output.
// barrier(CLK_GLOBAL_MEM_FENCE) is called between the steps.
// NOTE(review): the printf call inside the kernel passes '%d' in single quotes, i.e. a
// character constant where a string literal is expected.
const char* ProgramSource =
"__kernel void add(__global long *input, __global long *output, __global long *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"printf('%d',thid); \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid+1] = input[2*thid+1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid + 1)-1; \n"\
"int bi = offset*(2*thid + 2)-1; \n"\
"temp[bi] += temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; \n"\
"long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; } \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid+1] = temp[2*thid+1]; \n"\
"}\n"\
"\n";
/*
*/
// Host program: picks the first GPU of the first OpenCL platform, builds the "add" kernel
// from ProgramSource, runs it on DATA_SIZE work-items and prints results[1..7]; afterwards
// it computes a sequential prefix sum of arr on the host and prints its first 8 entries.
// Returns 1 if the kernel fails to build, 0 otherwise; failed asserts abort the program.
int main(void)
{
cl_context context;
// NOTE(review): properties is filled in further down but never passed to clCreateContext.
cl_context_properties properties[3];
cl_command_queue command_queue;
cl_kernel kernel;
cl_program program;
// NOTE(review): err receives the status of the create calls below but is never read.
cl_int err;
cl_uint num_platforms = 0;
cl_platform_id* platforms;
// NOTE(review): device_id and num_of_devices are declared but not used in this function.
cl_device_id device_id;
cl_uint num_of_devices = 0;
cl_mem inputA, inputB, output;
size_t global, loc;
std::cout << "Setup \n";
long arr[DATA_SIZE];
long inputDataA[DATA_SIZE];
long results[2 * DATA_SIZE];
long i;
// Fills indices 0 .. DATA_SIZE-3 with the values 1 .. DATA_SIZE-2.
// NOTE(review): the last two elements of inputDataA and arr are never assigned, yet the
// whole inputDataA array is uploaded and the whole arr array is summed later on.
for (i = 1; i < DATA_SIZE - 1;i++)
{
inputDataA[i-1] = (long)i;
arr[i-1] = (long)i;
}
clock_t ends;
/* --------------------- Get platform ---------------------*/
// First call only queries the platform count, second call fills the allocated array.
cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
assert(clResult == CL_SUCCESS);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
assert(clResult == CL_SUCCESS);
/* --------------------- ------------ ---------------------*/
/* --------------------- Get devices ---------------------*/
cl_device_id* devices = NULL;
cl_uint num_devices;
clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
assert(clResult == CL_SUCCESS);
// NOTE(review): the allocation is sized with num_platforms, while the call below asks for
// num_devices entries - looks like it should be num_devices; confirm.
devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);
if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
{
printf("could not find device id");
}
// NOTE(review): clResult still holds the status of the first clGetDeviceIDs call; the call
// inside the if above does not update it, so this assert cannot catch its failure.
assert(clResult == CL_SUCCESS);
/* --------------------- ----------- ---------------------*/
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = 0;
cl_int contextResult;
// Context for devices[0] only; the properties argument is NULL.
context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
assert(contextResult == CL_SUCCESS);
// create command queue using the context and device
command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building program\n");
return 1;
}
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);
// create buffers for the input and output
// NOTE(review): result is unused, and errcode_ret is NULL for all three clCreateBuffer calls.
// NOTE(review): buffer sizes use the host's sizeof(long) while the kernel declares OpenCL C
// long pointers - confirm both have the same width on the target platform.
cl_int result;
inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
// The next two writes pass 0 as the host pointer argument, i.e. there is no host memory
// to copy from; the first of them is the call marked "ERROR HERE".
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE , 0, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // ERROR HERE
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, 0, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
// Passed to the kernel as its int "size" argument.
int temp = DATA_SIZE;
clock_t start = clock();
// set the argument list for the kernel command
// Kernel signature is (input, output, temp, size): inputB serves as the temp buffer.
clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
assert(clResult == CL_SUCCESS);
// NOTE(review): DATA_SIZE work-items are launched in groups of 256, and the kernel indexes
// elements 2*thid and 2*thid+1 of buffers that hold DATA_SIZE elements - verify the index range.
global = DATA_SIZE; // num of processors
loc = 256;
printf("\n>> start parallel ---------- \n");
// enqueue the kernel command for execution
clResult = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
// copy the results from out of the output buffer
clResult = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, results, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clFinish(command_queue);
ends = clock();
// print the results
// NOTE(review): results[k] is a long but is printed with %d.
int k = 1;
for (k = 1;k < 8; k++)
{
printf("%d - ", k);
printf("%d \n", results[k]);
}
double time_taken = ((double)(ends - start)) / CLK_TCK;
printf("\n>>finished parallel in %lf seconds\n", time_taken);
// cleanup - release OpenCL resources
printf("\n-------------------------------------\n");
/* -------sequential ------- */
printf("\n>> start sequential ---------- \n");
// Host reference: running sum, prefixSum[idx] = arr[0] + ... + arr[idx].
long prefixSum[DATA_SIZE] = { 0 };
const clock_t startSequential = clock();
prefixSum[0] = arr[0];
long idx = 1;
for (idx = 1; idx < DATA_SIZE; idx++) {
prefixSum[idx] = prefixSum[idx - 1] + arr[idx];
}
const clock_t endSequential = clock();
double seqTime = ((double)(endSequential - startSequential)) / CLK_TCK;
printf("\n>> finished sequential in %lf\n", seqTime);
// NOTE(review): prefixSum[j] is a long but is printed with %d.
for (int j = 0;j < 8; j++)
{
printf("%d - ", j);
printf("%d \n", prefixSum[j]);
}
// NOTE(review): the malloc'ed platforms and devices arrays are not freed before returning.
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
存储库代码(有效):
#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>
#include <CL/cl.h>
#define DATA_SIZE 1024
using namespace std;
ofstream outfile;
// OpenCL C source of the float "add" kernel, kept as a single string literal that main()
// hands to clCreateProgramWithSource. Per work-item (thid = get_global_id(0)) the kernel:
//   1. copies input[2*thid] and input[2*thid+1] into temp,
//   2. runs a halving loop (d = size/2 .. 1) that adds temp[ai] into temp[bi],
//   3. sets temp[size-1] to 0,
//   4. runs a doubling loop (d = 1 .. size/2) that swaps/adds temp[ai] and temp[bi],
//   5. copies its two temp elements into output.
// barrier(CLK_GLOBAL_MEM_FENCE) is called between the steps.
const char* ProgramSource =
"__kernel void add(__global float *input, __global float *output, __global float *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid+1] = input[2*thid+1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid + 1)-1; \n"\
"int bi = offset*(2*thid + 2)-1; \n"\
"temp[bi] += temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; \n"\
"float t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; } \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid+1] = temp[2*thid+1]; \n"\
"}\n"\
"\n";
/*
*/
// Host program (float version): picks the first GPU of the first OpenCL platform, builds the
// "add" kernel from ProgramSource, runs it on DATA_SIZE work-items, prints the first five
// results to stdout and writes them plus the elapsed time to "shubham.txt".
// Returns 1 if the kernel fails to build, 0 otherwise.
int main(void)
{
cl_uint num_platforms = 0;
cl_context context;
// NOTE(review): properties is filled in further down but never passed to clCreateContext.
cl_context_properties properties[3];
cl_kernel kernel;
cl_platform_id* platforms;
cl_command_queue command_queue;
cl_program program;
// NOTE(review): err receives the status of the create calls below but is never read.
cl_int err;
// NOTE(review): num_of_platforms, platform_id, device_id and num_of_devices are declared
// but not used in this function.
cl_uint num_of_platforms = 0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices = 0;
cl_mem inputA, inputB, output;
outfile.open("shubham.txt");
size_t global, loc;
float inputDataA[DATA_SIZE];
float results[2 * DATA_SIZE] = { 0 };
int i;
// Input is simply 0, 1, 2, ... DATA_SIZE-1.
for (i = 0; i < DATA_SIZE;i++)
{
inputDataA[i] = (float)i;
}
clock_t start, ends;
/* --------------------- Get platform ---------------------*/
// First call only queries the platform count, second call fills the allocated array.
cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
assert(clResult == CL_SUCCESS);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
assert(clResult == CL_SUCCESS);
/* --------------------- ------------ ---------------------*/
/* --------------------- Get devices ---------------------*/
cl_device_id* devices = NULL;
cl_uint num_devices;
clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
assert(clResult == CL_SUCCESS);
// NOTE(review): the allocation is sized with num_platforms, while the call below asks for
// num_devices entries - looks like it should be num_devices; confirm.
devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);
if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
{
printf("could not find device id");
}
// NOTE(review): clResult still holds the status of the first clGetDeviceIDs call; the call
// inside the if above does not update it, so this assert cannot catch its failure.
assert(clResult == CL_SUCCESS);
/* --------------------- ----------- ---------------------*/
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = 0;
cl_int contextResult;
// Context for devices[0] only; the properties argument is NULL.
context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
assert(contextResult == CL_SUCCESS);
// create command queue using the context and device
// create command queue using the context and device
command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building program\n");
return 1;
}
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);
// create buffers for the input and output (errcode_ret is NULL for all three calls)
inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
// NOTE(review): the return values of these three writes are discarded, so a failure would
// go unnoticed; the second and third call pass 0 as the host pointer argument.
clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
// Passed to the kernel as its int "size" argument.
int temp = DATA_SIZE;
start = clock();
// set the argument list for the kernel command
// Kernel signature is (input, output, temp, size): inputB serves as the temp buffer.
clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
assert(clResult == CL_SUCCESS);
// NOTE(review): DATA_SIZE work-items are launched in groups of 256, and the kernel indexes
// elements 2*thid and 2*thid+1 of buffers that hold DATA_SIZE elements - verify the index range.
global = DATA_SIZE;
loc = 256;
// enqueue the kernel command for execution (return value not checked)
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
clFinish(command_queue);
// copy the results from out of the output buffer (return value not checked)
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
//clEnqueueReadBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) *16, shubh, 0, NULL, NULL);
// print the results
printf("output: ");
for (i = 0;i < 5; i++)
{
printf("%f \n", results[i]);
outfile << results[i] << " ";
}
// The end timestamp is taken after the print loop, so the measured time includes the output.
ends = clock();
double time_taken = ((double)(ends - start)) / CLK_TCK;
outfile << endl << "Time taken is : " << time_taken << endl;
// NOTE(review): the malloc'ed platforms and devices arrays are not freed before returning.
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
提前致谢
我发现了你的错误。在我这里,OpenCL C 代码一开始就无法通过编译,所以我用下面的代码进行调试:
char info[1024];
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 1024*sizeof(char), (void*)info, NULL); // print build log
printf(info);
获取构建日志:
<kernel>:4:8: warning: multi-character character constant
printf('0',thid);
^
<kernel>:4:8: warning: incompatible integer to pointer conversion passing 'int' to parameter of type '__constant char *'
printf('190',thid);
^~~~
cl_kernel.h:4694:32: note: passing argument to parameter here
printf(constant char * restrict, ...) __asm("llvm.nvvm.internal.printf.cl");
问题似乎在于这里用了 ' 而不是 \"。把这一行改为:
"printf(\"%d\",thid); \n"
然后 OpenCL C 代码编译,我可以重现 CL_INVALID_VALUE 错误。
问题出在这里:clEnqueueWriteBuffer 的作用是把主机内存中的数据复制到设备缓冲区,而您在写入 inputB(以及 output)时传入的主机指针是 0(空指针)。您需要添加 C++ 数组作为复制的数据来源:
// Host-side arrays backing the three device buffers. They are zero-initialised so that no
// indeterminate stack contents are uploaded by clEnqueueWriteBuffer.
long inputDataA[DATA_SIZE] = { 0 };
long inputDataB[DATA_SIZE] = { 0 };
long outputData[DATA_SIZE] = { 0 };
和
// Upload one host array into each device buffer (CL_TRUE is passed as the blocking flag);
// every call now receives a real host array as its source pointer.
clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataB, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // WORKS NOW
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, outputData, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
然后它工作了,我得到了这个输出:
Setup
0
>> start parallel ----------
25625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035101234567891011121314151617181920212223242526272829303138438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441564656667686970717273747576777879808182838485868788899091929394953523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823839697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612741641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863932333435363738394041424344454647484950515253545556575859606162634484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784798328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628635445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745751921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222234804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105118648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948955125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425431281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581598008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308315765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066072242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542
559609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909916406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706711601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901917687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987997367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667679929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210236726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027039289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589597047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347358968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269271 - 0
2 - 1
3 - 2
4 - 4
5 - 6
6 - 9
7 - 12
>>finished parallel in 0.023000 seconds
-------------------------------------
>> start sequential ----------
>> finished sequential in 0.000000
0 - 1
1 - 3
2 - 6
3 - 10
4 - 15
5 - 21
6 - 28
7 - 36
另请注意,在 OpenCL C 中,long 是 64 位整数,但在 C++ 中,long 只保证“至少”32 位(不知出于什么原因;例如在 64 位 Windows 上它就是 32 位)。因此在 C++ 主机代码中,您应该使用 long long int(至少 64 位,在常见平台上正好是 64 位),或者直接使用固定宽度的 int64_t。例如,您可以使用 typedef int64_t slong;,其中 int64_t 本身通常是 long long int 的类型定义。
另一个问题是程序不是确定性的:多次执行时,每次都会得到不同的结果,说明存在竞争条件。我想您误以为 barrier(CLK_GLOBAL_MEM_FENCE); 会对所有线程进行全局同步,但事实并非如此——barrier 只同步同一个工作组(work-group)内的线程。要实现全局同步,唯一的办法是在需要同步的位置把内核拆分成多个内核,并一个接一个地执行。
最后,为了使 C++ 中的 OpenCL 开发更容易,并防止在这种简单的错误上浪费时间,我创建了一个轻量级 OpenCL-Wrapper 来消除所有 OpenCL 代码开销。有了这个,你的代码就短了 4 倍并且更容易理解:
#include "opencl.hpp"
#define DATA_SIZE 1024
int main() {
Clock clock;
clock.start();
Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device
Memory<slong> arr(device, DATA_SIZE, 1u, true, false);
Memory<slong> inputA(device, DATA_SIZE);
Memory<slong> inputB(device, DATA_SIZE);
Memory<slong> output(device, DATA_SIZE);
for(int i=1; i<DATA_SIZE-1; i++) {
inputA[i-1] = (slong)i;
arr[i-1] = (slong)i;
}
inputA.write_to_device();
Kernel kernel(device, DATA_SIZE, "add", inputA, output, inputB);
kernel.add_constants(DATA_SIZE);
kernel.run();
output.read_from_device();
double time_taken = clock.stop();
// print the results
for(int k=1; k<8; k++) {
printf("%d - ", k);
printf("%d \n", output[k]);
}
printf("\n>>finished parallel in %lf seconds\n", time_taken);
printf("\n-------------------------------------\n");
printf("\n>> start sequential ---------- \n");
long prefixSum[DATA_SIZE] = { 0 };
clock.start();
prefixSum[0] = arr[0];
for(long idx=1; idx<DATA_SIZE; idx++) {
prefixSum[idx] = prefixSum[idx-1]+arr[idx];
}
double seqTime = clock.stop();
printf("\n>> finished sequential in %lf\n", seqTime);
for(int j=0; j<8; j++) {
printf("%d - ", j);
printf("%d \n", prefixSum[j]);
}
wait();
return 0;
}
#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R(
// Returns the OpenCL C source of the "add" kernel as a string; the kernel text is wrapped
// in the R macro. Per work-item the kernel copies two input elements into temp, runs a
// halving loop and a doubling loop over temp with barriers in between, and copies its two
// temp elements into output.
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################
kernel void add(__global long* input, __global long* output, __global long* temp, int size) {
int thid = get_global_id(0);
int offset = 1;
printf("%d",thid);
// each work-item loads elements 2*thid and 2*thid+1
temp[2*thid] = input[2*thid];
temp[2*thid+1] = input[2*thid+1];
// halving loop: d runs size/2, size/4, ... 1 while offset doubles
for(int d= size>>1; d>0; d >>= 1) {
barrier(CLK_GLOBAL_MEM_FENCE);
if(thid < d) {
int ai = offset*(2*thid + 1)-1;
int bi = offset*(2*thid + 2)-1;
temp[bi] += temp[ai];
}
offset = offset*2;
}
// last element is cleared before the second loop
temp[size-1] = 0;
barrier(CLK_GLOBAL_MEM_FENCE);
// doubling loop: d runs 1, 2, ... size/2 while offset halves
for(int d = 1; d<size; d *= 2) {
offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE);
if(thid < d) {
int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1;
long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t;
}
}
barrier(CLK_GLOBAL_MEM_FENCE);
// each work-item stores its two elements
output[2*thid] = temp[2*thid];
output[2*thid+1] = temp[2*thid+1];
}
);} // ############################################################### end of OpenCL C code #####################################################################
我正在尝试让一个算法在 OpenCL 上运行。我使用这个存储库(Source.cpp)作为模板。现在我想把整个程序从 float 类型改为 long 类型。但是第二个 clEnqueueWriteBuffer 调用总是返回 CL_INVALID_VALUE(-30)。我已经花了几个小时也没有找到错误,所以也许我忽略了一些很明显的东西(我接触 OpenCL 的时间还不长……)?
我的代码(不工作)
#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>
#include <CL/cl.h>
//#define DATA_SIZE 1024
#define DATA_SIZE 1024
using namespace std;
//$ /f/Tools/OCL_SDK_Light/lib/x86_64/opencl.lib blelloch_scan.cpp
// OpenCL C source of the "add" kernel, kept as a single string literal that main() hands to
// clCreateProgramWithSource. Per work-item (thid = get_global_id(0)) the kernel:
//   1. copies input[2*thid] and input[2*thid+1] into temp,
//   2. runs a halving loop (d = size/2 .. 1) that adds temp[ai] into temp[bi],
//   3. sets temp[size-1] to 0,
//   4. runs a doubling loop (d = 1 .. size/2) that swaps/adds temp[ai] and temp[bi],
//   5. copies its two temp elements into output.
// barrier(CLK_GLOBAL_MEM_FENCE) is called between the steps.
// NOTE(review): the printf call inside the kernel passes '%d' in single quotes, i.e. a
// character constant where a string literal is expected.
const char* ProgramSource =
"__kernel void add(__global long *input, __global long *output, __global long *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"printf('%d',thid); \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid+1] = input[2*thid+1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid + 1)-1; \n"\
"int bi = offset*(2*thid + 2)-1; \n"\
"temp[bi] += temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; \n"\
"long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; } \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid+1] = temp[2*thid+1]; \n"\
"}\n"\
"\n";
/*
*/
// Host program: picks the first GPU of the first OpenCL platform, builds the "add" kernel
// from ProgramSource, runs it on DATA_SIZE work-items and prints results[1..7]; afterwards
// it computes a sequential prefix sum of arr on the host and prints its first 8 entries.
// Returns 1 if the kernel fails to build, 0 otherwise; failed asserts abort the program.
int main(void)
{
cl_context context;
// NOTE(review): properties is filled in further down but never passed to clCreateContext.
cl_context_properties properties[3];
cl_command_queue command_queue;
cl_kernel kernel;
cl_program program;
// NOTE(review): err receives the status of the create calls below but is never read.
cl_int err;
cl_uint num_platforms = 0;
cl_platform_id* platforms;
// NOTE(review): device_id and num_of_devices are declared but not used in this function.
cl_device_id device_id;
cl_uint num_of_devices = 0;
cl_mem inputA, inputB, output;
size_t global, loc;
std::cout << "Setup \n";
long arr[DATA_SIZE];
long inputDataA[DATA_SIZE];
long results[2 * DATA_SIZE];
long i;
// Fills indices 0 .. DATA_SIZE-3 with the values 1 .. DATA_SIZE-2.
// NOTE(review): the last two elements of inputDataA and arr are never assigned, yet the
// whole inputDataA array is uploaded and the whole arr array is summed later on.
for (i = 1; i < DATA_SIZE - 1;i++)
{
inputDataA[i-1] = (long)i;
arr[i-1] = (long)i;
}
clock_t ends;
/* --------------------- Get platform ---------------------*/
// First call only queries the platform count, second call fills the allocated array.
cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
assert(clResult == CL_SUCCESS);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
assert(clResult == CL_SUCCESS);
/* --------------------- ------------ ---------------------*/
/* --------------------- Get devices ---------------------*/
cl_device_id* devices = NULL;
cl_uint num_devices;
clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
assert(clResult == CL_SUCCESS);
// NOTE(review): the allocation is sized with num_platforms, while the call below asks for
// num_devices entries - looks like it should be num_devices; confirm.
devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);
if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
{
printf("could not find device id");
}
// NOTE(review): clResult still holds the status of the first clGetDeviceIDs call; the call
// inside the if above does not update it, so this assert cannot catch its failure.
assert(clResult == CL_SUCCESS);
/* --------------------- ----------- ---------------------*/
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = 0;
cl_int contextResult;
// Context for devices[0] only; the properties argument is NULL.
context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
assert(contextResult == CL_SUCCESS);
// create command queue using the context and device
command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building program\n");
return 1;
}
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);
// create buffers for the input and output
// NOTE(review): result is unused, and errcode_ret is NULL for all three clCreateBuffer calls.
// NOTE(review): buffer sizes use the host's sizeof(long) while the kernel declares OpenCL C
// long pointers - confirm both have the same width on the target platform.
cl_int result;
inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
// The next two writes pass 0 as the host pointer argument, i.e. there is no host memory
// to copy from; the first of them is the call marked "ERROR HERE".
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE , 0, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // ERROR HERE
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, 0, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
// Passed to the kernel as its int "size" argument.
int temp = DATA_SIZE;
clock_t start = clock();
// set the argument list for the kernel command
// Kernel signature is (input, output, temp, size): inputB serves as the temp buffer.
clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
assert(clResult == CL_SUCCESS);
// NOTE(review): DATA_SIZE work-items are launched in groups of 256, and the kernel indexes
// elements 2*thid and 2*thid+1 of buffers that hold DATA_SIZE elements - verify the index range.
global = DATA_SIZE; // num of processors
loc = 256;
printf("\n>> start parallel ---------- \n");
// enqueue the kernel command for execution
clResult = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
// copy the results from out of the output buffer
clResult = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, results, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clFinish(command_queue);
ends = clock();
// print the results
// NOTE(review): results[k] is a long but is printed with %d.
int k = 1;
for (k = 1;k < 8; k++)
{
printf("%d - ", k);
printf("%d \n", results[k]);
}
double time_taken = ((double)(ends - start)) / CLK_TCK;
printf("\n>>finished parallel in %lf seconds\n", time_taken);
// cleanup - release OpenCL resources
printf("\n-------------------------------------\n");
/* -------sequential ------- */
printf("\n>> start sequential ---------- \n");
// Host reference: running sum, prefixSum[idx] = arr[0] + ... + arr[idx].
long prefixSum[DATA_SIZE] = { 0 };
const clock_t startSequential = clock();
prefixSum[0] = arr[0];
long idx = 1;
for (idx = 1; idx < DATA_SIZE; idx++) {
prefixSum[idx] = prefixSum[idx - 1] + arr[idx];
}
const clock_t endSequential = clock();
double seqTime = ((double)(endSequential - startSequential)) / CLK_TCK;
printf("\n>> finished sequential in %lf\n", seqTime);
// NOTE(review): prefixSum[j] is a long but is printed with %d.
for (int j = 0;j < 8; j++)
{
printf("%d - ", j);
printf("%d \n", prefixSum[j]);
}
// NOTE(review): the malloc'ed platforms and devices arrays are not freed before returning.
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
存储库代码(有效):
#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>
#include <CL/cl.h>
#define DATA_SIZE 1024
using namespace std;
ofstream outfile;
// OpenCL C source of the float "add" kernel, kept as a single string literal that main()
// hands to clCreateProgramWithSource. Per work-item (thid = get_global_id(0)) the kernel:
//   1. copies input[2*thid] and input[2*thid+1] into temp,
//   2. runs a halving loop (d = size/2 .. 1) that adds temp[ai] into temp[bi],
//   3. sets temp[size-1] to 0,
//   4. runs a doubling loop (d = 1 .. size/2) that swaps/adds temp[ai] and temp[bi],
//   5. copies its two temp elements into output.
// barrier(CLK_GLOBAL_MEM_FENCE) is called between the steps.
const char* ProgramSource =
"__kernel void add(__global float *input, __global float *output, __global float *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid+1] = input[2*thid+1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid + 1)-1; \n"\
"int bi = offset*(2*thid + 2)-1; \n"\
"temp[bi] += temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; \n"\
"float t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; } \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid+1] = temp[2*thid+1]; \n"\
"}\n"\
"\n";
/*
*/
// Host program (float version): picks the first GPU of the first OpenCL platform, builds the
// "add" kernel from ProgramSource, runs it on DATA_SIZE work-items, prints the first five
// results to stdout and writes them plus the elapsed time to "shubham.txt".
// Returns 1 if the kernel fails to build, 0 otherwise.
int main(void)
{
cl_uint num_platforms = 0;
cl_context context;
// NOTE(review): properties is filled in further down but never passed to clCreateContext.
cl_context_properties properties[3];
cl_kernel kernel;
cl_platform_id* platforms;
cl_command_queue command_queue;
cl_program program;
// NOTE(review): err receives the status of the create calls below but is never read.
cl_int err;
// NOTE(review): num_of_platforms, platform_id, device_id and num_of_devices are declared
// but not used in this function.
cl_uint num_of_platforms = 0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices = 0;
cl_mem inputA, inputB, output;
outfile.open("shubham.txt");
size_t global, loc;
float inputDataA[DATA_SIZE];
float results[2 * DATA_SIZE] = { 0 };
int i;
// Input is simply 0, 1, 2, ... DATA_SIZE-1.
for (i = 0; i < DATA_SIZE;i++)
{
inputDataA[i] = (float)i;
}
clock_t start, ends;
/* --------------------- Get platform ---------------------*/
// First call only queries the platform count, second call fills the allocated array.
cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
assert(clResult == CL_SUCCESS);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
assert(clResult == CL_SUCCESS);
/* --------------------- ------------ ---------------------*/
/* --------------------- Get devices ---------------------*/
cl_device_id* devices = NULL;
cl_uint num_devices;
clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
assert(clResult == CL_SUCCESS);
// NOTE(review): the allocation is sized with num_platforms, while the call below asks for
// num_devices entries - looks like it should be num_devices; confirm.
devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);
if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
{
printf("could not find device id");
}
// NOTE(review): clResult still holds the status of the first clGetDeviceIDs call; the call
// inside the if above does not update it, so this assert cannot catch its failure.
assert(clResult == CL_SUCCESS);
/* --------------------- ----------- ---------------------*/
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = 0;
cl_int contextResult;
// Context for devices[0] only; the properties argument is NULL.
context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
assert(contextResult == CL_SUCCESS);
// create command queue using the context and device
// create command queue using the context and device
command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building program\n");
return 1;
}
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);
// create buffers for the input and output (errcode_ret is NULL for all three calls)
inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
// NOTE(review): the return values of these three writes are discarded, so a failure would
// go unnoticed; the second and third call pass 0 as the host pointer argument.
clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
// Passed to the kernel as its int "size" argument.
int temp = DATA_SIZE;
start = clock();
// set the argument list for the kernel command
// Kernel signature is (input, output, temp, size): inputB serves as the temp buffer.
clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
assert(clResult == CL_SUCCESS);
// NOTE(review): DATA_SIZE work-items are launched in groups of 256, and the kernel indexes
// elements 2*thid and 2*thid+1 of buffers that hold DATA_SIZE elements - verify the index range.
global = DATA_SIZE;
loc = 256;
// enqueue the kernel command for execution (return value not checked)
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
clFinish(command_queue);
// copy the results from out of the output buffer (return value not checked)
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
//clEnqueueReadBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) *16, shubh, 0, NULL, NULL);
// print the results
printf("output: ");
for (i = 0;i < 5; i++)
{
printf("%f \n", results[i]);
outfile << results[i] << " ";
}
// The end timestamp is taken after the print loop, so the measured time includes the output.
ends = clock();
double time_taken = ((double)(ends - start)) / CLK_TCK;
outfile << endl << "Time taken is : " << time_taken << endl;
// NOTE(review): the malloc'ed platforms and devices arrays are not freed before returning.
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
提前致谢
我发现了你的错误。对我来说,OpenCL C 代码一开始就无法通过编译,所以我使用以下代码进行调试:
char info[1024];
// Copy the build log of `program` for the first device into `info`.
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 1024*sizeof(char), (void*)info, NULL); // print build log
// Print the log as data, never as the format string itself: the log quotes
// the kernel source, so it can contain conversion specifiers such as "%d"
// that printf(info) would try to expand using nonexistent arguments.
printf("%s", info);
获取构建日志:
<kernel>:4:8: warning: multi-character character constant
printf('0',thid);
^
<kernel>:4:8: warning: incompatible integer to pointer conversion passing 'int' to parameter of type '__constant char *'
printf('190',thid);
^~~~
cl_kernel.h:4694:32: note: passing argument to parameter here
printf(constant char * restrict, ...) __asm("llvm.nvvm.internal.printf.cl");
问题似乎在于格式字符串使用了单引号 ' 而不是转义的双引号 \" 。请将该行改为:
"printf(\"%d\",thid); \n"
然后 OpenCL C 代码编译,我可以重现 CL_INVALID_VALUE 错误。
问题出在这里:您调用 clEnqueueWriteBuffer 向 inputB(以及 output)写入数据时,主机端的源指针传的是 0(空指针)。您需要添加 C++ 数组作为数据来源:
long inputDataA[DATA_SIZE];
long inputDataB[DATA_SIZE];
long outputData[DATA_SIZE];
和
clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataB, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // WORKS NOW
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, outputData, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
然后它工作了,我得到了这个输出:
Setup
0
>> start parallel ----------
25625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035101234567891011121314151617181920212223242526272829303138438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441564656667686970717273747576777879808182838485868788899091929394953523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823839697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612741641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863932333435363738394041424344454647484950515253545556575859606162634484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784798328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628635445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745751921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222234804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105118648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948955125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425431281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581598008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308315765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066072242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542
559609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909916406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706711601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901917687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987997367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667679929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210236726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027039289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589597047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347358968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269271 - 0
2 - 1
3 - 2
4 - 4
5 - 6
6 - 9
7 - 12
>>finished parallel in 0.023000 seconds
-------------------------------------
>> start sequential ----------
>> finished sequential in 0.000000
0 - 1
1 - 3
2 - 6
3 - 10
4 - 15
5 - 21
6 - 28
7 - 36
另请注意,在 OpenCL C 中,long 固定为 64 位整数,而在 C++ 中,long 只保证“至少”32 位(不管出于什么愚蠢的原因;例如在 Windows 上它就是 32 位)。因此在 C++ 中,您应该使用 long long int,它保证至少 64 位,在主流平台上恰好是 64 位整数。例如,您可以使用 typedef int64_t slong; ,其中 int64_t 在多数平台上本身就是 long long int。
另一个问题是程序的结果不是确定性的:多次执行时,每次都会得到不同的结果,说明一定存在竞争条件。我想您误以为 barrier(CLK_GLOBAL_MEM_FENCE);
能够对所有线程进行全局同步,但事实并非如此:barrier 只同步同一个工作组(work-group)内的线程。要实现全局同步,唯一的办法是在每个需要同步的位置把内核拆分成多个内核,并一个接一个地执行。
最后,为了使 C++ 中的 OpenCL 开发更容易,并避免在这类简单错误上浪费时间,我创建了一个轻量级的 OpenCL-Wrapper,用来消除所有 OpenCL 样板代码。使用它之后,你的代码长度只有原来的约四分之一,而且更容易理解:
#include "opencl.hpp"
#define DATA_SIZE 1024
int main() {
Clock clock;
clock.start();
Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device
Memory<slong> arr(device, DATA_SIZE, 1u, true, false);
Memory<slong> inputA(device, DATA_SIZE);
Memory<slong> inputB(device, DATA_SIZE);
Memory<slong> output(device, DATA_SIZE);
for(int i=1; i<DATA_SIZE-1; i++) {
inputA[i-1] = (slong)i;
arr[i-1] = (slong)i;
}
inputA.write_to_device();
Kernel kernel(device, DATA_SIZE, "add", inputA, output, inputB);
kernel.add_constants(DATA_SIZE);
kernel.run();
output.read_from_device();
double time_taken = clock.stop();
// print the results
for(int k=1; k<8; k++) {
printf("%d - ", k);
printf("%d \n", output[k]);
}
printf("\n>>finished parallel in %lf seconds\n", time_taken);
printf("\n-------------------------------------\n");
printf("\n>> start sequential ---------- \n");
long prefixSum[DATA_SIZE] = { 0 };
clock.start();
prefixSum[0] = arr[0];
for(long idx=1; idx<DATA_SIZE; idx++) {
prefixSum[idx] = prefixSum[idx-1]+arr[idx];
}
double seqTime = clock.stop();
printf("\n>> finished sequential in %lf\n", seqTime);
for(int j=0; j<8; j++) {
printf("%d - ", j);
printf("%d \n", prefixSum[j]);
}
wait();
return 0;
}
#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R(
/**
 * Returns the OpenCL C source of the program as a string. It contains one
 * kernel, add, which performs an up-sweep / down-sweep scan of input into
 * output, using temp as scratch space; size is the element count.
 *
 * Each work-item handles elements 2*thid and 2*thid+1, so only the first
 * size/2 work-items have valid elements. The loads and stores are guarded so
 * that a launch with more work-items than that does not index past size.
 * The barrier calls stay outside every guard because all work-items of a
 * work-group have to reach them.
 */
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################
kernel void add(__global long* input, __global long* output, __global long* temp, int size) {
    int thid = get_global_id(0);
    int offset = 1;
    printf("%d",thid);
    // copy this work-item pair of elements into the scratch buffer, skipping indices beyond size
    if(2*thid < size) temp[2*thid] = input[2*thid];
    if(2*thid+1 < size) temp[2*thid+1] = input[2*thid+1];
    // up-sweep: the stride doubles while the number of active work-items halves
    // NOTE: barrier only synchronises the work-items of one work-group, so the result is
    // only well defined when all participating work-items are in the same work-group
    for(int d= size>>1; d>0; d >>= 1) {
        barrier(CLK_GLOBAL_MEM_FENCE);
        if(thid < d) {
            int ai = offset*(2*thid + 1)-1;
            int bi = offset*(2*thid + 2)-1;
            temp[bi] += temp[ai];
        }
        offset = offset*2;
    }
    // clear the last element before the down-sweep
    temp[size-1] = 0;
    barrier(CLK_GLOBAL_MEM_FENCE);
    // down-sweep: the stride halves while the number of active work-items doubles
    for(int d = 1; d<size; d *= 2) {
        offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE);
        if(thid < d) {
            int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1;
            long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t;
        }
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    // write this work-item pair of results, again skipping indices beyond size
    if(2*thid < size) output[2*thid] = temp[2*thid];
    if(2*thid+1 < size) output[2*thid+1] = temp[2*thid+1];
}
);} // ############################################################### end of OpenCL C code #####################################################################