OpenCL 缓冲区为空
OpenCL buffer is empty
我正在尝试通过本教程学习如何使用 OpenCL:https://anteru.net/blog/2012/11/04/2016/index.html ,但我认为浮点缓冲区中的值根本没有被设置。当我在最后读取缓冲区时,得到的要么是 0,要么是用科学记数法表示的小数,就像缓冲区里装的是随机内存一样。代码贴在下面。内核接受 3 个参数:float 缓冲区 x、float 缓冲区 y 和 float a,然后执行 const int i = get_global_id(0); 和 y[i] += a * x[i];
问题(我认为)在于我从未向 aBuffer 或 bBuffer 中写入任何数字,所以乘法和加法没有实际数据可算。但奇怪的是,当我把内核改成 y[i] += a; 时,输出仍然一样,而我原以为结果会是 2。
main.cpp:
#include <iostream>
#include <vector>
#ifdef __APPLE__
#include "OpenCL/opencl.h"
#else
#include "CL/cl.h"
#endif
using namespace std;
// Program as posted in the question. It enumerates OpenCL platforms and the
// GPU devices of the first platform, creates a context and two 64-float
// buffers, sets the three SAXPY kernel arguments, enqueues the kernel over 64
// work-items and prints the 64 floats read back from bBuffer.
// NOTE(review): no OpenCL return code is inspected anywhere in this function,
// and `program` and `queue` are used without ever being assigned (see below).
int main(int argc, const char * argv[]) {
// First call only asks how many platforms exist; second call fills the vector.
cl_uint platformIdCount = 0;
clGetPlatformIDs(0, nullptr, &platformIdCount);
vector<cl_platform_id> platformIds(platformIdCount);
clGetPlatformIDs(platformIdCount, platformIds.data(), nullptr);
cout << "Platforms " << platformIdCount << endl;
// Same count-then-fill pattern for the GPU devices of the first platform.
// NOTE(review): platformIds[0] is indexed without checking platformIdCount > 0.
cl_uint deviceIdCount = 0;
clGetDeviceIDs(platformIds[0], CL_DEVICE_TYPE_GPU, 0, nullptr, &deviceIdCount);
cout << "Devices " << deviceIdCount << endl;
vector<cl_device_id> deviceIds(deviceIdCount);
clGetDeviceIDs(platformIds[0], CL_DEVICE_TYPE_GPU, deviceIdCount, deviceIds.data(), nullptr);
// Zero-terminated property list selecting the first platform for the context.
const cl_context_properties contextProperties[] = {
CL_CONTEXT_PLATFORM,
reinterpret_cast<cl_context_properties>(platformIds[0]),
0,0
};
cl_int error = 0;
cl_context context = clCreateContext(contextProperties, deviceIdCount, deviceIds.data(), nullptr, nullptr, &error);
// NOTE(review): the status written by clCreateContext is overwritten here without being read.
error = 0;
// NOTE(review): CL_MEM_COPY_HOST_PTR is requested but the host pointer argument is
// nullptr, so no initial data is supplied (the OpenCL spec documents
// CL_INVALID_HOST_PTR for this combination); `error` is not checked afterwards.
cl_mem aBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * (64), nullptr, &error);
cl_mem bBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * (64), nullptr, &error);
// NOTE(review): `program` is declared but never assigned before being passed to
// clBuildProgram and clCreateKernel; no clCreateProgramWithSource call precedes them.
cl_program program;
clBuildProgram(program, deviceIdCount, deviceIds.data(), nullptr, nullptr, nullptr);
cl_kernel kernel1 = clCreateKernel(program, "SAXPY", &error);
// NOTE(review): the cl_mem handles themselves are passed as the argument value here,
// whereas the scalar argument below is passed by address (&two).
clSetKernelArg(kernel1, 0, sizeof(cl_mem), aBuffer);
clSetKernelArg(kernel1, 1, sizeof(cl_mem), bBuffer);
static const float two = 2.0f;
clSetKernelArg(kernel1, 2, sizeof(float),&two);
// The enqueue below passes a work dimension of 1, so only the first entry (64) is used.
const size_t globalWorkSize [] = {64,0,0};
// NOTE(review): `queue` is declared but never assigned before the two enqueue calls.
cl_command_queue queue;
clEnqueueNDRangeKernel(queue, kernel1, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr);
float done[64];
// CL_TRUE requests a blocking read of bBuffer into `done`.
clEnqueueReadBuffer(queue, bBuffer, CL_TRUE, 0, sizeof(float)*64, done, 0, nullptr, nullptr);
for (int a = 0; a < 64; a++) {
cout << done[a] << endl;
}
// Only the context is released; the buffers and kernel are not.
clReleaseContext(context);
return 0;
}
.cl 文件:
// SAXPY kernel: each work-item updates one element, y[i] += a * x[i].
//   x - buffer that is only read
//   y - buffer that is read and then overwritten in place
//   a - scalar multiplier applied to x[i]
kernel void SAXPY(__global float* x,__global float* y, float a){
// The work-item's global id in dimension 0 selects the element to process.
const int i = get_global_id(0);
//y[i] = 2.0f;
y[i] += a * x[i];
}
首先,设置内核参数时,必须传入指向内存对象的指针,而不是内存对象本身:
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &aBuffer); // &aBuffer, not aBuffer
clSetKernelArg(kernel1, 1, sizeof(cl_mem), &bBuffer); // &bBuffer, not bBuffer
其次,你没有创建命令队列(command queue):
cl_command_queue queue = clCreateCommandQueue(context, deviceIds[0], 0, nullptr);
第三,您没有在调用 clBuildProgram() 之前先调用 clCreateProgramWithSource() 来创建程序对象。
此外,请用主机端数据初始化这些 cl_mem 对象:
cl_float* mem = (cl_float*) malloc(sizeof(cl_float)*64);
for(int i=0; i<64; i++)
mem[i] = i;
cl_mem aBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * (64), mem, &error);
cl_mem bBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * (64), mem, &error);
free(mem);
修正后的代码:
#include <iostream>
#include <vector>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include "CL/cl.h"
#endif
using namespace std;
int main(int argc, const char * argv[]) {
cl_uint platformIdCount = 0;
clGetPlatformIDs(0, nullptr, &platformIdCount);
vector<cl_platform_id> platformIds(platformIdCount);
clGetPlatformIDs(platformIdCount, platformIds.data(), nullptr);
cl_uint deviceIdCount = 0;
clGetDeviceIDs(platformIds[0], CL_DEVICE_TYPE_GPU, 0, nullptr, &deviceIdCount);
vector<cl_device_id> deviceIds(deviceIdCount);
clGetDeviceIDs(platformIds[0], CL_DEVICE_TYPE_GPU, deviceIdCount, deviceIds.data(), nullptr);
const cl_context_properties contextProperties[] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)platformIds[0],
0
};
cl_int error = 0;
cl_context context = clCreateContext(contextProperties, 1, &deviceIds[0], [](const char* errinfo, const void* private_info, size_t cb, void* user_data) -> void {
/* context-creation and runtime error handler */
cout << "Context error: " << errinfo << endl;
}, nullptr, &error);
cl_float* mem = (cl_float*) malloc(sizeof(cl_float)*64);
for(int i=0; i<64; i++)
mem[i] = i;
cl_mem aBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * (64), mem, &error);
cl_mem bBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * (64), mem, &error);
free(mem);
cl_program program;
string src = "__kernel void SAXPY(__global float* x, __global float* y, float a){"
"size_t i=get_global_id(0);"
"y[i]=a*x[i];"
"}";
const char* sources[] = {src.c_str()};
const size_t lens[] = {src.length()};
program = clCreateProgramWithSource(context, 1, sources, lens, &error);
clBuildProgram(program, 1, &deviceIds[0], nullptr, nullptr, nullptr);
cl_kernel kernel1 = clCreateKernel(program, "SAXPY", &error);
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &aBuffer);
clSetKernelArg(kernel1, 1, sizeof(cl_mem), &bBuffer);
static const float two = 2.0f;
clSetKernelArg(kernel1, 2, sizeof(float),&two);
const size_t globalWorkSize [] = {64,0,0};
cl_command_queue queue = clCreateCommandQueue(context, deviceIds[0], 0, nullptr);
clEnqueueNDRangeKernel(queue, kernel1, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr);
float done[64];
clEnqueueReadBuffer(queue, bBuffer, CL_TRUE, 0, sizeof(float)*64, done, 0, nullptr, nullptr);
for (int a = 0; a < 64; a++)
cout << done[a] << endl;
clReleaseContext(context);
return 0;
}
我正在尝试通过本教程学习如何使用 OpenCL:https://anteru.net/blog/2012/11/04/2016/index.html ,但我认为浮点缓冲区中的值根本没有被设置。当我在最后读取缓冲区时,得到的要么是 0,要么是用科学记数法表示的小数,就像缓冲区里装的是随机内存一样。代码贴在下面。内核接受 3 个参数:float 缓冲区 x、float 缓冲区 y 和 float a,然后执行 const int i = get_global_id(0); 和 y[i] += a * x[i];
问题(我认为)在于我从未向 aBuffer 或 bBuffer 中写入任何数字,所以乘法和加法没有实际数据可算。但奇怪的是,当我把内核改成 y[i] += a; 时,输出仍然一样,而我原以为结果会是 2。
main.cpp:
#include <iostream>
#include <vector>
#ifdef __APPLE__
#include "OpenCL/opencl.h"
#else
#include "CL/cl.h"
#endif
using namespace std;
// Program as posted in the question. It enumerates OpenCL platforms and the
// GPU devices of the first platform, creates a context and two 64-float
// buffers, sets the three SAXPY kernel arguments, enqueues the kernel over 64
// work-items and prints the 64 floats read back from bBuffer.
// NOTE(review): no OpenCL return code is inspected anywhere in this function,
// and `program` and `queue` are used without ever being assigned (see below).
int main(int argc, const char * argv[]) {
// First call only asks how many platforms exist; second call fills the vector.
cl_uint platformIdCount = 0;
clGetPlatformIDs(0, nullptr, &platformIdCount);
vector<cl_platform_id> platformIds(platformIdCount);
clGetPlatformIDs(platformIdCount, platformIds.data(), nullptr);
cout << "Platforms " << platformIdCount << endl;
// Same count-then-fill pattern for the GPU devices of the first platform.
// NOTE(review): platformIds[0] is indexed without checking platformIdCount > 0.
cl_uint deviceIdCount = 0;
clGetDeviceIDs(platformIds[0], CL_DEVICE_TYPE_GPU, 0, nullptr, &deviceIdCount);
cout << "Devices " << deviceIdCount << endl;
vector<cl_device_id> deviceIds(deviceIdCount);
clGetDeviceIDs(platformIds[0], CL_DEVICE_TYPE_GPU, deviceIdCount, deviceIds.data(), nullptr);
// Zero-terminated property list selecting the first platform for the context.
const cl_context_properties contextProperties[] = {
CL_CONTEXT_PLATFORM,
reinterpret_cast<cl_context_properties>(platformIds[0]),
0,0
};
cl_int error = 0;
cl_context context = clCreateContext(contextProperties, deviceIdCount, deviceIds.data(), nullptr, nullptr, &error);
// NOTE(review): the status written by clCreateContext is overwritten here without being read.
error = 0;
// NOTE(review): CL_MEM_COPY_HOST_PTR is requested but the host pointer argument is
// nullptr, so no initial data is supplied (the OpenCL spec documents
// CL_INVALID_HOST_PTR for this combination); `error` is not checked afterwards.
cl_mem aBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * (64), nullptr, &error);
cl_mem bBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * (64), nullptr, &error);
// NOTE(review): `program` is declared but never assigned before being passed to
// clBuildProgram and clCreateKernel; no clCreateProgramWithSource call precedes them.
cl_program program;
clBuildProgram(program, deviceIdCount, deviceIds.data(), nullptr, nullptr, nullptr);
cl_kernel kernel1 = clCreateKernel(program, "SAXPY", &error);
// NOTE(review): the cl_mem handles themselves are passed as the argument value here,
// whereas the scalar argument below is passed by address (&two).
clSetKernelArg(kernel1, 0, sizeof(cl_mem), aBuffer);
clSetKernelArg(kernel1, 1, sizeof(cl_mem), bBuffer);
static const float two = 2.0f;
clSetKernelArg(kernel1, 2, sizeof(float),&two);
// The enqueue below passes a work dimension of 1, so only the first entry (64) is used.
const size_t globalWorkSize [] = {64,0,0};
// NOTE(review): `queue` is declared but never assigned before the two enqueue calls.
cl_command_queue queue;
clEnqueueNDRangeKernel(queue, kernel1, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr);
float done[64];
// CL_TRUE requests a blocking read of bBuffer into `done`.
clEnqueueReadBuffer(queue, bBuffer, CL_TRUE, 0, sizeof(float)*64, done, 0, nullptr, nullptr);
for (int a = 0; a < 64; a++) {
cout << done[a] << endl;
}
// Only the context is released; the buffers and kernel are not.
clReleaseContext(context);
return 0;
}
.cl 文件:
// SAXPY kernel: each work-item updates one element, y[i] += a * x[i].
//   x - buffer that is only read
//   y - buffer that is read and then overwritten in place
//   a - scalar multiplier applied to x[i]
kernel void SAXPY(__global float* x,__global float* y, float a){
// The work-item's global id in dimension 0 selects the element to process.
const int i = get_global_id(0);
//y[i] = 2.0f;
y[i] += a * x[i];
}
首先,设置内核参数时,必须传入指向内存对象的指针,而不是内存对象本身:
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &aBuffer); // &aBuffer, not aBuffer
clSetKernelArg(kernel1, 1, sizeof(cl_mem), &bBuffer); // &bBuffer, not bBuffer
其次,你没有创建命令队列(command queue):
cl_command_queue queue = clCreateCommandQueue(context, deviceIds[0], 0, nullptr);
第三,您没有在调用 clBuildProgram() 之前先调用 clCreateProgramWithSource() 来创建程序对象。
此外,请用主机端数据初始化这些 cl_mem 对象:
cl_float* mem = (cl_float*) malloc(sizeof(cl_float)*64);
for(int i=0; i<64; i++)
mem[i] = i;
cl_mem aBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * (64), mem, &error);
cl_mem bBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * (64), mem, &error);
free(mem);
修正后的代码:
#include <iostream>
#include <vector>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include "CL/cl.h"
#endif
using namespace std;
int main(int argc, const char * argv[]) {
cl_uint platformIdCount = 0;
clGetPlatformIDs(0, nullptr, &platformIdCount);
vector<cl_platform_id> platformIds(platformIdCount);
clGetPlatformIDs(platformIdCount, platformIds.data(), nullptr);
cl_uint deviceIdCount = 0;
clGetDeviceIDs(platformIds[0], CL_DEVICE_TYPE_GPU, 0, nullptr, &deviceIdCount);
vector<cl_device_id> deviceIds(deviceIdCount);
clGetDeviceIDs(platformIds[0], CL_DEVICE_TYPE_GPU, deviceIdCount, deviceIds.data(), nullptr);
const cl_context_properties contextProperties[] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)platformIds[0],
0
};
cl_int error = 0;
cl_context context = clCreateContext(contextProperties, 1, &deviceIds[0], [](const char* errinfo, const void* private_info, size_t cb, void* user_data) -> void {
/* context-creation and runtime error handler */
cout << "Context error: " << errinfo << endl;
}, nullptr, &error);
cl_float* mem = (cl_float*) malloc(sizeof(cl_float)*64);
for(int i=0; i<64; i++)
mem[i] = i;
cl_mem aBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * (64), mem, &error);
cl_mem bBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * (64), mem, &error);
free(mem);
cl_program program;
string src = "__kernel void SAXPY(__global float* x, __global float* y, float a){"
"size_t i=get_global_id(0);"
"y[i]=a*x[i];"
"}";
const char* sources[] = {src.c_str()};
const size_t lens[] = {src.length()};
program = clCreateProgramWithSource(context, 1, sources, lens, &error);
clBuildProgram(program, 1, &deviceIds[0], nullptr, nullptr, nullptr);
cl_kernel kernel1 = clCreateKernel(program, "SAXPY", &error);
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &aBuffer);
clSetKernelArg(kernel1, 1, sizeof(cl_mem), &bBuffer);
static const float two = 2.0f;
clSetKernelArg(kernel1, 2, sizeof(float),&two);
const size_t globalWorkSize [] = {64,0,0};
cl_command_queue queue = clCreateCommandQueue(context, deviceIds[0], 0, nullptr);
clEnqueueNDRangeKernel(queue, kernel1, 1, nullptr, globalWorkSize, nullptr, 0, nullptr, nullptr);
float done[64];
clEnqueueReadBuffer(queue, bBuffer, CL_TRUE, 0, sizeof(float)*64, done, 0, nullptr, nullptr);
for (int a = 0; a < 64; a++)
cout << done[a] << endl;
clReleaseContext(context);
return 0;
}