clEnqueueReadBuffer is too slow
I'm doing some real-time ray tracing with OpenCL 1.2 (NVIDIA's SDK), and I've hit very slow data transfers between the GPU and the CPU. The part that matters most is getting the output data from the GPU back to the host every frame, which I do with clEnqueueReadBuffer. The buffer is created as a copy of the host data. Reading 4*800*600 bytes (the image dimensions, RGBA 32-bit) takes about 8 ms, which works out to only around 240 MB/s, far below what the PCIe bus should deliver. That speed is unacceptable. How can I fix it?
I also tried clEnqueueMapBuffer, but the result was the same.
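For reference, the mapping variant I tried looked roughly like the sketch below. It is simplified, and the CL_MEM_ALLOC_HOST_PTR flag is an assumption on my part (it is what NVIDIA's best-practices material suggests for getting pinned host memory); the surrounding setup is the host code in the edit below.

/* Sketch of the map/unmap path (simplified).
   CL_MEM_ALLOC_HOST_PTR asks the driver to back the buffer with
   host-allocated, ideally pinned (page-locked) memory. */
cl_int err;
cl_mem memobjOutput = clCreateBuffer(context,
        CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, outputSize, NULL, &err);

/* ... run the kernel that fills memobjOutput ... */

/* Blocking map: returns a host pointer to the buffer contents. */
void* pixels = clEnqueueMapBuffer(command_queue, memobjOutput, CL_TRUE,
        CL_MAP_READ, 0, outputSize, 0, NULL, NULL, &err);
/* ... consume the frame through `pixels` ... */
clEnqueueUnmapMemObject(command_queue, memobjOutput, pixels, 0, NULL, NULL);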
Edit: adding the host code
struct CL_Sphere
{
    rs::vec4 center, color, rad;

    CL_Sphere(vec3 c, float rad, vec3 cc)
        : center((vec4)c), color((vec4)cc), rad(vec4(rad, 0, 0, 0)) {}
};

class CLLib
{
private:
    cl_device_id     device_id     = NULL;
    cl_context       context       = NULL;
    cl_command_queue command_queue = NULL;
    cl_program       program       = NULL;
    cl_platform_id   platform_id   = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int  ret;
    string  source;
    cl_mem  memobjInput  = NULL;
    cl_mem  memobjOutput = NULL;
    cl_kernel kernel = NULL;
    size_t workGroups;
    size_t workItems;
    size_t dimSize;
public:
    size_t inputSize;
    size_t outputSize;
    void* bufferIn  = NULL;  // initialized so the free()/realloc checks below are safe
    void* bufferOut = NULL;

    CLLib(string filename, string kernelName)
    {
        /* Get platform and device info */
        ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
        ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
        /* Create OpenCL context */
        context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
        /* Create command queue */
        command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

        bool read = readFile(filename, &source);
        if (!read) throw "Failed to read a file";

        size_t source_size = source.length() + 1;
        char* source_str = new char[source_size];
        strcpy_s(source_str, source_size * sizeof(char), source.c_str());

        /* Create the kernel program from source */
        program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
                                            (const size_t *)&source_size, &ret);
        /* Build the kernel program */
        ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
        if (ret == CL_BUILD_PROGRAM_FAILURE) {
            // Determine the size of the log
            size_t log_size;
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
            // Allocate memory for the log
            char *log = (char *) malloc(log_size);
            // Get the log
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
            // Print the log
            printf("%s\n", log);
            free(log);
        }

        /* Create the OpenCL kernel */
        kernel = clCreateKernel(program, kernelName.c_str(), &ret);
        delete[] source_str;
    }

    void reinitDataContainers(size_t inputSize, size_t outputSize)
    {
        this->inputSize = inputSize;
        this->outputSize = outputSize;
        if (bufferIn) {
            free(bufferIn);
        }
        if (bufferOut) {
            free(bufferOut);
        }
        bufferIn = malloc(inputSize);
        bufferOut = malloc(outputSize);
        if (memobjInput) {
            ret = clReleaseMemObject(memobjInput);
        }
        if (memobjOutput) {
            ret = clReleaseMemObject(memobjOutput);
        }
        memobjInput = clCreateBuffer(context, CL_MEM_READ_WRITE, inputSize, 0, &ret);
        memobjOutput = clCreateBuffer(context, CL_MEM_WRITE_ONLY, outputSize, 0, &ret);
    }

    void build(size_t dimSize, size_t workGroups, size_t workItems)
    {
        this->workGroups = workGroups;
        this->workItems = workItems;
        this->dimSize = dimSize;
        clEnqueueWriteBuffer(command_queue, memobjInput, CL_TRUE, 0, inputSize, bufferIn, 0, NULL, NULL);
        /* Set the OpenCL kernel arguments */
        ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjInput);
        ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjOutput);
    }

    void execute()
    {
        /* Execute the OpenCL kernel */
        ret = clEnqueueNDRangeKernel(command_queue, kernel, dimSize, 0, &workGroups, &workItems, 0, NULL, NULL);
        double curTime = Timer::getTimeNanoSeconds();
        clEnqueueReadBuffer(command_queue, memobjOutput, CL_TRUE, 0, outputSize, bufferOut, 0, NULL, NULL);
        //println("delta: " + toString(Timer::getTimeNanoSeconds() - curTime));
    }

    void release()
    {
        /* Finalization */
        ret = clFlush(command_queue);
        ret = clFinish(command_queue);
        ret = clReleaseKernel(kernel);
        ret = clReleaseProgram(program);
        ret = clReleaseMemObject(memobjInput);
        ret = clReleaseMemObject(memobjOutput);
        ret = clReleaseCommandQueue(command_queue);
        ret = clReleaseContext(context);
        free(bufferIn);
        free(bufferOut);
    }
};
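For context, this is roughly how the class is driven each frame (a sketch; the file name, kernel name and size variables are placeholders):

CLLib cl("raytracer.cl", "traceRays");   // hypothetical file/kernel names
cl.reinitDataContainers(sizeof(CL_Sphere) * sphereCount, 4 * 800 * 600);
/* ... fill cl.bufferIn with CL_Sphere data ... */
cl.build(1, globalWorkSize, localWorkSize);   // uploads input, sets kernel args
while (rendering) {
    cl.execute();   // runs the kernel, then blocking-reads into cl.bufferOut
    /* ... present cl.bufferOut as the next frame ... */
}
cl.release();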
As I said in the comments, you are probably timing the kernel execution together with the data read, and concluding that the read is what takes so long. Here is how to measure it properly:
void execute()
{
    double t0 = Timer::getTimeNanoSeconds();
    /* Execute the OpenCL kernel */
    ret = clEnqueueNDRangeKernel(command_queue, kernel, dimSize, 0, &workGroups, &workItems, 0, NULL, NULL);
    /* Wait for the kernel to finish so its runtime is not charged to the read */
    clFinish(command_queue);
    double t1 = Timer::getTimeNanoSeconds();
    clEnqueueReadBuffer(command_queue, memobjOutput, CL_TRUE, 0, outputSize, bufferOut, 0, NULL, NULL);
    double t2 = Timer::getTimeNanoSeconds();
    println("delta kernel: " + toString(t1 - t0));
    println("delta data read: " + toString(t2 - t1));
}
A clFinish before a blocking read is normally redundant, but here it is what makes the kernel timing correct: without it, the enqueued kernel would still be running when the blocking read starts, and its runtime would be counted as read time.
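If you want numbers from the driver rather than wall-clock deltas, OpenCL event profiling works too. A minimal sketch under the same setup as above; the only changes are the CL_QUEUE_PROFILING_ENABLE flag when creating the queue and reading the event timestamps afterwards:

/* Sketch: timing via OpenCL event profiling (assumes the same
   context, kernel and buffers as in the host code above). */
cl_command_queue queue = clCreateCommandQueue(context, device_id,
        CL_QUEUE_PROFILING_ENABLE, &ret);

cl_event kernel_ev, read_ev;
ret = clEnqueueNDRangeKernel(queue, kernel, dimSize, 0, &workGroups,
        &workItems, 0, NULL, &kernel_ev);
ret = clEnqueueReadBuffer(queue, memobjOutput, CL_TRUE, 0, outputSize,
        bufferOut, 0, NULL, &read_ev);

/* Timestamps are in nanoseconds. */
cl_ulong start, end;
clGetEventProfilingInfo(kernel_ev, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
clGetEventProfilingInfo(kernel_ev, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
printf("kernel: %f ms\n", (end - start) * 1e-6);
clGetEventProfilingInfo(read_ev, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
clGetEventProfilingInfo(read_ev, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
printf("read:   %f ms\n", (end - start) * 1e-6);

clReleaseEvent(kernel_ev);
clReleaseEvent(read_ev);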