clEnqueueReadBuffer is too slow

I am doing some real-time ray tracing with OpenCL 1.2 (NVIDIA's SDK). My problem is that data transfers between the GPU and the CPU are very slow. The critical part is getting the output data from the GPU back to the host every frame. I read it with clEnqueueReadBuffer; the buffer is created as a copy of the host data. Reading 4*800*600 bytes (the image dimensions, RGBA 32-bit) takes about 8 ms, which works out to only around 240 MB/s. That is unacceptable for real time. How can I fix it?

I also tried clEnqueueMapBuffer, but the result was the same.
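For reference, the map-based readback pattern in question looks roughly like this (a sketch only, not my exact code; it assumes the output buffer is created with CL_MEM_ALLOC_HOST_PTR so the runtime can back it with pinned host memory):

cl_int err;
// Output buffer backed by host-allocated (pinned) memory -- sketch, names are illustrative.
cl_mem pinnedOutput = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                     outputSize, NULL, &err);

/* ... enqueue the kernel that writes pinnedOutput ... */

// Blocking map returns a host pointer to the buffer contents.
void* mapped = clEnqueueMapBuffer(command_queue, pinnedOutput, CL_TRUE, CL_MAP_READ,
                                  0, outputSize, 0, NULL, NULL, &err);
/* ... consume the frame through 'mapped' ... */
clEnqueueUnmapMemObject(command_queue, pinnedOutput, mapped, 0, NULL, NULL);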

Edit: adding the host code

struct CL_Sphere
{
    rs::vec4 center,color,rad;
    CL_Sphere(vec3 c, float rad, vec3 cc):center((vec4)c), color((vec4)cc), rad(vec4(rad,0,0,0)){}
};

class CLLib
{
private:
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_program program = NULL;
    cl_platform_id platform_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;
    string source;

    cl_mem memobjInput = NULL;
    cl_mem memobjOutput = NULL;

    cl_kernel kernel = NULL;

    size_t workGroups;
    size_t workItems;
    size_t dimSize;


public:

    size_t inputSize;
    size_t outputSize;
    void* bufferIn = NULL;   // initialized so the checks in reinitDataContainers() are safe on first use
    void* bufferOut = NULL;

    CLLib(string filename, string kernelName)
    {

        /* Get Platform and Device Info */
        ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
        ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);


        /* Create OpenCL context */
        context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

        /* Create Command Queue */
        command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

        bool read = readFile(filename, &source);
        if(!read) throw "Failed to read a file";

        size_t source_size = source.length() + 1;
        char* source_str = new char[source_size];
        strcpy_s(source_str, source_size * sizeof(char), source.c_str());

        /* Create Kernel Program from the source */
        program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
            (const size_t *)&source_size, &ret);


        /* Build Kernel Program */
        ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

        if (ret == CL_BUILD_PROGRAM_FAILURE) {
            // Determine the size of the log
            size_t log_size;
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

            // Allocate memory for the log
            char *log = (char *) malloc(log_size);

            // Get the log
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

            // Print the log
            printf("%s\n", log);
        }


        /* Create OpenCL Kernel */

        kernel = clCreateKernel(program, kernelName.c_str(), &ret);

        delete[] source_str;

    }


    void reinitDataContainers(size_t inputSize, size_t outputSize)
    {
        this->inputSize = inputSize;
        this->outputSize = outputSize;

        if(bufferIn){
            free(bufferIn);
        }
        if(bufferOut){
            free(bufferOut);
        }

        bufferIn = malloc(inputSize);
        bufferOut = malloc(outputSize);

        if(memobjInput){
            ret = clReleaseMemObject(memobjInput);
        }
        if(memobjOutput){
            ret = clReleaseMemObject(memobjOutput);
        }


        memobjInput = clCreateBuffer(context, CL_MEM_READ_WRITE, inputSize, 0, &ret);
        memobjOutput = clCreateBuffer(context, CL_MEM_WRITE_ONLY, outputSize, 0, &ret);

    }
    void build(size_t dimSize, size_t workGroups, size_t workItems)
    {
        this->workGroups = workGroups;
        this->workItems = workItems;
        this->dimSize = dimSize;

        clEnqueueWriteBuffer(command_queue, memobjInput, CL_TRUE, 0, inputSize, bufferIn, 0, NULL, NULL);

        /* Set OpenCL Kernel Parameters */
        ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjInput);
        ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjOutput);

    }

    void execute()
    {
        /* Execute OpenCL Kernel */
        ret = clEnqueueNDRangeKernel(command_queue, kernel, dimSize, 0, &workGroups, &workItems, 0, NULL, NULL);

        double curTime = Timer::getTimeNanoSeconds();
        clEnqueueReadBuffer(command_queue, memobjOutput, CL_TRUE, 0, outputSize, bufferOut, 0, NULL, NULL);
        //println("delta: "+ toString(Timer::getTimeNanoSeconds() - curTime));
    }


    void release()
    {
        /* Finalization */
        ret = clFlush(command_queue);
        ret = clFinish(command_queue);
        ret = clReleaseKernel(kernel);
        ret = clReleaseProgram(program);
        ret = clReleaseMemObject(memobjInput);
        ret = clReleaseMemObject(memobjOutput);
        ret = clReleaseCommandQueue(command_queue);
        ret = clReleaseContext(context);

        free(bufferIn);
        free(bufferOut);
    }
};

As I said in the comments, you are most likely measuring the kernel execution together with the data read and attributing all of that time to the read. Here is how to measure it correctly:

void execute()
{
    /* Execute OpenCL Kernel */
    double startTime = Timer::getTimeNanoSeconds();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, dimSize, 0, &workGroups, &workItems, 0, NULL, NULL);
    clFinish(command_queue);
    double curTime = Timer::getTimeNanoSeconds();
    clEnqueueReadBuffer(command_queue, memobjOutput, CL_TRUE, 0, outputSize, bufferOut, 0, NULL, NULL);
    double curTime2 = Timer::getTimeNanoSeconds();
    println("delta kernel: " + toString(curTime - startTime));
    println("delta data read: " + toString(curTime2 - curTime));
}

Normally clFinish is redundant before a blocking read, but here it is what separates the kernel time from the read time.
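If you want timings that do not depend on host-side clocks at all, you can also let OpenCL time the commands through events. A minimal sketch, assuming the queue is recreated with CL_QUEUE_PROFILING_ENABLE (everything else matches the code above):

cl_command_queue queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);

cl_event readEvent;
clEnqueueReadBuffer(queue, memobjOutput, CL_TRUE, 0, outputSize, bufferOut, 0, NULL, &readEvent);

// Device timestamps are reported in nanoseconds.
cl_ulong start = 0, end = 0;
clGetEventProfilingInfo(readEvent, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
clGetEventProfilingInfo(readEvent, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
printf("data read: %.3f ms\n", (end - start) * 1e-6);
clReleaseEvent(readEvent);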