OpenCL ray tracer works fine on CPU but not (always) on GPU

[See the EDIT below; the answer to the question is there.]

I'm trying to learn OpenCL by studying a small ray tracer (see the code below, which comes from link).

I don't have a "real" GPU; I'm currently working on a macOS laptop with an Intel(R) Iris(TM) Graphics 6100 graphics card.

The code runs fine on the CPU, but it behaves strangely on the GPU: whether it works depends on the number of samples per pixel (the number of rays shot through each pixel and propagated through the scene to obtain its colour). With a small sample count (64) I can render a 1280x720 picture, but with 128 samples I can only get smaller pictures. As far as I understand, the number of samples should not change anything except the quality of the picture. Is there something purely OpenCL/GPU related that I'm missing?

Also, it seems to be the read-back of the result from GPU memory that crashes:

queue.enqueueReadBuffer(cl_output, CL_TRUE, 0, image_width * image_height * sizeof(cl_float4), cpu_output);

At this stage I get "Abort trap: 6".

I must be missing something.


[EDIT] After some research I found an interesting lead: the graphics card may automatically abort a task that takes too long, a behaviour meant to keep the screen from "freezing". This thread discusses it.

What do you think?

I could not find a way to turn this behaviour off. Do you know how?


Here are the files:

main.cpp:

// OpenCL based simple sphere path tracer by Sam Lapere, 2016
// based on smallpt by Kevin Beason 
// http://raytracey.blogspot.com 

#include <iostream>
#include <fstream>
#include <vector>
#include <CL/cl.hpp>

using namespace std;
using namespace cl;

const int image_width = 1280;
const int image_height = 720;

cl_float4* cpu_output;
CommandQueue queue;
Device device;
Kernel kernel;
Context context;
Program program;
Buffer cl_output;
Buffer cl_spheres;

// dummy variables are required for memory alignment
// float3 is considered as float4 by OpenCL
struct Sphere
{
    cl_float radius;
    cl_float dummy1;   
    cl_float dummy2;
    cl_float dummy3;
    cl_float3 position;
    cl_float3 color;
    cl_float3 emission;
};

void pickPlatform(Platform& platform, const vector<Platform>& platforms){

    if (platforms.size() == 1) platform = platforms[0];
    else{
        int input = 0;
        cout << "\nChoose an OpenCL platform: ";
        cin >> input;

        // handle incorrect user input
        while (input < 1 || input > platforms.size()){
            cin.clear(); //clear errors/bad flags on cin
            cin.ignore(cin.rdbuf()->in_avail(), '\n'); // ignores exact number of chars in cin buffer
            cout << "No such option. Choose an OpenCL platform: ";
            cin >> input;
        }
        platform = platforms[input - 1];
    }
}

void pickDevice(Device& device, const vector<Device>& devices){

    if (devices.size() == 1) device = devices[0];
    else{
        int input = 0;
        cout << "\nChoose an OpenCL device: ";
        cin >> input;

        // handle incorrect user input
        while (input < 1 || input > devices.size()){
            cin.clear(); //clear errors/bad flags on cin
            cin.ignore(cin.rdbuf()->in_avail(), '\n'); // ignores exact number of chars in cin buffer
            cout << "No such option. Choose an OpenCL device: ";
            cin >> input;
        }
        device = devices[input - 1];
    }
}

void printErrorLog(const Program& program, const Device& device){

    // Get the error log and print to console
    string buildlog = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
    cerr << "Build log:" << std::endl << buildlog << std::endl;

    // Print the error log to a file
    FILE *log = fopen("errorlog.txt", "w");
    fprintf(log, "%s\n", buildlog.c_str());
    fclose(log);
    cout << "Error log saved in 'errorlog.txt'" << endl;
    system("PAUSE");
    exit(1);
}

void initOpenCL()
{
    // Get all available OpenCL platforms (e.g. AMD OpenCL, Nvidia CUDA, Intel OpenCL)
    vector<Platform> platforms;
    Platform::get(&platforms);
    cout << "Available OpenCL platforms : " << endl << endl;
    for (int i = 0; i < platforms.size(); i++)
        cout << "\t" << i + 1 << ": " << platforms[i].getInfo<CL_PLATFORM_NAME>() << endl;

    // Pick one platform
    Platform platform;
    pickPlatform(platform, platforms);
    cout << "\nUsing OpenCL platform: \t" << platform.getInfo<CL_PLATFORM_NAME>() << endl;

    // Get available OpenCL devices on platform
    vector<Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);

    cout << "Available OpenCL devices on this platform: " << endl << endl;
    for (int i = 0; i < devices.size(); i++){
        cout << "\t" << i + 1 << ": " << devices[i].getInfo<CL_DEVICE_NAME>() << endl;
        cout << "\t\tMax compute units: " << devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << endl;
        cout << "\t\tMax work group size: " << devices[i].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << endl << endl;
    }

    // Pick one device
    pickDevice(device, devices);
    cout << "\nUsing OpenCL device: \t" << device.getInfo<CL_DEVICE_NAME>() << endl;
    cout << "\t\t\tMax compute units: " << device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << endl;
    cout << "\t\t\tMax work group size: " << device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << endl;

    // Create an OpenCL context and command queue on that device.
    context = Context(device);
    queue = CommandQueue(context, device);

    // Convert the OpenCL source code to a string
    string source;
    ifstream file("opencl_kernel.cl");
    if (!file){
        cout << "\nNo OpenCL file found!" << endl << "Exiting..." << endl;
        system("PAUSE");
        exit(1);
    }
    string line;
    while (getline(file, line))
        source += line + "\n"; // keep line breaks so build-log line numbers match the file

    const char* kernel_source = source.c_str();

    // Create an OpenCL program by performing runtime source compilation for the chosen device
    program = Program(context, kernel_source);
    cl_int result = program.build({ device });
    if (result) cout << "Error during compilation of OpenCL code!!! (" << result << ")" << endl;
    if (result == CL_BUILD_PROGRAM_FAILURE) printErrorLog(program, device);

    // Create a kernel (entry point in the OpenCL source program)
    kernel = Kernel(program, "render_kernel");
}

void cleanUp(){
    delete[] cpu_output;
}

inline float clamp(float x){ return x < 0.0f ? 0.0f : x > 1.0f ? 1.0f : x; }

// convert RGB float in range [0,1] to int in range [0, 255] and perform gamma correction
inline int toInt(float x){ return int(clamp(x) * 255 + .5); }

void saveImage(){
    // write image to PPM file, a very simple image file format
    // PPM files can be opened with IrfanView (download at www.irfanview.com) or GIMP
    FILE *f = fopen("opencl_raytracer.ppm", "w");
    fprintf(f, "P3\n%d %d\n%d\n", image_width, image_height, 255);

    // loop over all pixels, write RGB values
    for (int i = 0; i < image_width * image_height; i++)
        fprintf(f, "%d %d %d ",
        toInt(cpu_output[i].s[0]),
        toInt(cpu_output[i].s[1]),
        toInt(cpu_output[i].s[2]));
}

#define float3(x, y, z) {{x, y, z}}  // macro to replace ugly initializer braces

void initScene(Sphere* cpu_spheres){

    // left wall
    cpu_spheres[0].radius   = 200.0f;
    cpu_spheres[0].position = float3(-200.6f, 0.0f, 0.0f);
    cpu_spheres[0].color    = float3(0.75f, 0.25f, 0.25f);
    cpu_spheres[0].emission = float3(0.0f, 0.0f, 0.0f);

    // right wall
    cpu_spheres[1].radius   = 200.0f;
    cpu_spheres[1].position = float3(200.6f, 0.0f, 0.0f);
    cpu_spheres[1].color    = float3(0.25f, 0.25f, 0.75f);
    cpu_spheres[1].emission = float3(0.0f, 0.0f, 0.0f);

    // floor
    cpu_spheres[2].radius   = 200.0f;
    cpu_spheres[2].position = float3(0.0f, -200.4f, 0.0f);
    cpu_spheres[2].color    = float3(0.9f, 0.8f, 0.7f);
    cpu_spheres[2].emission = float3(0.0f, 0.0f, 0.0f);

    // ceiling
    cpu_spheres[3].radius   = 200.0f;
    cpu_spheres[3].position = float3(0.0f, 200.4f, 0.0f);
    cpu_spheres[3].color    = float3(0.9f, 0.8f, 0.7f);
    cpu_spheres[3].emission = float3(0.0f, 0.0f, 0.0f);

    // back wall
    cpu_spheres[4].radius   = 200.0f;
    cpu_spheres[4].position = float3(0.0f, 0.0f, -200.4f);
    cpu_spheres[4].color    = float3(0.9f, 0.8f, 0.7f);
    cpu_spheres[4].emission = float3(0.0f, 0.0f, 0.0f);

    // front wall 
    cpu_spheres[5].radius   = 200.0f;
    cpu_spheres[5].position = float3(0.0f, 0.0f, 202.0f);
    cpu_spheres[5].color    = float3(0.9f, 0.8f, 0.7f);
    cpu_spheres[5].emission = float3(0.0f, 0.0f, 0.0f);

    // left sphere
    cpu_spheres[6].radius   = 0.16f;
    cpu_spheres[6].position = float3(-0.25f, -0.24f, -0.1f);
    cpu_spheres[6].color    = float3(0.9f, 0.8f, 0.7f);
    cpu_spheres[6].emission = float3(0.0f, 0.0f, 0.0f);

    // right sphere
    cpu_spheres[7].radius   = 0.16f;
    cpu_spheres[7].position = float3(0.25f, -0.24f, 0.1f);
    cpu_spheres[7].color    = float3(0.9f, 0.8f, 0.7f);
    cpu_spheres[7].emission = float3(0.0f, 0.0f, 0.0f);

    // lightsource
    cpu_spheres[8].radius   = 1.0f;
    cpu_spheres[8].position = float3(0.0f, 1.36f, 0.0f);
    cpu_spheres[8].color    = float3(0.0f, 0.0f, 0.0f);
    cpu_spheres[8].emission = float3(9.0f, 8.0f, 6.0f);

}

int main(){

    // initialise OpenCL
    initOpenCL();

    // allocate memory on CPU to hold the rendered image
    cpu_output = new cl_float3[image_width * image_height];

    // initialise scene
    const int sphere_count = 9;
    Sphere cpu_spheres[sphere_count];
    initScene(cpu_spheres);

    // Create buffers on the OpenCL device for the image and the scene
    cl_output = Buffer(context, CL_MEM_WRITE_ONLY, image_width * image_height * sizeof(cl_float3));
    cl_spheres = Buffer(context, CL_MEM_READ_ONLY, sphere_count * sizeof(Sphere));
    queue.enqueueWriteBuffer(cl_spheres, CL_TRUE, 0, sphere_count * sizeof(Sphere), cpu_spheres);

    // specify OpenCL kernel arguments
    kernel.setArg(0, cl_spheres);
    kernel.setArg(1, image_width);
    kernel.setArg(2, image_height);
    kernel.setArg(3, sphere_count);
    kernel.setArg(4, cl_output);

    // every pixel in the image has its own thread or "work item",
    // so the total amount of work items equals the number of pixels
    std::size_t global_work_size = image_width * image_height;
    std::size_t local_work_size = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device);

    cout << "Kernel work group size: " << local_work_size << endl;

    // Ensure the global work size is a multiple of local work size
     if (global_work_size % local_work_size != 0)
        global_work_size = (global_work_size / local_work_size + 1) * local_work_size;

    cout << "Rendering started..." << endl;

    // launch the kernel
    queue.enqueueNDRangeKernel(kernel, NULL, global_work_size, local_work_size);
    queue.finish();

    cout << "Rendering done! \nCopying output from device to host" << endl;

    // read and copy OpenCL output to CPU
    queue.enqueueReadBuffer(cl_output, CL_TRUE, 0, image_width * image_height * sizeof(cl_float3), cpu_output);

    // save image
    saveImage();
    cout << "Saved image to 'opencl_raytracer.ppm'" << endl;

    // release memory
    cleanUp();

    system("PAUSE");
}

opencl_kernel.cl:

/* OpenCL based simple sphere path tracer by Sam Lapere, 2016*/
/* based on smallpt by Kevin Beason */
/* http://raytracey.blogspot.com */

__constant float EPSILON = 0.00003f; /* required to compensate for limited float precision */
__constant float PI = 3.14159265359f;
__constant int SAMPLES = 128;

typedef struct Ray{
    float3 origin;
    float3 dir;
} Ray;

typedef struct Sphere{
    float radius;
    float3 pos;
    float3 color;
    float3 emission;
} Sphere;

static float get_random(unsigned int *seed0, unsigned int *seed1) {

    /* hash the seeds using bitwise AND operations and bitshifts */
    *seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);  
    *seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);

    unsigned int ires = ((*seed0) << 16) + (*seed1);

    /* use union struct to convert int to float */
    union {
        float f;
        unsigned int ui;
    } res;

    res.ui = (ires & 0x007fffff) | 0x40000000;  /* bitwise AND, bitwise OR */
    return (res.f - 2.0f) / 2.0f;
}

Ray createCamRay(const int x_coord, const int y_coord, const int width, const int height){

    float fx = (float)x_coord / (float)width;  /* convert int in range [0 - width] to float in range [0-1] */
    float fy = (float)y_coord / (float)height; /* convert int in range [0 - height] to float in range [0-1] */

    /* calculate aspect ratio */
    float aspect_ratio = (float)(width) / (float)(height);
    float fx2 = (fx - 0.5f) * aspect_ratio;
    float fy2 = fy - 0.5f;

    /* determine position of pixel on screen */
    float3 pixel_pos = (float3)(fx2, -fy2, 0.0f);

    /* create camera ray*/
    Ray ray;
    ray.origin = (float3)(0.0f, 0.1f, 2.0f); /* fixed camera position */
    ray.dir = normalize(pixel_pos - ray.origin); /* vector from camera to pixel on screen */

    return ray;
}

            /* (__global Sphere* sphere, const Ray* ray) */
float intersect_sphere(const Sphere* sphere, const Ray* ray) /* version using local copy of sphere */
{
    float3 rayToCenter = sphere->pos - ray->origin;
    float b = dot(rayToCenter, ray->dir);
    float c = dot(rayToCenter, rayToCenter) - sphere->radius*sphere->radius;
    float disc = b * b - c;

    if (disc < 0.0f) return 0.0f;
    else disc = sqrt(disc);

    if ((b - disc) > EPSILON) return b - disc;
    if ((b + disc) > EPSILON) return b + disc;

    return 0.0f;
}

bool intersect_scene(__constant Sphere* spheres, const Ray* ray, float* t, int* sphere_id, const int sphere_count)
{
    /* initialise t to a very large number, 
    so t will be guaranteed to be smaller
    when a hit with the scene occurs */

    float inf = 1e20f;
    *t = inf;

    /* check if the ray intersects each sphere in the scene */
    for (int i = 0; i < sphere_count; i++)  {
    
        Sphere sphere = spheres[i]; /* create local copy of sphere */
    
        /* float hitdistance = intersect_sphere(&spheres[i], ray); */
        float hitdistance = intersect_sphere(&sphere, ray);
        /* keep track of the closest intersection and hitobject found so far */
        if (hitdistance != 0.0f && hitdistance < *t) {
            *t = hitdistance;
            *sphere_id = i;
        }
    }
    return *t < inf; /* true when ray interesects the scene */
}


/* the path tracing function */
/* computes a path (starting from the camera) with a defined number of bounces, accumulates light/color at each bounce */
/* each ray hitting a surface will be reflected in a random direction (by randomly sampling the hemisphere above the hitpoint) */
/* small optimisation: diffuse ray directions are calculated using cosine weighted importance sampling */

float3 trace(__constant Sphere* spheres, const Ray* camray, const int sphere_count, const int* seed0, const int* seed1){

    Ray ray = *camray;

    float3 accum_color = (float3)(0.0f, 0.0f, 0.0f);
    float3 mask = (float3)(1.0f, 1.0f, 1.0f);

    for (int bounces = 0; bounces < 8; bounces++){

        float t;   /* distance to intersection */
        int hitsphere_id = 0; /* index of intersected sphere */

        /* if ray misses scene, return background colour */
        if (!intersect_scene(spheres, &ray, &t, &hitsphere_id, sphere_count))
            return accum_color += mask * (float3)(0.15f, 0.15f, 0.25f);

        /* else, we've got a hit! Fetch the closest hit sphere */
        Sphere hitsphere = spheres[hitsphere_id]; /* version with local copy of sphere */

        /* compute the hitpoint using the ray equation */
        float3 hitpoint = ray.origin + ray.dir * t;
    
        /* compute the surface normal and flip it if necessary to face the incoming ray */
        float3 normal = normalize(hitpoint - hitsphere.pos); 
        float3 normal_facing = dot(normal, ray.dir) < 0.0f ? normal : normal * (-1.0f);

        /* compute two random numbers to pick a random point on the hemisphere above the hitpoint*/
        float rand1 = 2.0f * PI * get_random(seed0, seed1);
        float rand2 = get_random(seed0, seed1);
        float rand2s = sqrt(rand2);

        /* create a local orthogonal coordinate frame centered at the hitpoint */
        float3 w = normal_facing;
        float3 axis = fabs(w.x) > 0.1f ? (float3)(0.0f, 1.0f, 0.0f) : (float3)(1.0f, 0.0f, 0.0f);
        float3 u = normalize(cross(axis, w));
        float3 v = cross(w, u);

        /* use the coordinte frame and random numbers to compute the next ray direction */
        float3 newdir = normalize(u * cos(rand1)*rand2s + v*sin(rand1)*rand2s + w*sqrt(1.0f - rand2));

        /* add a very small offset to the hitpoint to prevent self intersection */
        ray.origin = hitpoint + normal_facing * EPSILON;
        ray.dir = newdir;

        /* add the colour and light contributions to the accumulated colour */
        accum_color += mask * hitsphere.emission; 

        /* the mask colour picks up surface colours at each bounce */
        mask *= hitsphere.color; 
    
        /* perform cosine-weighted importance sampling for diffuse surfaces*/
        mask *= dot(newdir, normal_facing); 
    }

    return accum_color;
}

__kernel void render_kernel(__constant Sphere* spheres, const int width, const int height, const int sphere_count, __global float3* output){
    unsigned int work_item_id = get_global_id(0);   /* the unique global id of the work item for the current pixel */
    unsigned int x_coord = work_item_id % width;            /* x-coordinate of the pixel */
    unsigned int y_coord = work_item_id / width;            /* y-coordinate of the pixel */

    /* seeds for random number generator */
    unsigned int seed0 = x_coord;
    unsigned int seed1 = y_coord;

    Ray camray = createCamRay(x_coord, y_coord, width, height);

    /* add the light contribution of each sample and average over all samples*/
    float3 finalcolor = (float3)(0.0f, 0.0f, 0.0f);
    float invSamples = 1.0f / SAMPLES;

    for (int i = 0; i < SAMPLES; i++)
        finalcolor += trace(spheres, &camray, sphere_count, &seed0, &seed1) * invSamples;

    /* store the pixelcolour in the output buffer */
    output[work_item_id] = finalcolor;
}

Since your program runs correctly on the CPU but not on the GPU, chances are you are exceeding the GPU's TDR (Timeout Detection and Recovery) timer.

One cause of an "Abort trap: 6" error during GPU compute is keeping the GPU locked in compute mode for too long (a commonly quoted value seems to be 5 seconds, though I have found sources that contradict that number). When that happens, the watchdog forcibly stops and restarts the graphics driver to prevent the screen from freezing.

There are a few possible solutions to this problem:

  1. Work on a headless machine

Most (if not all) OSes will not enforce the TDR timer if no screen is attached.

  2. Switch the GPU mode

If you are using an Nvidia Tesla GPU, you can check whether it can be switched to Tesla Compute Cluster mode. In that mode the TDR limit is not enforced. AMD GPUs may have a similar mode, but I am not sure.

  3. Change the TDR values

On Windows this can be done by editing the TdrDelay and TdrDdiDelay registry keys under HKEY_LOCAL_MACHINE -> SYSTEM -> CurrentControlSet -> Control -> GraphicsDrivers and setting them to higher values. Be careful not to set the numbers too high, or you will not be able to tell when the driver has actually crashed.
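
For example, a .reg file along these lines should raise both timeouts (the 60-second value is purely illustrative, not a recommendation; both entries are DWORDs measured in seconds):

Windows Registry Editor Version 5.00

; raise the GPU watchdog timeouts to 60 seconds (example value only)
[HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers]
"TdrDelay"=dword:0000003c
"TdrDdiDelay"=dword:0000003c

A reboot is typically needed before the new values take effect.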

Also note that graphics driver or Windows updates may reset these values to their defaults.

On Linux, TDR should already be disabled by default (I know it is on Ubuntu 18 and CentOS 8, but I have not tested other versions/distros). If you run into the problem anyway, you can add Option "Interactive" "0" to your Xorg configuration, as described in this SO question.
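
As a rough sketch, the relevant part of the Xorg configuration would look something like this (the identifier and driver name are placeholders; adapt them to your own setup):

Section "Device"
    Identifier "Device0"
    Driver     "nvidia"
    Option     "Interactive" "0"
EndSection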

Unfortunately, I don't know of (and could not find) a way to do this on macOS, but I do know that the limit is not enforced if a secondary GPU is installed in your macOS system.

  4. Split your work into smaller pieces

If you can manage to split your computation into smaller chunks, you may be able to stay under the TDR timer (for example, two kernel launches that each take 4 seconds instead of one that takes 8). Whether this is feasible depends on your problem, and it may or may not be easy to do.
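
For your renderer, one (untested) way to do this without touching the kernel would be to launch it several times over bands of rows, using a global work offset so that get_global_id(0) still maps to the correct pixel. rows_per_chunk below is a made-up tuning parameter; pick it so that a single launch comfortably finishes within the watchdog limit, and make sure each band size stays a multiple of local_work_size (or add a bounds check in the kernel).

// render the image in horizontal bands instead of one big launch (needs <algorithm> for std::min)
const std::size_t rows_per_chunk = 64; // illustrative value, tune to stay well under the TDR limit

for (std::size_t first_row = 0; first_row < image_height; first_row += rows_per_chunk)
{
    std::size_t rows = std::min<std::size_t>(rows_per_chunk, image_height - first_row);
    std::size_t offset = first_row * image_width; // global id of the first pixel in this band
    std::size_t band_size = rows * image_width;   // number of pixels rendered by this launch

    queue.enqueueNDRangeKernel(kernel, NDRange(offset), NDRange(band_size), NDRange(local_work_size));
    queue.finish(); // wait for the band to complete so the GPU is never blocked for too long at once
}

Another option in the same spirit is to lower SAMPLES in the kernel and accumulate several passes on the host, but that requires changing the kernel so it adds into the output buffer instead of overwriting it.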