OpenCL ray tracer works fine on CPU but not (always) on GPU
[See the edit below; the answer to the question is there.]
I am trying to learn OpenCL by studying a small ray tracer (see the code below, taken from this link).
I don't have a "real" GPU; I am currently working on a macOS laptop with an Intel(R) Iris(TM) Graphics 6100 graphics card.
The code runs fine on the CPU, but it behaves strangely on the GPU. Whether it works or not depends on the number of samples per pixel (the number of rays shot through a pixel and propagated through the scene to obtain its colour). With a small number of samples (64) I can get a 1280x720 picture, but with 128 samples I only get a smaller one. As far as I understand, the number of samples should not change anything (except, of course, the quality of the picture). Is there something purely OpenCL/GPU related that I am missing?
Also, it seems to be the retrieval of the results from the GPU's memory that crashes:
queue.enqueueReadBuffer(cl_output, CL_TRUE, 0, image_width * image_height * sizeof(cl_float4), cpu_output);
I get an "Abort trap: 6" at this stage.
I must be missing something.
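For reference, the program below does not check the return codes of the OpenCL calls, so the abort is all there is to see. A minimal sketch of how the underlying error codes could be surfaced is shown here; check() is a hypothetical helper written for this example, and the snippet assumes the same headers and using-declarations as main.cpp below:

/* minimal sketch: surface OpenCL error codes instead of relying on the abort */
/* check() is a hypothetical helper, not part of the original program */
static void check(cl_int err, const char* what){
    if (err != CL_SUCCESS){
        cerr << what << " failed with OpenCL error " << err << endl;
        exit(1);
    }
}
/* then, around the calls that seem to crash: */
check(queue.enqueueNDRangeKernel(kernel, cl::NullRange, global_work_size, local_work_size),
      "enqueueNDRangeKernel");
check(queue.finish(), "finish");
check(queue.enqueueReadBuffer(cl_output, CL_TRUE, 0,
      image_width * image_height * sizeof(cl_float3), cpu_output),
      "enqueueReadBuffer");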
[EDIT] After some research, I found an interesting clue: the graphics card may abort a task automatically when it takes too long, a behaviour meant to avoid "freezing" the screen. This topic discusses it.
What do you think?
I couldn't find a way to turn this behaviour off. Do you know how?
Here are the files:
main.cpp:
// OpenCL based simple sphere path tracer by Sam Lapere, 2016
// based on smallpt by Kevin Beason
// http://raytracey.blogspot.com
#include <iostream>
#include <fstream>
#include <vector>
#include <CL\cl.hpp>
using namespace std;
using namespace cl;
const int image_width = 1280;
const int image_height = 720;
cl_float4* cpu_output;
CommandQueue queue;
Device device;
Kernel kernel;
Context context;
Program program;
Buffer cl_output;
Buffer cl_spheres;
// dummy variables are required for memory alignment
// float3 is considered as float4 by OpenCL
struct Sphere
{
cl_float radius;
cl_float dummy1;
cl_float dummy2;
cl_float dummy3;
cl_float3 position;
cl_float3 color;
cl_float3 emission;
};
void pickPlatform(Platform& platform, const vector<Platform>& platforms){
if (platforms.size() == 1) platform = platforms[0];
else{
int input = 0;
cout << "\nChoose an OpenCL platform: ";
cin >> input;
// handle incorrect user input
while (input < 1 || input > platforms.size()){
cin.clear(); //clear errors/bad flags on cin
cin.ignore(cin.rdbuf()->in_avail(), '\n'); // ignores exact number of chars in cin buffer
cout << "No such option. Choose an OpenCL platform: ";
cin >> input;
}
platform = platforms[input - 1];
}
}
void pickDevice(Device& device, const vector<Device>& devices){
if (devices.size() == 1) device = devices[0];
else{
int input = 0;
cout << "\nChoose an OpenCL device: ";
cin >> input;
// handle incorrect user input
while (input < 1 || input > devices.size()){
cin.clear(); //clear errors/bad flags on cin
cin.ignore(cin.rdbuf()->in_avail(), '\n'); // ignores exact number of chars in cin buffer
cout << "No such option. Choose an OpenCL device: ";
cin >> input;
}
device = devices[input - 1];
}
}
void printErrorLog(const Program& program, const Device& device){
// Get the error log and print to console
string buildlog = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
cerr << "Build log:" << std::endl << buildlog << std::endl;
// Print the error log to a file
FILE *log = fopen("errorlog.txt", "w");
fprintf(log, "%s\n", buildlog);
cout << "Error log saved in 'errorlog.txt'" << endl;
system("PAUSE");
exit(1);
}
void initOpenCL()
{
// Get all available OpenCL platforms (e.g. AMD OpenCL, Nvidia CUDA, Intel OpenCL)
vector<Platform> platforms;
Platform::get(&platforms);
cout << "Available OpenCL platforms : " << endl << endl;
for (int i = 0; i < platforms.size(); i++)
cout << "\t" << i + 1 << ": " << platforms[i].getInfo<CL_PLATFORM_NAME>() << endl;
// Pick one platform
Platform platform;
pickPlatform(platform, platforms);
cout << "\nUsing OpenCL platform: \t" << platform.getInfo<CL_PLATFORM_NAME>() << endl;
// Get available OpenCL devices on platform
vector<Device> devices;
platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
cout << "Available OpenCL devices on this platform: " << endl << endl;
for (int i = 0; i < devices.size(); i++){
cout << "\t" << i + 1 << ": " << devices[i].getInfo<CL_DEVICE_NAME>() << endl;
cout << "\t\tMax compute units: " << devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << endl;
cout << "\t\tMax work group size: " << devices[i].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << endl << endl;
}
// Pick one device
pickDevice(device, devices);
cout << "\nUsing OpenCL device: \t" << device.getInfo<CL_DEVICE_NAME>() << endl;
cout << "\t\t\tMax compute units: " << device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << endl;
cout << "\t\t\tMax work group size: " << device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() << endl;
// Create an OpenCL context and command queue on that device.
context = Context(device);
queue = CommandQueue(context, device);
// Convert the OpenCL source code to a string
string source;
ifstream file("opencl_kernel.cl");
if (!file){
cout << "\nNo OpenCL file found!" << endl << "Exiting..." << endl;
system("PAUSE");
exit(1);
}
while (!file.eof()){
char line[256];
file.getline(line, 255);
source += line;
}
const char* kernel_source = source.c_str();
// Create an OpenCL program by performing runtime source compilation for the chosen device
program = Program(context, kernel_source);
cl_int result = program.build({ device });
if (result) cout << "Error during compilation OpenCL code!!!\n (" << result << ")" << endl;
if (result == CL_BUILD_PROGRAM_FAILURE) printErrorLog(program, device);
// Create a kernel (entry point in the OpenCL source program)
kernel = Kernel(program, "render_kernel");
}
void cleanUp(){
delete cpu_output;
}
inline float clamp(float x){ return x < 0.0f ? 0.0f : x > 1.0f ? 1.0f : x; }
// convert RGB float in range [0,1] to int in range [0, 255] and perform gamma correction
inline int toInt(float x){ return int(clamp(x) * 255 + .5); }
void saveImage(){
// write image to PPM file, a very simple image file format
// PPM files can be opened with IrfanView (download at www.irfanview.com) or GIMP
FILE *f = fopen("opencl_raytracer.ppm", "w");
fprintf(f, "P3\n%d %d\n%d\n", image_width, image_height, 255);
// loop over all pixels, write RGB values
for (int i = 0; i < image_width * image_height; i++)
fprintf(f, "%d %d %d ",
toInt(cpu_output[i].s[0]),
toInt(cpu_output[i].s[1]),
toInt(cpu_output[i].s[2]));
}
#define float3(x, y, z) {{x, y, z}} // macro to replace ugly initializer braces
void initScene(Sphere* cpu_spheres){
// left wall
cpu_spheres[0].radius = 200.0f;
cpu_spheres[0].position = float3(-200.6f, 0.0f, 0.0f);
cpu_spheres[0].color = float3(0.75f, 0.25f, 0.25f);
cpu_spheres[0].emission = float3(0.0f, 0.0f, 0.0f);
// right wall
cpu_spheres[1].radius = 200.0f;
cpu_spheres[1].position = float3(200.6f, 0.0f, 0.0f);
cpu_spheres[1].color = float3(0.25f, 0.25f, 0.75f);
cpu_spheres[1].emission = float3(0.0f, 0.0f, 0.0f);
// floor
cpu_spheres[2].radius = 200.0f;
cpu_spheres[2].position = float3(0.0f, -200.4f, 0.0f);
cpu_spheres[2].color = float3(0.9f, 0.8f, 0.7f);
cpu_spheres[2].emission = float3(0.0f, 0.0f, 0.0f);
// ceiling
cpu_spheres[3].radius = 200.0f;
cpu_spheres[3].position = float3(0.0f, 200.4f, 0.0f);
cpu_spheres[3].color = float3(0.9f, 0.8f, 0.7f);
cpu_spheres[3].emission = float3(0.0f, 0.0f, 0.0f);
// back wall
cpu_spheres[4].radius = 200.0f;
cpu_spheres[4].position = float3(0.0f, 0.0f, -200.4f);
cpu_spheres[4].color = float3(0.9f, 0.8f, 0.7f);
cpu_spheres[4].emission = float3(0.0f, 0.0f, 0.0f);
// front wall
cpu_spheres[5].radius = 200.0f;
cpu_spheres[5].position = float3(0.0f, 0.0f, 202.0f);
cpu_spheres[5].color = float3(0.9f, 0.8f, 0.7f);
cpu_spheres[5].emission = float3(0.0f, 0.0f, 0.0f);
// left sphere
cpu_spheres[6].radius = 0.16f;
cpu_spheres[6].position = float3(-0.25f, -0.24f, -0.1f);
cpu_spheres[6].color = float3(0.9f, 0.8f, 0.7f);
cpu_spheres[6].emission = float3(0.0f, 0.0f, 0.0f);
// right sphere
cpu_spheres[7].radius = 0.16f;
cpu_spheres[7].position = float3(0.25f, -0.24f, 0.1f);
cpu_spheres[7].color = float3(0.9f, 0.8f, 0.7f);
cpu_spheres[7].emission = float3(0.0f, 0.0f, 0.0f);
// lightsource
cpu_spheres[8].radius = 1.0f;
cpu_spheres[8].position = float3(0.0f, 1.36f, 0.0f);
cpu_spheres[8].color = float3(0.0f, 0.0f, 0.0f);
cpu_spheres[8].emission = float3(9.0f, 8.0f, 6.0f);
}
void main(){
// initialise OpenCL
initOpenCL();
// allocate memory on CPU to hold the rendered image
cpu_output = new cl_float3[image_width * image_height];
// initialise scene
const int sphere_count = 9;
Sphere cpu_spheres[sphere_count];
initScene(cpu_spheres);
// Create buffers on the OpenCL device for the image and the scene
cl_output = Buffer(context, CL_MEM_WRITE_ONLY, image_width * image_height * sizeof(cl_float3));
cl_spheres = Buffer(context, CL_MEM_READ_ONLY, sphere_count * sizeof(Sphere));
queue.enqueueWriteBuffer(cl_spheres, CL_TRUE, 0, sphere_count * sizeof(Sphere), cpu_spheres);
// specify OpenCL kernel arguments
kernel.setArg(0, cl_spheres);
kernel.setArg(1, image_width);
kernel.setArg(2, image_height);
kernel.setArg(3, sphere_count);
kernel.setArg(4, cl_output);
// every pixel in the image has its own thread or "work item",
// so the total amount of work items equals the number of pixels
std::size_t global_work_size = image_width * image_height;
std::size_t local_work_size = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device);
cout << "Kernel work group size: " << local_work_size << endl;
// Ensure the global work size is a multiple of local work size
if (global_work_size % local_work_size != 0)
global_work_size = (global_work_size / local_work_size + 1) * local_work_size;
cout << "Rendering started..." << endl;
// launch the kernel
queue.enqueueNDRangeKernel(kernel, NULL, global_work_size, local_work_size);
queue.finish();
cout << "Rendering done! \nCopying output from device to host" << endl;
// read and copy OpenCL output to CPU
queue.enqueueReadBuffer(cl_output, CL_TRUE, 0, image_width * image_height * sizeof(cl_float3), cpu_output);
// save image
saveImage();
cout << "Saved image to 'opencl_raytracer.ppm'" << endl;
// release memory
cleanUp();
system("PAUSE");
}
opencl_kernel.cl:
/* OpenCL based simple sphere path tracer by Sam Lapere, 2016*/
/* based on smallpt by Kevin Beason */
/* http://raytracey.blogspot.com */
__constant float EPSILON = 0.00003f; /* required to compensate for limited float precision */
__constant float PI = 3.14159265359f;
__constant int SAMPLES = 128;
typedef struct Ray{
float3 origin;
float3 dir;
} Ray;
typedef struct Sphere{
float radius;
float3 pos;
float3 color;
float3 emission;
} Sphere;
static float get_random(unsigned int *seed0, unsigned int *seed1) {
/* hash the seeds using bitwise AND operations and bitshifts */
*seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
*seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);
unsigned int ires = ((*seed0) << 16) + (*seed1);
/* use union struct to convert int to float */
union {
float f;
unsigned int ui;
} res;
res.ui = (ires & 0x007fffff) | 0x40000000; /* bitwise AND, bitwise OR */
return (res.f - 2.0f) / 2.0f;
}
Ray createCamRay(const int x_coord, const int y_coord, const int width, const int height){
float fx = (float)x_coord / (float)width; /* convert int in range [0 - width] to float in range [0-1] */
float fy = (float)y_coord / (float)height; /* convert int in range [0 - height] to float in range [0-1] */
/* calculate aspect ratio */
float aspect_ratio = (float)(width) / (float)(height);
float fx2 = (fx - 0.5f) * aspect_ratio;
float fy2 = fy - 0.5f;
/* determine position of pixel on screen */
float3 pixel_pos = (float3)(fx2, -fy2, 0.0f);
/* create camera ray*/
Ray ray;
ray.origin = (float3)(0.0f, 0.1f, 2.0f); /* fixed camera position */
ray.dir = normalize(pixel_pos - ray.origin); /* vector from camera to pixel on screen */
return ray;
}
/* (__global Sphere* sphere, const Ray* ray) */
float intersect_sphere(const Sphere* sphere, const Ray* ray) /* version using local copy of sphere */
{
float3 rayToCenter = sphere->pos - ray->origin;
float b = dot(rayToCenter, ray->dir);
float c = dot(rayToCenter, rayToCenter) - sphere->radius*sphere->radius;
float disc = b * b - c;
if (disc < 0.0f) return 0.0f;
else disc = sqrt(disc);
if ((b - disc) > EPSILON) return b - disc;
if ((b + disc) > EPSILON) return b + disc;
return 0.0f;
}
bool intersect_scene(__constant Sphere* spheres, const Ray* ray, float* t, int* sphere_id, const int sphere_count)
{
/* initialise t to a very large number,
so t will be guaranteed to be smaller
when a hit with the scene occurs */
float inf = 1e20f;
*t = inf;
/* check if the ray intersects each sphere in the scene */
for (int i = 0; i < sphere_count; i++) {
Sphere sphere = spheres[i]; /* create local copy of sphere */
/* float hitdistance = intersect_sphere(&spheres[i], ray); */
float hitdistance = intersect_sphere(&sphere, ray);
/* keep track of the closest intersection and hitobject found so far */
if (hitdistance != 0.0f && hitdistance < *t) {
*t = hitdistance;
*sphere_id = i;
}
}
return *t < inf; /* true when ray interesects the scene */
}
/* the path tracing function */
/* computes a path (starting from the camera) with a defined number of bounces, accumulates light/color at each bounce */
/* each ray hitting a surface will be reflected in a random direction (by randomly sampling the hemisphere above the hitpoint) */
/* small optimisation: diffuse ray directions are calculated using cosine weighted importance sampling */
float3 trace(__constant Sphere* spheres, const Ray* camray, const int sphere_count, const int* seed0, const int* seed1){
Ray ray = *camray;
float3 accum_color = (float3)(0.0f, 0.0f, 0.0f);
float3 mask = (float3)(1.0f, 1.0f, 1.0f);
for (int bounces = 0; bounces < 8; bounces++){
float t; /* distance to intersection */
int hitsphere_id = 0; /* index of intersected sphere */
/* if ray misses scene, return background colour */
if (!intersect_scene(spheres, &ray, &t, &hitsphere_id, sphere_count))
return accum_color += mask * (float3)(0.15f, 0.15f, 0.25f);
/* else, we've got a hit! Fetch the closest hit sphere */
Sphere hitsphere = spheres[hitsphere_id]; /* version with local copy of sphere */
/* compute the hitpoint using the ray equation */
float3 hitpoint = ray.origin + ray.dir * t;
/* compute the surface normal and flip it if necessary to face the incoming ray */
float3 normal = normalize(hitpoint - hitsphere.pos);
float3 normal_facing = dot(normal, ray.dir) < 0.0f ? normal : normal * (-1.0f);
/* compute two random numbers to pick a random point on the hemisphere above the hitpoint*/
float rand1 = 2.0f * PI * get_random(seed0, seed1);
float rand2 = get_random(seed0, seed1);
float rand2s = sqrt(rand2);
/* create a local orthogonal coordinate frame centered at the hitpoint */
float3 w = normal_facing;
float3 axis = fabs(w.x) > 0.1f ? (float3)(0.0f, 1.0f, 0.0f) : (float3)(1.0f, 0.0f, 0.0f);
float3 u = normalize(cross(axis, w));
float3 v = cross(w, u);
/* use the coordinte frame and random numbers to compute the next ray direction */
float3 newdir = normalize(u * cos(rand1)*rand2s + v*sin(rand1)*rand2s + w*sqrt(1.0f - rand2));
/* add a very small offset to the hitpoint to prevent self intersection */
ray.origin = hitpoint + normal_facing * EPSILON;
ray.dir = newdir;
/* add the colour and light contributions to the accumulated colour */
accum_color += mask * hitsphere.emission;
/* the mask colour picks up surface colours at each bounce */
mask *= hitsphere.color;
/* perform cosine-weighted importance sampling for diffuse surfaces*/
mask *= dot(newdir, normal_facing);
}
return accum_color;
}
__kernel void render_kernel(__constant Sphere* spheres, const int width, const int height, const int sphere_count, __global float3* output){
unsigned int work_item_id = get_global_id(0); /* the unique global id of the work item for the current pixel */
unsigned int x_coord = work_item_id % width; /* x-coordinate of the pixel */
unsigned int y_coord = work_item_id / width; /* y-coordinate of the pixel */
/* seeds for random number generator */
unsigned int seed0 = x_coord;
unsigned int seed1 = y_coord;
Ray camray = createCamRay(x_coord, y_coord, width, height);
/* add the light contribution of each sample and average over all samples*/
float3 finalcolor = (float3)(0.0f, 0.0f, 0.0f);
float invSamples = 1.0f / SAMPLES;
for (int i = 0; i < SAMPLES; i++)
finalcolor += trace(spheres, &camray, sphere_count, &seed0, &seed1) * invSamples;
/* store the pixelcolour in the output buffer */
output[work_item_id] = finalcolor;
}
Since your program runs correctly on the CPU but not on the GPU, you are most likely exceeding the GPU's TDR (Timeout Detection and Recovery) timer.
One cause of an "Abort trap: 6" error when computing on the GPU is keeping the GPU locked in compute mode for too long (a common value seems to be around 5 seconds, but I have found sources that contradict that number). When this happens, a watchdog forcibly stops and restarts the graphics driver to prevent the screen from freezing.
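One way to check whether you are getting close to such a limit is to time the kernel with OpenCL profiling events. The snippet below is only a sketch using the same cl.hpp wrapper and variable names as the code in the question; the 4-second threshold is an assumed margin under a typical ~5 second watchdog, not an official value:

// sketch: time the kernel with OpenCL profiling events
cl::CommandQueue profQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
cl::Event evt;
profQueue.enqueueNDRangeKernel(kernel, cl::NullRange,
                               cl::NDRange(global_work_size),
                               cl::NDRange(local_work_size),
                               NULL, &evt);
evt.wait();
cl_ulong t0 = evt.getProfilingInfo<CL_PROFILING_COMMAND_START>();
cl_ulong t1 = evt.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double seconds = (t1 - t0) * 1e-9;      /* profiling timestamps are in nanoseconds */
cout << "Kernel execution took " << seconds << " s" << endl;
if (seconds > 4.0)                      /* assumed safety margin below a ~5 s watchdog */
    cout << "Warning: close to a typical GPU watchdog limit" << endl;

Running this with a lower SAMPLES value that still completes gives you a rough idea of how much headroom the full 128-sample launch would have.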
There are several possible solutions to this problem:
- Work on a headless machine
Most (if not all) operating systems do not enforce the TDR when no screen is attached.
- Switch the GPU mode
If you are using an Nvidia Tesla GPU, you can check whether it can be switched to Tesla Compute Cluster mode; in that mode the TDR limit is not enforced. AMD GPUs may have a similar mode, but I am not sure.
- Change the TDR value
On Windows this can be done by editing the TdrDelay and TdrDdiDelay registry keys under HKEY_LOCAL_MACHINE -> SYSTEM -> CurrentControlSet -> Control -> GraphicsDrivers to higher values. Be careful not to set the numbers too high, or you will not be able to tell whether the driver has actually crashed. Also note that graphics driver or Windows updates may reset these values to their defaults.
Under Linux, TDR should already be disabled by default (I know it is under Ubuntu 18 and CentOS 8, but I have not tested other versions/distros). If you still have problems, you can add Option "Interactive" "0" to your Xorg configuration, as described in this SO question.
Unfortunately, I do not know of (and could not find) a way to do this on macOS, but I do know that the limit is not enforced if you have a secondary GPU installed in your macOS system.
- Split your work into smaller chunks
If you can manage to divide your computation into smaller pieces, you may be able to stay under the TDR timer (e.g. two computations of 4 seconds each instead of one 8-second computation). How feasible this is depends on your problem, and it may or may not be easy to do; see the sketch after this list.
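As an illustration of that last point, here is a rough sketch based on the host code in the question that replaces the single full-frame launch with several launches of one band of rows each. It assumes the device supports global work offsets (OpenCL 1.1 and later), in which case get_global_id(0) already includes the offset and the kernel does not need to change; the value of rows_per_chunk is an arbitrary choice for illustration:

// sketch: launch the kernel in bands of rows so each launch finishes quickly
const std::size_t rows_per_chunk = 90;                 /* 720 / 90 = 8 launches; tune as needed */
for (std::size_t row = 0; row < (std::size_t)image_height; row += rows_per_chunk){
    std::size_t offset = row * image_width;            /* first work item of this band */
    std::size_t count  = rows_per_chunk * image_width; /* number of work items in this band */
    queue.enqueueNDRangeKernel(kernel,
                               cl::NDRange(offset),    /* global work offset */
                               cl::NDRange(count),
                               cl::NullRange);         /* let the runtime pick the local size */
    queue.finish();                                    /* give the GPU a break between bands */
}
queue.enqueueReadBuffer(cl_output, CL_TRUE, 0,
                        image_width * image_height * sizeof(cl_float3), cpu_output);

Another option would be to split the SAMPLES loop across several launches instead, but that requires changing the kernel so that it accumulates into the output buffer rather than overwriting it.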