光线追踪和 CUDA
Ray-tracing and CUDA
我有一个光线追踪模型,我向具有约 100k 个三角形面的网格对象发射 20k 条光线。
为了计算交点坐标,我根据 Moller-trumbore 算法 (https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm) 编写了这个函数:
void MT_intersection(std::vector<double> origin, std::vector<double> dir, std::vector<double> v0, std::vector<double> v1, std::vector<double> v2, std::vector<double> &int_point) {
double eps = 0.0000001;
std::vector<double> E1(3);
std::vector<double> E2(3);
std::vector<double> s(3);
for (int i = 0; i < 3; i++) {
E1[i] = v1[i] - v0[i];
E2[i] = v2[i] - v0[i];
s[i] = origin[i] - v0[i];
}
std::vector<double> h(3);
h[0] = dir[1] * E2[2] - dir[2] * E2[1];
h[1] = -(dir[0] * E2[2] - dir[2] * E2[0]);
h[2] = dir[0] * E2[1] - dir[1] * E2[0];
double a;
a = E1[0] * h[0] + E1[1] * h[1] + E1[2] * h[2];
if (a > -eps && a < eps) {
int_point[0] = false;
}
else {
double f = 1 / a;
double u;
u = f * (s[0] * h[0] + s[1] * h[1] + s[2] * h[2]);
if (u < 0 || u > 1) {
int_point[0] = false;
}
else {
std::vector<double> q(3);
q[0] = s[1] * E1[2] - s[2] * E1[1];
q[1] = -(s[0] * E1[2] - s[2] * E1[0]);
q[2] = s[0] * E1[1] - s[1] * E1[0];
double v;
v = f * (dir[0] * q[0] + dir[1] * q[1] + dir[2] * q[2]);
if (v < 0 || (u + v)>1) {
int_point[0] = false;
}
else {
double t;
t = f * (E2[0] * q[0] + E2[1] * q[1] + E2[2] * q[2]);
if (t > eps) {
for (int i = 0; i < 3; i++) {
int_point[i] = origin[i] + dir[i] * t;
}
}
}
}
}
}
我将光线的原点和方向以及 3 个具有三角形顶点坐标 (v0,v1,v2) 的向量作为输入。
然后我在一个 for 循环中使用这个函数,在另一个 for 循环中重复 ~100k(三角形数),重复 20k(光线数)。
由于这段代码非常慢(计算所有内容大约需要 2 天半),我想 运行 它与 Cuda 并行,希望减少这个时间。
由于我正在使用 Python,我正在使用 PyCuda,并且我尝试用我的 "MT_intersection" 函数编写一个 C 内核:
import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
from stl import mesh
my_mesh = mesh.Mesh.from_file('sfera1.stl')
n = my_mesh.normals
v0 = my_mesh.v0
v1 = my_mesh.v1
v2 = my_mesh.v2
mod = SourceModule("""
#include <math.h>
//#include <vector>
__global__ void intersect(float *origin,float *dir,float *v0,float *v1,float *v2,float *int_point_real)
{
using namespace std;
//#include <vector>
//#include <math.h>
int idx = threadIdx.x;
//a[idx] *= 2;
int count = 0;
//std::vector<double> v0_current(3);
float v0_current[3];
float v1_current[3];
float v2_current[3];
float dir_current[3] = {dir[idx][0],dir[idx][1],dir[idx][2]};
//std::vector<double> v1_current(3);
//std::vector<double> v2_current(3);
float int_point[3];
//std::vector<float> int_point(3);
//std::vector<std::vector<float>> int_pointS;
float int_pointS[2][3];
//std::vector<std::vector<double>> int_point;
//std::vector<int> int_faces;
int int_faces[2];
float dist[2];
//std::vector<float> dist;
int n_tri = 960;
for(int i = 0; i<n_tri; i++) {
for (int j = 0; j<3; j++){
v0_current[j] = v0[i][j];
v1_current[j] = v1[i][j];
v2_current[j] = v2[i][j];
}
double eps = 0.0000001;
//std::vector<float> E1(3);
float E1[3];
//std::vector<float> E2(3);
float E2[3];
//std::vector<float> s(3);
float s[3];
for (int j = 0; j < 3; j++) {
E1[j] = v1_current[j] - v0_current[j];
E2[j] = v2_current[j] - v0_current[j];
s[j] = origin[j] - v0_current[j];
}
//std::vector<float> h(3);
float h[3];
h[0] = dir[1] * E2[2] - dir[2] * E2[1];
h[1] = -(dir[0] * E2[2] - dir[2] * E2[0]);
h[2] = dir[0] * E2[1] - dir[1] * E2[0];
float a;
a = E1[0] * h[0] + E1[1] * h[1] + E1[2] * h[2];
if (a > -eps && a < eps) {
int_point[0] = false;
//return false;
}
else {
double f = 1 / a;
float u;
u = f * (s[0] * h[0] + s[1] * h[1] + s[2] * h[2]);
if (u < 0 || u > 1) {
int_point[0] = false;
//return false;
}
else {
//std::vector<float> q(3);
float q[3];
q[0] = s[1] * E1[2] - s[2] * E1[1];
q[1] = -(s[0] * E1[2] - s[2] * E1[0]);
q[2] = s[0] * E1[1] - s[1] * E1[0];
float v;
v = f * (dir[0] * q[0] + dir[1] * q[1] + dir[2] * q[2]);
if (v < 0 || (u + v)>1) {
int_point[0] = false;
//return false;
}
else {
float t;
t = f * (E2[0] * q[0] + E2[1] * q[1] + E2[2] * q[2]);
if (t > eps) {
for (int j = 0; j < 3; j++) {
int_point[j] = origin[j] + dir_current[j] * t;
}
//return t;
}
}
}
}
if (int_point[0] != false) {
count = count+1;
//int_faces.push_back(i);
int_faces[count-1] = i;
//dist.push_back(sqrt(pow((origin[0] - int_point[0]), 2) + pow((origin[1] - int_point[1]), 2) + pow((origin[2] - int_point[2]), 2)));
//dist.push_back(x);
dist[count-1] = sqrt(pow((origin[0] - int_point[0]), 2) + pow((origin[1] - int_point[1]), 2) + pow((origin[2] - int_point[2]), 2));
//int_pointS.push_back(int_point);
for (int j = 0; j<3; j++) {
int_pointS[count-1][j] = int_point[j];
}
}
}
double min = dist[0];
int ind_min = 0;
for (int i = 0; i < int_pointS.size(); i++){
if (min > dist[i]) {
min = dist[i];
ind_min = i;
}
}
//dist_real[Idx] = dist[ind_min];
//int_point_real_x[Idx] = int_pointS[ind_min][0];
//int_point_real_y[Idx] = int_pointS[ind_min][1];
//int_point_real_z[Idx] = int_pointS[ind_min][2];
int_point_real[Idx][0] = int_pointS[ind_min][0];
int_point_real[Idx][1] = int_pointS[ind_min][1];
int_point_real[Idx][2] = int_pointS[ind_min][2];
}
""")
origin = np.asarray([1, 1, 1]).astype(np.float32)
direction = np.ones((100, 3)).astype(np.float32)
int_point_real = np.zeros((100, 3)).astype(np.float32)
intersect = mod.get_function("intersect")
intersect(drv.In(origin), drv.In(direction), drv.In(v0), drv.In(v1), drv.In(v2), drv.Out(int_point_real), block=(512,1,1), grid=(64,1,1))
我的想法是运行 20k 条光线平行。
这个 Python 脚本给我不同的错误:
kernel.cu(18): error: expression must have pointer-to-object type
kernel.cu(18): error: expression must have pointer-to-object type
kernel.cu(18): error: expression must have pointer-to-object type
kernel.cu(34): error: expression must have pointer-to-object type
kernel.cu(35): error: expression must have pointer-to-object type
kernel.cu(36): error: expression must have pointer-to-object type
kernel.cu(108): error: expression must have class type
kernel.cu(118): error: expression must have pointer-to-object type
kernel.cu(119): error: expression must have pointer-to-object type
kernel.cu(120): error: expression must have pointer-to-object type
kernel.cu(27): warning: variable "int_faces" was set but never used
10 errors detected in the compilation of
"C:/Users/20180781/AppData/Local/Temp/tmpxft_00000d44_00000000-10_kernel.cpp1.ii".
]
知道为什么吗?
当我有很多光线和很多面时,有谁知道计算交点的更智能、更有效的方法吗?
似乎所有错误都是针对您尝试双重索引的行报告的。行号有点偏离,但从警告 kernel.cu(27): warning: variable "int_faces" was set but never used
可以推断出前几条错误消息指的是以下几行:
float dir_current[3] = {dir[idx][0],dir[idx][1],dir[idx][2]};
[...]
v0_current[j] = v0[i][j];
v1_current[j] = v1[i][j];
v2_current[j] = v2[i][j];
也是有道理的,因为dir
、v0
、v1
、v2
都定义为float *
。这只是一个浮点指针。您可以像 dir[idx]
那样对其进行索引一次,生成一个浮点数,但是再次对其进行索引(仅仅是一个浮点数)是没有意义的,例如 dir[idx][0]
.
在这一点之后,行号再次关闭,但接受我的假设,可以假设这些是最后 3 个有问题的行:
int_point_real[Idx][0] = int_pointS[ind_min][0];
int_point_real[Idx][1] = int_pointS[ind_min][1];
int_point_real[Idx][2] = int_pointS[ind_min][2];
事实上,int_point_real 也只是一个指向浮点数的指针。
另外,请注意尽管 int_pointS
在其他几行中被引用,但没有为它们报告错误,因为 that 变量被正确声明为二维数组(可以被索引两次)。
我有一个光线追踪模型,我向具有约 100k 个三角形面的网格对象发射 20k 条光线。
为了计算交点坐标,我根据 Moller-trumbore 算法 (https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm) 编写了这个函数:
void MT_intersection(std::vector<double> origin, std::vector<double> dir, std::vector<double> v0, std::vector<double> v1, std::vector<double> v2, std::vector<double> &int_point) {
double eps = 0.0000001;
std::vector<double> E1(3);
std::vector<double> E2(3);
std::vector<double> s(3);
for (int i = 0; i < 3; i++) {
E1[i] = v1[i] - v0[i];
E2[i] = v2[i] - v0[i];
s[i] = origin[i] - v0[i];
}
std::vector<double> h(3);
h[0] = dir[1] * E2[2] - dir[2] * E2[1];
h[1] = -(dir[0] * E2[2] - dir[2] * E2[0]);
h[2] = dir[0] * E2[1] - dir[1] * E2[0];
double a;
a = E1[0] * h[0] + E1[1] * h[1] + E1[2] * h[2];
if (a > -eps && a < eps) {
int_point[0] = false;
}
else {
double f = 1 / a;
double u;
u = f * (s[0] * h[0] + s[1] * h[1] + s[2] * h[2]);
if (u < 0 || u > 1) {
int_point[0] = false;
}
else {
std::vector<double> q(3);
q[0] = s[1] * E1[2] - s[2] * E1[1];
q[1] = -(s[0] * E1[2] - s[2] * E1[0]);
q[2] = s[0] * E1[1] - s[1] * E1[0];
double v;
v = f * (dir[0] * q[0] + dir[1] * q[1] + dir[2] * q[2]);
if (v < 0 || (u + v)>1) {
int_point[0] = false;
}
else {
double t;
t = f * (E2[0] * q[0] + E2[1] * q[1] + E2[2] * q[2]);
if (t > eps) {
for (int i = 0; i < 3; i++) {
int_point[i] = origin[i] + dir[i] * t;
}
}
}
}
}
}
我将光线的原点和方向以及 3 个具有三角形顶点坐标 (v0,v1,v2) 的向量作为输入。
然后我在一个 for 循环中使用这个函数,在另一个 for 循环中重复 ~100k(三角形数),重复 20k(光线数)。
由于这段代码非常慢(计算所有内容大约需要 2 天半),我想 运行 它与 Cuda 并行,希望减少这个时间。 由于我正在使用 Python,我正在使用 PyCuda,并且我尝试用我的 "MT_intersection" 函数编写一个 C 内核:
import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
from stl import mesh
my_mesh = mesh.Mesh.from_file('sfera1.stl')
n = my_mesh.normals
v0 = my_mesh.v0
v1 = my_mesh.v1
v2 = my_mesh.v2
mod = SourceModule("""
#include <math.h>
//#include <vector>
__global__ void intersect(float *origin,float *dir,float *v0,float *v1,float *v2,float *int_point_real)
{
using namespace std;
//#include <vector>
//#include <math.h>
int idx = threadIdx.x;
//a[idx] *= 2;
int count = 0;
//std::vector<double> v0_current(3);
float v0_current[3];
float v1_current[3];
float v2_current[3];
float dir_current[3] = {dir[idx][0],dir[idx][1],dir[idx][2]};
//std::vector<double> v1_current(3);
//std::vector<double> v2_current(3);
float int_point[3];
//std::vector<float> int_point(3);
//std::vector<std::vector<float>> int_pointS;
float int_pointS[2][3];
//std::vector<std::vector<double>> int_point;
//std::vector<int> int_faces;
int int_faces[2];
float dist[2];
//std::vector<float> dist;
int n_tri = 960;
for(int i = 0; i<n_tri; i++) {
for (int j = 0; j<3; j++){
v0_current[j] = v0[i][j];
v1_current[j] = v1[i][j];
v2_current[j] = v2[i][j];
}
double eps = 0.0000001;
//std::vector<float> E1(3);
float E1[3];
//std::vector<float> E2(3);
float E2[3];
//std::vector<float> s(3);
float s[3];
for (int j = 0; j < 3; j++) {
E1[j] = v1_current[j] - v0_current[j];
E2[j] = v2_current[j] - v0_current[j];
s[j] = origin[j] - v0_current[j];
}
//std::vector<float> h(3);
float h[3];
h[0] = dir[1] * E2[2] - dir[2] * E2[1];
h[1] = -(dir[0] * E2[2] - dir[2] * E2[0]);
h[2] = dir[0] * E2[1] - dir[1] * E2[0];
float a;
a = E1[0] * h[0] + E1[1] * h[1] + E1[2] * h[2];
if (a > -eps && a < eps) {
int_point[0] = false;
//return false;
}
else {
double f = 1 / a;
float u;
u = f * (s[0] * h[0] + s[1] * h[1] + s[2] * h[2]);
if (u < 0 || u > 1) {
int_point[0] = false;
//return false;
}
else {
//std::vector<float> q(3);
float q[3];
q[0] = s[1] * E1[2] - s[2] * E1[1];
q[1] = -(s[0] * E1[2] - s[2] * E1[0]);
q[2] = s[0] * E1[1] - s[1] * E1[0];
float v;
v = f * (dir[0] * q[0] + dir[1] * q[1] + dir[2] * q[2]);
if (v < 0 || (u + v)>1) {
int_point[0] = false;
//return false;
}
else {
float t;
t = f * (E2[0] * q[0] + E2[1] * q[1] + E2[2] * q[2]);
if (t > eps) {
for (int j = 0; j < 3; j++) {
int_point[j] = origin[j] + dir_current[j] * t;
}
//return t;
}
}
}
}
if (int_point[0] != false) {
count = count+1;
//int_faces.push_back(i);
int_faces[count-1] = i;
//dist.push_back(sqrt(pow((origin[0] - int_point[0]), 2) + pow((origin[1] - int_point[1]), 2) + pow((origin[2] - int_point[2]), 2)));
//dist.push_back(x);
dist[count-1] = sqrt(pow((origin[0] - int_point[0]), 2) + pow((origin[1] - int_point[1]), 2) + pow((origin[2] - int_point[2]), 2));
//int_pointS.push_back(int_point);
for (int j = 0; j<3; j++) {
int_pointS[count-1][j] = int_point[j];
}
}
}
double min = dist[0];
int ind_min = 0;
for (int i = 0; i < int_pointS.size(); i++){
if (min > dist[i]) {
min = dist[i];
ind_min = i;
}
}
//dist_real[Idx] = dist[ind_min];
//int_point_real_x[Idx] = int_pointS[ind_min][0];
//int_point_real_y[Idx] = int_pointS[ind_min][1];
//int_point_real_z[Idx] = int_pointS[ind_min][2];
int_point_real[Idx][0] = int_pointS[ind_min][0];
int_point_real[Idx][1] = int_pointS[ind_min][1];
int_point_real[Idx][2] = int_pointS[ind_min][2];
}
""")
origin = np.asarray([1, 1, 1]).astype(np.float32)
direction = np.ones((100, 3)).astype(np.float32)
int_point_real = np.zeros((100, 3)).astype(np.float32)
intersect = mod.get_function("intersect")
intersect(drv.In(origin), drv.In(direction), drv.In(v0), drv.In(v1), drv.In(v2), drv.Out(int_point_real), block=(512,1,1), grid=(64,1,1))
我的想法是运行 20k 条光线平行。 这个 Python 脚本给我不同的错误:
kernel.cu(18): error: expression must have pointer-to-object type
kernel.cu(18): error: expression must have pointer-to-object type
kernel.cu(18): error: expression must have pointer-to-object type
kernel.cu(34): error: expression must have pointer-to-object type
kernel.cu(35): error: expression must have pointer-to-object type
kernel.cu(36): error: expression must have pointer-to-object type
kernel.cu(108): error: expression must have class type
kernel.cu(118): error: expression must have pointer-to-object type
kernel.cu(119): error: expression must have pointer-to-object type
kernel.cu(120): error: expression must have pointer-to-object type
kernel.cu(27): warning: variable "int_faces" was set but never used
10 errors detected in the compilation of "C:/Users/20180781/AppData/Local/Temp/tmpxft_00000d44_00000000-10_kernel.cpp1.ii". ]
知道为什么吗?
当我有很多光线和很多面时,有谁知道计算交点的更智能、更有效的方法吗?
似乎所有错误都是针对您尝试双重索引的行报告的。行号有点偏离,但从警告 kernel.cu(27): warning: variable "int_faces" was set but never used
可以推断出前几条错误消息指的是以下几行:
float dir_current[3] = {dir[idx][0],dir[idx][1],dir[idx][2]};
[...]
v0_current[j] = v0[i][j];
v1_current[j] = v1[i][j];
v2_current[j] = v2[i][j];
也是有道理的,因为dir
、v0
、v1
、v2
都定义为float *
。这只是一个浮点指针。您可以像 dir[idx]
那样对其进行索引一次,生成一个浮点数,但是再次对其进行索引(仅仅是一个浮点数)是没有意义的,例如 dir[idx][0]
.
在这一点之后,行号再次关闭,但接受我的假设,可以假设这些是最后 3 个有问题的行:
int_point_real[Idx][0] = int_pointS[ind_min][0];
int_point_real[Idx][1] = int_pointS[ind_min][1];
int_point_real[Idx][2] = int_pointS[ind_min][2];
事实上,int_point_real 也只是一个指向浮点数的指针。
另外,请注意尽管 int_pointS
在其他几行中被引用,但没有为它们报告错误,因为 that 变量被正确声明为二维数组(可以被索引两次)。