正确创建 Optix 7.1 TLAS 实例加速结构
Proper creation of Optix 7.1 TLAS Instance Acceleration Structure
我正在尝试弄清楚如何正确构建 TLAS。
使用 OptiX 7.1 和 Ingo Wald's Optix 7 samples 附带的示例,从一个三角形开始(只是保存几何图形的 BLAS)并且工作正常(将 SDK 的三角形示例移至 Wald 的示例框架)。
接下来,我引入了一个带有一个实例的 TLAS(即之前的 BLAS),并在着色器中使用了该 TLAS,但我没有获得任何成功。
我做错了什么?
OptixTraversableHandle SampleRenderer::buildAccelerator() {
OptixTraversableHandle geometryAcceleratorHandle{ 0 };
CUdeviceptr dAcceleratorBuffer;
OptixAccelBuildOptions acceleratorOptions{};
acceleratorOptions.buildFlags = OPTIX_BUILD_FLAG_NONE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
acceleratorOptions.operation = OPTIX_BUILD_OPERATION_BUILD;
//Triangle build input: simple list of three vertices
const std::array<float3, 3> vertices{ { { 0.33f, 0.33f, 0.0f },{ 0.33f, -0.33f, 0.0f },{ 0.66f, 0.33f, 0.0f }} };
const size_t verticesSize = sizeof(float3) * vertices.size();
CUdeviceptr dVertices{ 0ull };
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dVertices), verticesSize));
CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dVertices), vertices.data(), verticesSize, cudaMemcpyHostToDevice));
//Build input is a simple list of non-indexed triangle vertices
const uint32_t triangleInputFlags{ OPTIX_GEOMETRY_FLAG_NONE };
OptixBuildInput triangleInput{};
triangleInput.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
triangleInput.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
triangleInput.triangleArray.numVertices = static_cast<uint32_t>(vertices.size());
triangleInput.triangleArray.vertexBuffers = &dVertices;
triangleInput.triangleArray.flags = &triangleInputFlags;
triangleInput.triangleArray.numSbtRecords = 1u;
OptixAccelBufferSizes blasBufferSizes;
OPTIX_CHECK(optixAccelComputeMemoryUsage(optixContext, &acceleratorOptions, &triangleInput, 1, &blasBufferSizes));
CUdeviceptr dTempBuffer;
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dTempBuffer), blasBufferSizes.tempSizeInBytes));
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dAcceleratorBuffer), blasBufferSizes.outputSizeInBytes));
OPTIX_CHECK(
optixAccelBuild(
optixContext,
0,
&acceleratorOptions,
&triangleInput,
1,
dTempBuffer,
blasBufferSizes.tempSizeInBytes,
dAcceleratorBuffer,
blasBufferSizes.outputSizeInBytes,
&geometryAcceleratorHandle,
nullptr,
0)
);
CUDA_CHECK(Free((void*)dTempBuffer));
CUDA_CHECK(Free((void*)dVertices));
return geometryAcceleratorHandle;
}
我没有使用上面函数的 return 值,而是将它提供给下面的 TLAS 创建函数,并在着色器中使用它的输出句柄:
OptixTraversableHandle SampleRenderer::buildInstanceAccelerator(const OptixTraversableHandle& geoHandle){
OptixInstance optixInstance = { { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f } };
optixInstance.flags = OPTIX_INSTANCE_FLAG_NONE;
optixInstance.instanceId = 0u;
optixInstance.sbtOffset = 0u;
optixInstance.visibilityMask = 1u;
optixInstance.traversableHandle = geoHandle;
CUdeviceptr dOptixInstance;
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dOptixInstance), sizeof(OptixInstance)));
CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dOptixInstance), &optixInstance, sizeof(OptixInstance), cudaMemcpyHostToDevice));
OptixAabb optixAabb[2]{
{ -1.5f, -1.0f, -0.5f,
-0.5f, 0.5f, 0.5f },
{ 0.5f, 0.0f, -0.01f,
1.5f, 1.5f, 0.01f } };
CUdeviceptr dAabb;
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dAabb), 2 * sizeof(OptixAabb)));
CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dAabb), optixAabb, 2 * sizeof(OptixAabb), cudaMemcpyHostToDevice));
OptixBuildInput instanceBuildInput{};
instanceBuildInput.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
instanceBuildInput.instanceArray.instances = dOptixInstance;
instanceBuildInput.instanceArray.numInstances = 1u;
instanceBuildInput.instanceArray.aabbs = dAabb;
instanceBuildInput.instanceArray.numAabbs =1u;
OptixAccelBuildOptions acceleratorBuildOptions{};
acceleratorBuildOptions.buildFlags = OPTIX_BUILD_FLAG_NONE;
acceleratorBuildOptions.operation = OPTIX_BUILD_OPERATION_BUILD;
OptixAccelBufferSizes acceleratorBufferSizes;
OPTIX_CHECK(optixAccelComputeMemoryUsage(
optixContext,
&acceleratorBuildOptions,
&instanceBuildInput,
1u,
&acceleratorBufferSizes));
CUdeviceptr dTempBuffer;
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dTempBuffer), acceleratorBufferSizes.tempSizeInBytes));
CUdeviceptr dInstanceAcceleratorBuffer;
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dInstanceAcceleratorBuffer), acceleratorBufferSizes.outputSizeInBytes));
OptixTraversableHandle instanceAcceleratorHandle{ 0 };
OPTIX_CHECK(optixAccelBuild(
optixContext,
0,
&acceleratorBuildOptions,
&instanceBuildInput,
1,
dTempBuffer,
acceleratorBufferSizes.tempSizeInBytes,
dInstanceAcceleratorBuffer,
acceleratorBufferSizes.outputSizeInBytes,
&instanceAcceleratorHandle,
nullptr,
0));
return instanceAcceleratorHandle;
}
参考着色器代码(当 OptixTraversableHandle 来自上面的第一个函数时它工作得很好:
namespace osc {
extern "C" __constant__ LaunchParams optixLaunchParams;
//Single ray type
enum { SURFACE_RAY_TYPE = 0, RAY_TYPE_COUNT };
static __forceinline__ __device__ void* unpackPointer(uint32_t i0, uint32_t i1) {
const uint64_t uptr = static_cast<uint64_t>(i0) << 32 | i1;
void* ptr = reinterpret_cast<void*>(uptr);
return ptr;
}
static __forceinline__ __device__ void packPointer(void* ptr, uint32_t& i0, uint32_t& i1) {
const uint64_t uptr = reinterpret_cast<uint64_t>(ptr);
i0 = uptr >> 32;
i1 = uptr & 0x00000000ffffffff;
}
template<typename T> static __forceinline__ __device__ T* getPRD() {
const uint32_t u0 = optixGetPayload_0();
const uint32_t u1 = optixGetPayload_1();
return reinterpret_cast<T*>(unpackPointer(u0, u1));
}
static __forceinline__ __device__ void trace(
OptixTraversableHandle handle,
vec3f ray_origin,
vec3f ray_direction,
float tmin,
float tmax,
float3* prd) {
unsigned int p0, p1, p2;
p0 = float_as_int(prd->x);
p1 = float_as_int(prd->y);
p2 = float_as_int(prd->z);
optixTrace(
handle,
ray_origin,
ray_direction,
tmin,
tmax,
0.0f, // rayTime
OptixVisibilityMask(1),
OPTIX_RAY_FLAG_DISABLE_ANYHIT,//OPTIX_RAY_FLAG_NONE,
0, // SBT offset
0, // SBT stride
0, // missSBTIndex
p0,
p1,
p2);
prd->x = int_as_float(p0);
prd->y = int_as_float(p1);
prd->z = int_as_float(p2);
}
static __forceinline__ __device__ void setPayload(float3 p) {
optixSetPayload_0(float_as_int(p.x));
optixSetPayload_1(float_as_int(p.y));
optixSetPayload_2(float_as_int(p.z));
}
static __forceinline__ __device__ float3 getPayload() {
return make_float3(int_as_float(optixGetPayload_0()), int_as_float(optixGetPayload_1()), int_as_float(optixGetPayload_2()));
}
extern "C" __global__ void __closesthit__radiance() {
//When built-in triangle intersection is used, a number of fundamental
//attributes are provided by the OptiX API, including barycentric coordinates
const float2 barycentricCoordinates = optixGetTriangleBarycentrics();
setPayload(make_float3(barycentricCoordinates.x, barycentricCoordinates.y, 1.f - barycentricCoordinates.x - barycentricCoordinates.y));
}
extern "C" __global__ void __anyhit__radiance() { }
extern "C" __global__ void __intersection__radiance() { }
extern "C" __global__ void __miss__radiance() {
MissData* missData = reinterpret_cast<MissData*>(optixGetSbtDataPointer());
float3 payload = getPayload();//Why???
setPayload(missData->backgroundColor);
}
extern "C" __global__ void __raygen__renderFrame() {
// compute a test pattern based on pixel ID
const int ix = optixGetLaunchIndex().x;
const int iy = optixGetLaunchIndex().y;
const auto& camera = optixLaunchParams.camera;
// our per-ray data for this example. what we initialize it to
// won't matter, since this value will be overwritten by either
// the miss or hit program, anyway
float3 pixelColorPRD = { 0.5f, 0.5f, 0.5f };
// normalized screen plane position, in [0,1]^2
const vec2f screen(vec2f(ix + .5f, iy + .5f) / vec2f(optixLaunchParams.frame.size));
// generate ray direction
vec3f rayDir = normalize(camera.direction + (screen.x - 0.5f) * camera.horizontal + (screen.y - 0.5f) * camera.vertical);
trace(optixLaunchParams.traversable,
camera.position,
rayDir,
0.f, // tmin
1e16f, // tmax
&pixelColorPRD);
const int r = int(255.99f * pixelColorPRD.x);
const int g = int(255.99f * pixelColorPRD.y);
const int b = int(255.99f * pixelColorPRD.z);
const uint32_t rgba = 0xff000000 | (r << 0) | (g << 8) | (b << 16);
const uint32_t fbIndex = ix + iy * optixLaunchParams.frame.size.x;
optixLaunchParams.frame.colorBuffer[fbIndex] = rgba;
}
}
上面的代码看起来很合理,所以首先猜测是检查创建上下文和管道的标志;特别是管道创建标志的实例级别标志:您最初从中复制和粘贴的示例没有使用实例化,因此几乎可以肯定其管道设置为 'no instancing'(因为如果它 实例化打开它会 期望 看到一个实例!)。
然而,如果您正在创建的 TLASes/BLASes 的实际实例级别与您用于创建管道的级别不匹配,您将找不到任何命中。
我还建议查看我最近的 OWL 项目中的一些代码,也在 github 上——它也执行实例化、多级实例、etcpp。
- OptixPipelineCompileOptions traversableGraphFlags 必须包含
旗帜 OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING
- optixPipelineSetStackSize() 中的 maxTraversableGraphDepth = 2
解决了问题。
我正在尝试弄清楚如何正确构建 TLAS。 使用 OptiX 7.1 和 Ingo Wald's Optix 7 samples 附带的示例,从一个三角形开始(只是保存几何图形的 BLAS)并且工作正常(将 SDK 的三角形示例移至 Wald 的示例框架)。 接下来,我引入了一个带有一个实例的 TLAS(即之前的 BLAS),并在着色器中使用了该 TLAS,但我没有获得任何成功。 我做错了什么?
OptixTraversableHandle SampleRenderer::buildAccelerator() {
OptixTraversableHandle geometryAcceleratorHandle{ 0 };
CUdeviceptr dAcceleratorBuffer;
OptixAccelBuildOptions acceleratorOptions{};
acceleratorOptions.buildFlags = OPTIX_BUILD_FLAG_NONE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
acceleratorOptions.operation = OPTIX_BUILD_OPERATION_BUILD;
//Triangle build input: simple list of three vertices
const std::array<float3, 3> vertices{ { { 0.33f, 0.33f, 0.0f },{ 0.33f, -0.33f, 0.0f },{ 0.66f, 0.33f, 0.0f }} };
const size_t verticesSize = sizeof(float3) * vertices.size();
CUdeviceptr dVertices{ 0ull };
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dVertices), verticesSize));
CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dVertices), vertices.data(), verticesSize, cudaMemcpyHostToDevice));
//Build input is a simple list of non-indexed triangle vertices
const uint32_t triangleInputFlags{ OPTIX_GEOMETRY_FLAG_NONE };
OptixBuildInput triangleInput{};
triangleInput.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
triangleInput.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
triangleInput.triangleArray.numVertices = static_cast<uint32_t>(vertices.size());
triangleInput.triangleArray.vertexBuffers = &dVertices;
triangleInput.triangleArray.flags = &triangleInputFlags;
triangleInput.triangleArray.numSbtRecords = 1u;
OptixAccelBufferSizes blasBufferSizes;
OPTIX_CHECK(optixAccelComputeMemoryUsage(optixContext, &acceleratorOptions, &triangleInput, 1, &blasBufferSizes));
CUdeviceptr dTempBuffer;
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dTempBuffer), blasBufferSizes.tempSizeInBytes));
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dAcceleratorBuffer), blasBufferSizes.outputSizeInBytes));
OPTIX_CHECK(
optixAccelBuild(
optixContext,
0,
&acceleratorOptions,
&triangleInput,
1,
dTempBuffer,
blasBufferSizes.tempSizeInBytes,
dAcceleratorBuffer,
blasBufferSizes.outputSizeInBytes,
&geometryAcceleratorHandle,
nullptr,
0)
);
CUDA_CHECK(Free((void*)dTempBuffer));
CUDA_CHECK(Free((void*)dVertices));
return geometryAcceleratorHandle;
}
我没有使用上面函数的 return 值,而是将它提供给下面的 TLAS 创建函数,并在着色器中使用它的输出句柄:
OptixTraversableHandle SampleRenderer::buildInstanceAccelerator(const OptixTraversableHandle& geoHandle){
OptixInstance optixInstance = { { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f } };
optixInstance.flags = OPTIX_INSTANCE_FLAG_NONE;
optixInstance.instanceId = 0u;
optixInstance.sbtOffset = 0u;
optixInstance.visibilityMask = 1u;
optixInstance.traversableHandle = geoHandle;
CUdeviceptr dOptixInstance;
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dOptixInstance), sizeof(OptixInstance)));
CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dOptixInstance), &optixInstance, sizeof(OptixInstance), cudaMemcpyHostToDevice));
OptixAabb optixAabb[2]{
{ -1.5f, -1.0f, -0.5f,
-0.5f, 0.5f, 0.5f },
{ 0.5f, 0.0f, -0.01f,
1.5f, 1.5f, 0.01f } };
CUdeviceptr dAabb;
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dAabb), 2 * sizeof(OptixAabb)));
CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dAabb), optixAabb, 2 * sizeof(OptixAabb), cudaMemcpyHostToDevice));
OptixBuildInput instanceBuildInput{};
instanceBuildInput.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
instanceBuildInput.instanceArray.instances = dOptixInstance;
instanceBuildInput.instanceArray.numInstances = 1u;
instanceBuildInput.instanceArray.aabbs = dAabb;
instanceBuildInput.instanceArray.numAabbs =1u;
OptixAccelBuildOptions acceleratorBuildOptions{};
acceleratorBuildOptions.buildFlags = OPTIX_BUILD_FLAG_NONE;
acceleratorBuildOptions.operation = OPTIX_BUILD_OPERATION_BUILD;
OptixAccelBufferSizes acceleratorBufferSizes;
OPTIX_CHECK(optixAccelComputeMemoryUsage(
optixContext,
&acceleratorBuildOptions,
&instanceBuildInput,
1u,
&acceleratorBufferSizes));
CUdeviceptr dTempBuffer;
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dTempBuffer), acceleratorBufferSizes.tempSizeInBytes));
CUdeviceptr dInstanceAcceleratorBuffer;
CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dInstanceAcceleratorBuffer), acceleratorBufferSizes.outputSizeInBytes));
OptixTraversableHandle instanceAcceleratorHandle{ 0 };
OPTIX_CHECK(optixAccelBuild(
optixContext,
0,
&acceleratorBuildOptions,
&instanceBuildInput,
1,
dTempBuffer,
acceleratorBufferSizes.tempSizeInBytes,
dInstanceAcceleratorBuffer,
acceleratorBufferSizes.outputSizeInBytes,
&instanceAcceleratorHandle,
nullptr,
0));
return instanceAcceleratorHandle;
}
参考着色器代码(当 OptixTraversableHandle 来自上面的第一个函数时它工作得很好:
namespace osc {
extern "C" __constant__ LaunchParams optixLaunchParams;
//Single ray type
enum { SURFACE_RAY_TYPE = 0, RAY_TYPE_COUNT };
static __forceinline__ __device__ void* unpackPointer(uint32_t i0, uint32_t i1) {
const uint64_t uptr = static_cast<uint64_t>(i0) << 32 | i1;
void* ptr = reinterpret_cast<void*>(uptr);
return ptr;
}
static __forceinline__ __device__ void packPointer(void* ptr, uint32_t& i0, uint32_t& i1) {
const uint64_t uptr = reinterpret_cast<uint64_t>(ptr);
i0 = uptr >> 32;
i1 = uptr & 0x00000000ffffffff;
}
template<typename T> static __forceinline__ __device__ T* getPRD() {
const uint32_t u0 = optixGetPayload_0();
const uint32_t u1 = optixGetPayload_1();
return reinterpret_cast<T*>(unpackPointer(u0, u1));
}
static __forceinline__ __device__ void trace(
OptixTraversableHandle handle,
vec3f ray_origin,
vec3f ray_direction,
float tmin,
float tmax,
float3* prd) {
unsigned int p0, p1, p2;
p0 = float_as_int(prd->x);
p1 = float_as_int(prd->y);
p2 = float_as_int(prd->z);
optixTrace(
handle,
ray_origin,
ray_direction,
tmin,
tmax,
0.0f, // rayTime
OptixVisibilityMask(1),
OPTIX_RAY_FLAG_DISABLE_ANYHIT,//OPTIX_RAY_FLAG_NONE,
0, // SBT offset
0, // SBT stride
0, // missSBTIndex
p0,
p1,
p2);
prd->x = int_as_float(p0);
prd->y = int_as_float(p1);
prd->z = int_as_float(p2);
}
static __forceinline__ __device__ void setPayload(float3 p) {
optixSetPayload_0(float_as_int(p.x));
optixSetPayload_1(float_as_int(p.y));
optixSetPayload_2(float_as_int(p.z));
}
static __forceinline__ __device__ float3 getPayload() {
return make_float3(int_as_float(optixGetPayload_0()), int_as_float(optixGetPayload_1()), int_as_float(optixGetPayload_2()));
}
extern "C" __global__ void __closesthit__radiance() {
//When built-in triangle intersection is used, a number of fundamental
//attributes are provided by the OptiX API, including barycentric coordinates
const float2 barycentricCoordinates = optixGetTriangleBarycentrics();
setPayload(make_float3(barycentricCoordinates.x, barycentricCoordinates.y, 1.f - barycentricCoordinates.x - barycentricCoordinates.y));
}
extern "C" __global__ void __anyhit__radiance() { }
extern "C" __global__ void __intersection__radiance() { }
extern "C" __global__ void __miss__radiance() {
MissData* missData = reinterpret_cast<MissData*>(optixGetSbtDataPointer());
float3 payload = getPayload();//Why???
setPayload(missData->backgroundColor);
}
extern "C" __global__ void __raygen__renderFrame() {
// compute a test pattern based on pixel ID
const int ix = optixGetLaunchIndex().x;
const int iy = optixGetLaunchIndex().y;
const auto& camera = optixLaunchParams.camera;
// our per-ray data for this example. what we initialize it to
// won't matter, since this value will be overwritten by either
// the miss or hit program, anyway
float3 pixelColorPRD = { 0.5f, 0.5f, 0.5f };
// normalized screen plane position, in [0,1]^2
const vec2f screen(vec2f(ix + .5f, iy + .5f) / vec2f(optixLaunchParams.frame.size));
// generate ray direction
vec3f rayDir = normalize(camera.direction + (screen.x - 0.5f) * camera.horizontal + (screen.y - 0.5f) * camera.vertical);
trace(optixLaunchParams.traversable,
camera.position,
rayDir,
0.f, // tmin
1e16f, // tmax
&pixelColorPRD);
const int r = int(255.99f * pixelColorPRD.x);
const int g = int(255.99f * pixelColorPRD.y);
const int b = int(255.99f * pixelColorPRD.z);
const uint32_t rgba = 0xff000000 | (r << 0) | (g << 8) | (b << 16);
const uint32_t fbIndex = ix + iy * optixLaunchParams.frame.size.x;
optixLaunchParams.frame.colorBuffer[fbIndex] = rgba;
}
}
上面的代码看起来很合理,所以首先猜测是检查创建上下文和管道的标志;特别是管道创建标志的实例级别标志:您最初从中复制和粘贴的示例没有使用实例化,因此几乎可以肯定其管道设置为 'no instancing'(因为如果它 实例化打开它会 期望 看到一个实例!)。
然而,如果您正在创建的 TLASes/BLASes 的实际实例级别与您用于创建管道的级别不匹配,您将找不到任何命中。
我还建议查看我最近的 OWL 项目中的一些代码,也在 github 上——它也执行实例化、多级实例、etcpp。
- OptixPipelineCompileOptions traversableGraphFlags 必须包含 旗帜 OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING
- optixPipelineSetStackSize() 中的 maxTraversableGraphDepth = 2
解决了问题。