正确创建 Optix 7.1 TLAS 实例加速结构

Proper creation of Optix 7.1 TLAS Instance Acceleration Structure

我正在尝试弄清楚如何正确构建 TLAS。 使用 OptiX 7.1 和 Ingo Wald's Optix 7 samples 附带的示例,从一个三角形开始(只是保存几何图形的 BLAS)并且工作正常(将 SDK 的三角形示例移至 Wald 的示例框架)。 接下来,我引入了一个带有一个实例的 TLAS(即之前的 BLAS),并在着色器中使用了该 TLAS,但我没有获得任何成功。 我做错了什么?

OptixTraversableHandle SampleRenderer::buildAccelerator() {

    OptixTraversableHandle geometryAcceleratorHandle{ 0 };
    CUdeviceptr dAcceleratorBuffer;
    OptixAccelBuildOptions acceleratorOptions{};
    acceleratorOptions.buildFlags = OPTIX_BUILD_FLAG_NONE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
    acceleratorOptions.operation = OPTIX_BUILD_OPERATION_BUILD;

    //Triangle build input: simple list of three vertices
    const std::array<float3, 3> vertices{ { { 0.33f, 0.33f, 0.0f },{  0.33f, -0.33f, 0.0f },{  0.66f,  0.33f, 0.0f }} };
    const size_t verticesSize = sizeof(float3) * vertices.size();
    CUdeviceptr dVertices{ 0ull };
    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dVertices), verticesSize));
    CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dVertices), vertices.data(), verticesSize, cudaMemcpyHostToDevice));

    //Build input is a simple list of non-indexed triangle vertices
    const uint32_t triangleInputFlags{ OPTIX_GEOMETRY_FLAG_NONE };
    OptixBuildInput triangleInput{};
    triangleInput.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
    triangleInput.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
    triangleInput.triangleArray.numVertices = static_cast<uint32_t>(vertices.size());
    triangleInput.triangleArray.vertexBuffers = &dVertices;
    triangleInput.triangleArray.flags = &triangleInputFlags;
    triangleInput.triangleArray.numSbtRecords = 1u;

    OptixAccelBufferSizes blasBufferSizes;
    OPTIX_CHECK(optixAccelComputeMemoryUsage(optixContext, &acceleratorOptions, &triangleInput, 1, &blasBufferSizes));
    CUdeviceptr dTempBuffer;
    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dTempBuffer), blasBufferSizes.tempSizeInBytes));

    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dAcceleratorBuffer), blasBufferSizes.outputSizeInBytes));

    OPTIX_CHECK(
        optixAccelBuild(
            optixContext,
            0,
            &acceleratorOptions,
            &triangleInput,
            1,
            dTempBuffer,
            blasBufferSizes.tempSizeInBytes,
            dAcceleratorBuffer,
            blasBufferSizes.outputSizeInBytes,
            &geometryAcceleratorHandle,
            nullptr,
            0)
        );

    CUDA_CHECK(Free((void*)dTempBuffer));
    CUDA_CHECK(Free((void*)dVertices));

    return geometryAcceleratorHandle;
}

我没有使用上面函数的 return 值,而是将它提供给下面的 TLAS 创建函数,并在着色器中使用它的输出句柄:

OptixTraversableHandle SampleRenderer::buildInstanceAccelerator(const OptixTraversableHandle& geoHandle){
    OptixInstance optixInstance = { { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f } };
    optixInstance.flags = OPTIX_INSTANCE_FLAG_NONE;
    optixInstance.instanceId = 0u;
    optixInstance.sbtOffset = 0u;
    optixInstance.visibilityMask = 1u;
    optixInstance.traversableHandle = geoHandle;
    CUdeviceptr dOptixInstance;
    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dOptixInstance), sizeof(OptixInstance)));
    CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dOptixInstance), &optixInstance, sizeof(OptixInstance), cudaMemcpyHostToDevice));

    OptixAabb optixAabb[2]{
        { -1.5f, -1.0f, -0.5f,
          -0.5f,  0.5f,  0.5f  },
        {  0.5f,  0.0f, -0.01f,
           1.5f,  1.5f,  0.01f } };
    CUdeviceptr  dAabb;
    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dAabb), 2 * sizeof(OptixAabb)));
    CUDA_CHECK(Memcpy(reinterpret_cast<void*>(dAabb), optixAabb, 2 * sizeof(OptixAabb), cudaMemcpyHostToDevice));
    OptixBuildInput instanceBuildInput{};
    instanceBuildInput.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
    instanceBuildInput.instanceArray.instances = dOptixInstance;
    instanceBuildInput.instanceArray.numInstances = 1u;
    instanceBuildInput.instanceArray.aabbs = dAabb;
    instanceBuildInput.instanceArray.numAabbs =1u;

    OptixAccelBuildOptions acceleratorBuildOptions{};
    acceleratorBuildOptions.buildFlags = OPTIX_BUILD_FLAG_NONE;
    acceleratorBuildOptions.operation = OPTIX_BUILD_OPERATION_BUILD;

    OptixAccelBufferSizes acceleratorBufferSizes;
    OPTIX_CHECK(optixAccelComputeMemoryUsage(
        optixContext,
        &acceleratorBuildOptions,
        &instanceBuildInput,
        1u,
        &acceleratorBufferSizes));
    CUdeviceptr dTempBuffer;
    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dTempBuffer), acceleratorBufferSizes.tempSizeInBytes));
    CUdeviceptr dInstanceAcceleratorBuffer;
    CUDA_CHECK(Malloc(reinterpret_cast<void**>(&dInstanceAcceleratorBuffer), acceleratorBufferSizes.outputSizeInBytes));
    
    OptixTraversableHandle instanceAcceleratorHandle{ 0 };
    OPTIX_CHECK(optixAccelBuild(
        optixContext,
        0,
        &acceleratorBuildOptions,
        &instanceBuildInput,
        1,
        dTempBuffer,
        acceleratorBufferSizes.tempSizeInBytes,
        dInstanceAcceleratorBuffer,
        acceleratorBufferSizes.outputSizeInBytes,
        &instanceAcceleratorHandle,
        nullptr,
        0));
    return instanceAcceleratorHandle;
}

参考着色器代码(当 OptixTraversableHandle 来自上面的第一个函数时它工作得很好:

namespace osc {

extern "C" __constant__ LaunchParams optixLaunchParams;
//Single ray type
enum { SURFACE_RAY_TYPE = 0, RAY_TYPE_COUNT };

static __forceinline__ __device__ void* unpackPointer(uint32_t i0, uint32_t i1) {
    const uint64_t uptr = static_cast<uint64_t>(i0) << 32 | i1;
    void* ptr = reinterpret_cast<void*>(uptr);
    return ptr;
}

static __forceinline__ __device__ void  packPointer(void* ptr, uint32_t& i0, uint32_t& i1) {
    const uint64_t uptr = reinterpret_cast<uint64_t>(ptr);
    i0 = uptr >> 32;
    i1 = uptr & 0x00000000ffffffff;
}

template<typename T>    static __forceinline__ __device__ T* getPRD() {
    const uint32_t u0 = optixGetPayload_0();
    const uint32_t u1 = optixGetPayload_1();
    return reinterpret_cast<T*>(unpackPointer(u0, u1));
}

static __forceinline__ __device__ void trace(
    OptixTraversableHandle handle,
    vec3f                 ray_origin,
    vec3f                 ray_direction,
    float                  tmin,
    float                  tmax,
    float3* prd) {
    unsigned int p0, p1, p2;
    p0 = float_as_int(prd->x);
    p1 = float_as_int(prd->y);
    p2 = float_as_int(prd->z);
    optixTrace(
        handle,
        ray_origin,
        ray_direction,
        tmin,
        tmax,
        0.0f,                // rayTime
        OptixVisibilityMask(1),
        OPTIX_RAY_FLAG_DISABLE_ANYHIT,//OPTIX_RAY_FLAG_NONE,
        0,                   // SBT offset
        0,                   // SBT stride
        0,                   // missSBTIndex
        p0, 
        p1, 
        p2);
    prd->x = int_as_float(p0);
    prd->y = int_as_float(p1);
    prd->z = int_as_float(p2);
}

static __forceinline__ __device__ void setPayload(float3 p) {
    optixSetPayload_0(float_as_int(p.x));
    optixSetPayload_1(float_as_int(p.y));
    optixSetPayload_2(float_as_int(p.z));
}

static __forceinline__ __device__ float3 getPayload() {
    return make_float3(int_as_float(optixGetPayload_0()), int_as_float(optixGetPayload_1()), int_as_float(optixGetPayload_2()));
}

extern "C" __global__ void __closesthit__radiance() {
    //When built-in triangle intersection is used, a number of fundamental
    //attributes are provided by the OptiX API, including barycentric coordinates
    const float2 barycentricCoordinates = optixGetTriangleBarycentrics();
    setPayload(make_float3(barycentricCoordinates.x, barycentricCoordinates.y, 1.f - barycentricCoordinates.x - barycentricCoordinates.y));
}

extern "C" __global__ void __anyhit__radiance() {  }

extern "C" __global__ void __intersection__radiance() { }

extern "C" __global__ void __miss__radiance() {
    MissData* missData = reinterpret_cast<MissData*>(optixGetSbtDataPointer());
    float3 payload = getPayload();//Why???
    setPayload(missData->backgroundColor);
}

extern "C" __global__ void __raygen__renderFrame() {
    // compute a test pattern based on pixel ID
    const int ix = optixGetLaunchIndex().x;
    const int iy = optixGetLaunchIndex().y;

    const auto& camera = optixLaunchParams.camera;

    // our per-ray data for this example. what we initialize it to
    // won't matter, since this value will be overwritten by either
    // the miss or hit program, anyway
    float3 pixelColorPRD = { 0.5f, 0.5f, 0.5f };
    // normalized screen plane position, in [0,1]^2
    const vec2f screen(vec2f(ix + .5f, iy + .5f) / vec2f(optixLaunchParams.frame.size));

    // generate ray direction
    vec3f rayDir = normalize(camera.direction + (screen.x - 0.5f) * camera.horizontal + (screen.y - 0.5f) * camera.vertical);

    trace(optixLaunchParams.traversable,
        camera.position,
        rayDir,
        0.f,    // tmin
        1e16f,  // tmax
        &pixelColorPRD);

    const int r = int(255.99f * pixelColorPRD.x);
    const int g = int(255.99f * pixelColorPRD.y);
    const int b = int(255.99f * pixelColorPRD.z);

    const uint32_t rgba = 0xff000000 | (r << 0) | (g << 8) | (b << 16);

    const uint32_t fbIndex = ix + iy * optixLaunchParams.frame.size.x;
    optixLaunchParams.frame.colorBuffer[fbIndex] = rgba;
}

}

上面的代码看起来很合理,所以首先猜测是检查创建上下文和管道的标志;特别是管道创建标志的实例级别标志:您最初从中复制和粘贴的示例没有使用实例化,因此几乎可以肯定其管道设置为 'no instancing'(因为如果它 实例化打开它会 期望 看到一个实例!)。

然而,如果您正在创建的 TLASes/BLASes 的实际实例级别与您用于创建管道的级别不匹配,您将找不到任何命中。

我还建议查看我最近的 OWL 项目中的一些代码,也在 github 上——它也执行实例化、多级实例、etcpp。

  1. OptixPipelineCompileOptions traversableGraphFlags 必须包含 旗帜 OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING
  2. optixPipelineSetStackSize() 中的 maxTraversableGraphDepth = 2

解决了问题。