奇怪的计算着色器延迟

Weird compute shader latency

我正在尝试通过计算着色器进行视锥体剔除(frustum culling)。为此,我有一对用于存放实例化顶点属性的缓冲区,以及一对用于存放间接绘制命令的缓冲区。我的计算着色器先检查第一个缓冲区中的实例坐标是否位于包围体内(实例数量取自第一个绘制缓冲区),再用 subgroupBallot 和 bitCount 求出该实例在子组内的偏移量,然后加上其他子组的结果和全局偏移量,最后将结果写入第二个缓冲区。全局偏移量存储在第二个间接绘制缓冲区中。

问题是,在负载较高时,视锥体可能比移动中的相机滞后几帧(>1),导致画面边缘出现较宽的条带状区域,其中的物体消失不见。这让我觉得很奇怪,因为剔除和渲染是在同一个命令缓冲区中完成的。

在 RenderDoc 中抓帧、用 Alt+PrintScreen 截图或暂停渲染/呈现线程时,画面会立刻恢复到应有的状态。

我唯一的猜测是,即使新的一帧已经开始绘制,上一帧的计算着色器仍在继续执行;但由于设置了管线屏障(pipeline barrier),这本不应该发生。

着色器代码:

#version 460

#extension GL_KHR_shader_subgroup_ballot : require

// One indexed indirect draw record as stored in the Draw buffers.
// NOTE(review): the field order looks like VkDrawIndexedIndirectCommand, which the
// host code uses as the draw stride; there vertexOffset is a signed 32-bit value,
// while it is declared uint here — confirm this is intentional.
struct drawData{
    uint indexCount;
    uint instanceCount; // read from Draw[0] as the input instance count; Draw[1] receives the surviving count
    uint firstIndex;
    uint vertexOffset;
    uint firstInstance;
};

// Per-instance record copied from Instance[0] to Instance[1] when the instance survives culling.
struct instanceData{
    float x, y, z;    // instance position; subtracted from camPos.xyz for the culling tests
    float a, b, c, d; // not read individually by this shader; copied through unchanged
};

layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;

// Camera / frustum parameters consumed by the culling shader.
layout(set = 0, binding = 0) uniform A
{
    mat4 cam;    // not referenced by main() in this shader
    vec4 camPos; // xyz: camera position used to make instance positions camera-relative
    vec4 l;      // xyz: first column of the plane matrix; w: maximum distance (compared squared)
    vec4 t;      // xyz: third column of the plane matrix
    vec4 r;      // xyz: second column of the plane matrix
    vec4 b;      // xyz: fourth column of the plane matrix
};

// Indirect draw buffers: main() reads Draw[0] as the source and writes Draw[1] as the result.
layout(set = 0, binding = 1) buffer B
{
    uint count;      // copied verbatim from Draw[0] to Draw[1]
    drawData data[]; // only element 0 is accessed by main()
} Draw[2];

// Instance buffers: Instance[0] holds the input instances, Instance[1] receives the
// instances that passed culling, packed contiguously from index 0.
layout(set = 0, binding = 2) buffer C
{
    instanceData data[];
} Instance[2];

// Per-subgroup count of instances that passed culling in the current batch.
// Holds at most 32 entries, so gl_NumSubgroups must not exceed 32
// (subgroup size >= 4 for a 128-wide workgroup).
shared uint offsetsM[32];

// Culls the instances of Instance[0] against the distance limit and the four
// frustum planes, compacts the survivors into Instance[1] and writes the matching
// indirect draw command to Draw[1].
//
// The running output offset lives in a plain local variable, so the shader is only
// correct when dispatched as a single workgroup.
//
// NOTE(review): `Model` is not declared in this listing; it is presumably a buffer
// declared elsewhere whose data[0].rad is the bounding radius — confirm.
void main()
{
    const uint gID       = gl_LocalInvocationID.x;
    const uint patchSize = gl_WorkGroupSize.x;
    const uint total     = Draw[0].data[0].instanceCount;

    // Loop-invariant: one frustum plane direction per column.
    const mat4x3 lrtb = mat4x3(l.xyz, r.xyz, t.xyz, b.xyz);

    // Round up so the last, partially filled batch is processed too. The trip
    // count is identical for every invocation, which keeps barrier() in uniform
    // control flow.
    const uint loops = total/patchSize + uint(total%patchSize != 0u);

    uint offsetG = 0;//accumulating offset within end buffer
    for(uint i = 0; i < loops; ++i){
        const uint posa = i*patchSize + gID;

        // Invocations past the end of the input neither read nor pass.
        bool Pass = false;
        if(posa < total){
            vec3 pos  = camPos.xyz - vec3(Instance[0].data[posa].x,
                                          Instance[0].data[posa].y,
                                          Instance[0].data[posa].z);//position relative to camera
            vec4 dist = pos*lrtb + Model.data[0].rad;//dot products and radius tolerance
            Pass = (dot(pos, pos) < l.w*l.w)          //not too far
                && all(greaterThan(dist, vec4(0)));   //within view frustum
        }

        // Publish how many invocations of this subgroup passed.
        uvec4 actives = subgroupBallot(Pass);
        if(subgroupElect())
            offsetsM[gl_SubgroupID] = subgroupBallotBitCount(actives);
        barrier();

        // Survivors in lower-numbered subgroups, and in the whole batch.
        uint before = 0;
        uint batch  = 0;
        for(uint s = 0; s < gl_NumSubgroups; ++s){
            if(s < gl_SubgroupID)
                before += offsetsM[s];
            batch += offsetsM[s];
        }

        if(Pass){
            // Exclusive bit count = survivors in lower lanes of this subgroup.
            uint dst = offsetG + before + subgroupBallotExclusiveBitCount(actives);
            Instance[1].data[dst] = Instance[0].data[posa];
        }
        offsetG += batch;

        // Everyone must finish reading offsetsM before the next iteration
        // overwrites it.
        barrier();
    }

    // Single writer: copy the draw parameters and patch in the surviving count,
    // so no other invocation can overwrite instanceCount afterwards.
    if(gID == 0){
        drawData cmd = Draw[0].data[0];
        cmd.instanceCount = offsetG;
        Draw[1].data[0] = cmd;
        Draw[1].count   = Draw[0].count;
    }
}

对于计算后的渲染通道,我有依赖项:

// Both dependencies make compute-shader writes issued before the render pass
// (VK_SUBPASS_EXTERNAL) available to subpass 0.
{//1: shader writes -> indirect command reads
deps[1].dependencyFlags = 0;
deps[1].srcSubpass      = VK_SUBPASS_EXTERNAL;
deps[1].srcStageMask    = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
deps[1].srcAccessMask   = VK_ACCESS_SHADER_WRITE_BIT;
deps[1].dstSubpass      = 0;
deps[1].dstStageMask    = VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
deps[1].dstAccessMask   = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
}
{//2: shader writes -> vertex attribute reads
deps[2].dependencyFlags = 0;
deps[2].srcSubpass      = VK_SUBPASS_EXTERNAL;
deps[2].srcStageMask    = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
deps[2].srcAccessMask   = VK_ACCESS_SHADER_WRITE_BIT;
deps[2].dstSubpass      = 0;
deps[2].dstStageMask    = VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
deps[2].dstAccessMask   = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
}

命令缓冲区(按原样完全重用,交换链中的每个图像一个):

// Records one frame: cull on the compute queue stage, then draw the surviving
// instances with an indirect-count draw.
//
// NOTE(review): nothing recorded here protects data the host rewrites every frame
// (e.g. the camera uniform block). The host must wait, for example on a fence
// passed to vkQueueSubmit, before overwriting data a pending submission still reads.
vkBeginCommandBuffer(cmd, &begInfo);

    // bufferIndirect / bufferInstance appear to be shared by every frame's command
    // buffer. Make this frame's compute writes wait until earlier submissions have
    // finished reading them (write-after-read: an execution dependency suffices).
    vkCmdPipelineBarrier(cmd,
                         VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,
                         VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                         0, 0, nullptr, 0, nullptr, 0, nullptr);

    vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, layoutsPipe[1],
                            0, 1, &descs[1], 0, 0);
    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipes[1]);
    vkCmdDispatch(cmd, 1, 1, 1);

    // Zero-initialise so pNext and every field not set below has a defined value.
    VkBufferMemoryBarrier bufMemBar[2]{};
    {//mem bars
        {//0 indirect
            bufMemBar[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
            bufMemBar[0].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
            bufMemBar[0].dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
            bufMemBar[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            bufMemBar[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            bufMemBar[0].buffer = bufferIndirect;
            bufMemBar[0].offset = 0;
            bufMemBar[0].size   = VK_WHOLE_SIZE;
        }
        {//1 vertex instance
            bufMemBar[1].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
            bufMemBar[1].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
            bufMemBar[1].dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
            bufMemBar[1].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            bufMemBar[1].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            bufMemBar[1].buffer = bufferInstance;
            bufMemBar[1].offset = 0;
            bufMemBar[1].size   = VK_WHOLE_SIZE;
        }
    }
    // One barrier covers both consumers of the compute output.
    vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                         VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,
                         0, 0, nullptr, 2, bufMemBar, 0, nullptr);

    VkRenderPassBeginInfo passBegInfo{};
    passBegInfo.sType       = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
    passBegInfo.renderPass  = pass;
    passBegInfo.framebuffer = chain.frames[i];
    passBegInfo.renderArea  = {{0, 0}, chain.dim};
        VkClearValue clears[2]{{0},{0}};
    passBegInfo.clearValueCount = 2;
    passBegInfo.pClearValues    = clears;
vkCmdBeginRenderPass(cmd, &passBegInfo, VK_SUBPASS_CONTENTS_INLINE);
    vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, layoutsPipe[0], 0, 1, &descs[0], 0, 0);
    vkCmdBindPipeline      (cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipes[0]);
        VkBuffer     buffersVertex[2]{bufferVertexProto, bufferInstance};
        VkDeviceSize offsetsVertex[2]{0, 0};
    vkCmdBindVertexBuffers(cmd, 0, 2, buffersVertex, offsetsVertex);
    vkCmdBindIndexBuffer  (cmd, bufferIndex, 0, VK_INDEX_TYPE_UINT32);

    // The draw records start 4 bytes in, after the uint draw count at offset 0.
    vkCmdDrawIndexedIndirectCount(cmd, bufferIndirect, 0+4,
                                       bufferIndirect, 0,
                                  count.maxDraws, sizeof(VkDrawIndexedIndirectCommand));
vkCmdEndRenderPass(cmd);

vkEndCommandBuffer(cmd);

渲染和呈现通过两个信号量同步——imageAvailable 和 renderFinished。CPU 端的视锥体计算顺序是正确的。验证层已启用。

问题在于我缺少主机(host)同步。事实上,即使在同一个命令缓冲区内,也没有任何主机同步保证(这是合理的,因为正是这一点让我们可以使用事件)。