HLSL 编译器省略关键语句
HLSL Compiler omitting crucial statements
我有一个遍历二叉树的计算着色器。它过去在单独安装的 DirectX SDK(6 月版)及其编译器 #43 上运行良好。
然而,编译器 #46 和 #47(分别来自 Windows SDK 8.0 和 8.1)似乎省略了两行非常关键的代码,导致着色器陷入死循环,一遍又一遍地检查相同的树节点,直到 Windows 重启图形驱动程序(已通过查看反汇编确认)。
这是展示此行为的最小代码示例:
// Bit flags kept per stack level in TreeSearch's statusStack: they record
// which child of the node at that level has already been entered.
#define LEFT_PROCESSED 1
#define RIGHT_PROCESSED 2
// One element of the `tree` buffer. Nodes refer to each other by index into
// that buffer.
struct Node
{
float4 min; // not read by the code shown; presumably a bounding-volume minimum — TODO confirm
float4 max; // not read by the code shown; presumably a bounding-volume maximum — TODO confirm
int left; // index in `tree` of the left child
int right; // index in `tree` of the right child
int parent; // index in `tree` of the parent; -1 marks a node without parent (TreeSearch returns there)
int flags; // not read by the code shown
};
// Read/write structured buffer holding all nodes, bound to UAV register u0.
// TreeSearch starts its walk at element 0.
RWStructuredBuffer<Node> tree: register(u0);
// Minimal repro: walks the binary tree in `tree` iteratively, starting at
// element 0. For each level of the current path, statusStack holds the
// LEFT_PROCESSED / RIGHT_PROCESSED bits telling which children of the node at
// that level were already entered. A node is left via its `parent` index once
// both bits are set; the walk ends when such a node has parent == -1.
//
// Returns: always false (no search predicate is present in this repro).
//
// NOTE(review): stackSize is used as the index of the top entry, not a count,
// and is never checked against the 40-entry capacity of statusStack.
// NOTE(review): child indices are followed unconditionally; no leaf test is
// visible in this code.
bool TreeSearch()
{
Node node = tree[0];
int nodeId = 0;
int statusStack[40];
int stackSize = 0;
// Only the root level needs initialising here: every deeper level is
// cleared at the moment it is pushed.
statusStack[0] = 0;
while (true)
{
// Left child not entered yet: mark it, push a fresh level, descend left.
if (!(statusStack[stackSize] & LEFT_PROCESSED))
{
statusStack[stackSize] |= LEFT_PROCESSED;
++stackSize;
statusStack[stackSize] = 0;
nodeId = node.left;
node = tree[nodeId];
continue;
}
// Right child not entered yet: same steps, mirrored. The two marked
// statements are the ones reported missing from the fxc #46/#47 output;
// without them the right bit is never set and the new level is never
// cleared.
if (!(statusStack[stackSize] & RIGHT_PROCESSED))
{
statusStack[stackSize] |= RIGHT_PROCESSED; // this line
++stackSize;
statusStack[stackSize] = 0; // and this line
nodeId = node.right;
node = tree[nodeId];
continue;
}
// Both children done: pop one level and move to the parent, or stop
// when the current node has no parent.
if (node.parent != -1)
{
--stackSize;
nodeId = node.parent;
node = tree[nodeId];
}
else
return false;
}
// Unreachable: the loop above can only be left through its own return.
return false;
}
// Compute shader entry point, 32x1x1 threads per group. Every thread runs
// the same TreeSearch walk; the thread id and the search result are unused.
[numthreads(32, 1, 1)]
void CSSearch(uint2 dispatchThreadId: SV_DispatchThreadID)
{
TreeSearch();
}
以及对应的汇编代码(反汇编输出):
cs_5_0
dcl_globalFlags refactoringAllowed
dcl_uav_structured u0, 48
dcl_temps 3
dcl_indexableTemp x0[40], 4
dcl_thread_group 32, 1, 1
ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r0.xyz, l(0), l(32), u0.xyzx
mov x0[0].x, l(0)
mov r1.xyz, r0.yzxy
mov r0.w, l(0)
loop
mov r1.w, x0[r0.w + 0].x
and r2.x, r1.w, l(1)
if_z r2.x
or r1.w, r1.w, l(1) // here's the first one in the LEFT branch
mov x0[r0.w + 0].x, r1.w //
iadd r1.w, r0.w, l(1)
mov x0[r1.w + 0].x, l(0) // and the second one
ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r2.xyz, r1.z, l(32), u0.yzxx
mov r1.xyz, r2.xyzx
mov r0.w, r1.w
continue
endif
mov r1.w, x0[r0.w + 0].x // why is there nothing in the RIGHT branch?
and r1.w, r1.w, l(2)
if_z r1.w
iadd r1.w, r0.w, l(1)
ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r2.xyz, r1.x, l(32), u0.yzxx
mov r1.xyz, r2.xyzx
mov r0.w, r1.w
continue
endif
ine r1.w, r1.y, l(-1)
if_nz r1.w
iadd r0.w, r0.w, l(-1)
ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r1.xyz, r1.y, l(32), u0.yzxx
else
break
endif
endloop
ret
当我去掉第一个 continue 时,编译器会为这两行生成代码,但这样一来着色器同样无法正常工作。
有人知道如何让较新版本的编译器生成这些代码吗?
请注意:我不熟悉GPU编程,不确定这是编译器的问题还是代码的问题。以下只是一个解决方法。
您可以使用一个显式的标志变量来模拟 continue 的行为,希望这样编译器就不会再妨碍您:
// Workaround variant of TreeSearch: the same walk over `tree`, but the two
// `continue` statements are replaced by the explicit flag shouldContinue so
// that every iteration runs to the end of the loop body.
//
// Returns: always false, like the version it replaces.
//
// NOTE(review): unlike the original `while (true)`, this loop also ends as
// soon as stackSize reaches 10, so deeper levels are never visited.
bool TreeSearch()
{
Node node = tree[0];
int nodeId = 0;
int statusStack[40];
int stackSize = 0;
statusStack[0] = 0;
while (stackSize < 10) // Changed to make it compile.
{
// 1 = no child has been entered in this iteration, so the later branches
// may still run; 0 = a child was entered, skip the rest of the body
// (this is what `continue` did in the original).
int shouldContinue = 1;
// Left child not entered yet: mark it, push a fresh level, descend left.
if (!(statusStack[stackSize] & LEFT_PROCESSED))
{
statusStack[stackSize] |= LEFT_PROCESSED;
++stackSize;
statusStack[stackSize] = 0;
nodeId = node.left;
node = tree[nodeId];
shouldContinue = 0;
}
// Right child: only considered when the left branch did not run above.
if (shouldContinue &&
!(statusStack[stackSize] & RIGHT_PROCESSED))
{
statusStack[stackSize] |= RIGHT_PROCESSED; // this line
++stackSize;
statusStack[stackSize] = 0; // and this line
nodeId = node.right;
node = tree[nodeId];
shouldContinue = 0;
}
// Neither child branch ran: pop one level and go to the parent, or stop
// when the current node has no parent.
if (shouldContinue)
{
if (node.parent != -1)
{
--stackSize;
nodeId = node.parent;
node = tree[nodeId];
}
else
return false;
}
}
// Reached when the depth limit in the loop condition ends the walk.
return false;
}
从反汇编输出来看,原始代码中被省略的那些操作似乎都已不再缺失。不过,这种写法可能会带来额外开销。
Link: http://shader-playground.timjones.io/6abdc64cdf98e1840a3b38c629b4e217
我有一个遍历二叉树的计算着色器。它过去在单独安装的 DirectX SDK(6 月版)及其编译器 #43 上运行良好。
然而,编译器 #46 和 #47(分别来自 Windows SDK 8.0 和 8.1)似乎省略了两行非常关键的代码,导致着色器陷入死循环,一遍又一遍地检查相同的树节点,直到 Windows 重启图形驱动程序(已通过查看反汇编确认)。
这是展示此行为的最小代码示例:
// Bit flags kept per stack level in TreeSearch's statusStack: they record
// which child of the node at that level has already been entered.
#define LEFT_PROCESSED 1
#define RIGHT_PROCESSED 2
// One element of the `tree` buffer. Nodes refer to each other by index into
// that buffer.
struct Node
{
float4 min; // not read by the code shown; presumably a bounding-volume minimum — TODO confirm
float4 max; // not read by the code shown; presumably a bounding-volume maximum — TODO confirm
int left; // index in `tree` of the left child
int right; // index in `tree` of the right child
int parent; // index in `tree` of the parent; -1 marks a node without parent (TreeSearch returns there)
int flags; // not read by the code shown
};
// Read/write structured buffer holding all nodes, bound to UAV register u0.
// TreeSearch starts its walk at element 0.
RWStructuredBuffer<Node> tree: register(u0);
// Minimal repro: walks the binary tree in `tree` iteratively, starting at
// element 0. For each level of the current path, statusStack holds the
// LEFT_PROCESSED / RIGHT_PROCESSED bits telling which children of the node at
// that level were already entered. A node is left via its `parent` index once
// both bits are set; the walk ends when such a node has parent == -1.
//
// Returns: always false (no search predicate is present in this repro).
//
// NOTE(review): stackSize is used as the index of the top entry, not a count,
// and is never checked against the 40-entry capacity of statusStack.
// NOTE(review): child indices are followed unconditionally; no leaf test is
// visible in this code.
bool TreeSearch()
{
Node node = tree[0];
int nodeId = 0;
int statusStack[40];
int stackSize = 0;
// Only the root level needs initialising here: every deeper level is
// cleared at the moment it is pushed.
statusStack[0] = 0;
while (true)
{
// Left child not entered yet: mark it, push a fresh level, descend left.
if (!(statusStack[stackSize] & LEFT_PROCESSED))
{
statusStack[stackSize] |= LEFT_PROCESSED;
++stackSize;
statusStack[stackSize] = 0;
nodeId = node.left;
node = tree[nodeId];
continue;
}
// Right child not entered yet: same steps, mirrored. The two marked
// statements are the ones reported missing from the fxc #46/#47 output;
// without them the right bit is never set and the new level is never
// cleared.
if (!(statusStack[stackSize] & RIGHT_PROCESSED))
{
statusStack[stackSize] |= RIGHT_PROCESSED; // this line
++stackSize;
statusStack[stackSize] = 0; // and this line
nodeId = node.right;
node = tree[nodeId];
continue;
}
// Both children done: pop one level and move to the parent, or stop
// when the current node has no parent.
if (node.parent != -1)
{
--stackSize;
nodeId = node.parent;
node = tree[nodeId];
}
else
return false;
}
// Unreachable: the loop above can only be left through its own return.
return false;
}
// Compute shader entry point, 32x1x1 threads per group. Every thread runs
// the same TreeSearch walk; the thread id and the search result are unused.
[numthreads(32, 1, 1)]
void CSSearch(uint2 dispatchThreadId: SV_DispatchThreadID)
{
TreeSearch();
}
以及对应的汇编代码(反汇编输出):
cs_5_0
dcl_globalFlags refactoringAllowed
dcl_uav_structured u0, 48
dcl_temps 3
dcl_indexableTemp x0[40], 4
dcl_thread_group 32, 1, 1
ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r0.xyz, l(0), l(32), u0.xyzx
mov x0[0].x, l(0)
mov r1.xyz, r0.yzxy
mov r0.w, l(0)
loop
mov r1.w, x0[r0.w + 0].x
and r2.x, r1.w, l(1)
if_z r2.x
or r1.w, r1.w, l(1) // here's the first one in the LEFT branch
mov x0[r0.w + 0].x, r1.w //
iadd r1.w, r0.w, l(1)
mov x0[r1.w + 0].x, l(0) // and the second one
ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r2.xyz, r1.z, l(32), u0.yzxx
mov r1.xyz, r2.xyzx
mov r0.w, r1.w
continue
endif
mov r1.w, x0[r0.w + 0].x // why is there nothing in the RIGHT branch?
and r1.w, r1.w, l(2)
if_z r1.w
iadd r1.w, r0.w, l(1)
ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r2.xyz, r1.x, l(32), u0.yzxx
mov r1.xyz, r2.xyzx
mov r0.w, r1.w
continue
endif
ine r1.w, r1.y, l(-1)
if_nz r1.w
iadd r0.w, r0.w, l(-1)
ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r1.xyz, r1.y, l(32), u0.yzxx
else
break
endif
endloop
ret
当我去掉第一个 continue 时,编译器会为这两行生成代码,但这样一来着色器同样无法正常工作。
有人知道如何让较新版本的编译器生成这些代码吗?
请注意:我不熟悉GPU编程,不确定这是编译器的问题还是代码的问题。以下只是一个解决方法。
您可以使用一个显式的标志变量来模拟 continue 的行为,希望这样编译器就不会再妨碍您:
// Workaround variant of TreeSearch: the same walk over `tree`, but the two
// `continue` statements are replaced by the explicit flag shouldContinue so
// that every iteration runs to the end of the loop body.
//
// Returns: always false, like the version it replaces.
//
// NOTE(review): unlike the original `while (true)`, this loop also ends as
// soon as stackSize reaches 10, so deeper levels are never visited.
bool TreeSearch()
{
Node node = tree[0];
int nodeId = 0;
int statusStack[40];
int stackSize = 0;
statusStack[0] = 0;
while (stackSize < 10) // Changed to make it compile.
{
// 1 = no child has been entered in this iteration, so the later branches
// may still run; 0 = a child was entered, skip the rest of the body
// (this is what `continue` did in the original).
int shouldContinue = 1;
// Left child not entered yet: mark it, push a fresh level, descend left.
if (!(statusStack[stackSize] & LEFT_PROCESSED))
{
statusStack[stackSize] |= LEFT_PROCESSED;
++stackSize;
statusStack[stackSize] = 0;
nodeId = node.left;
node = tree[nodeId];
shouldContinue = 0;
}
// Right child: only considered when the left branch did not run above.
if (shouldContinue &&
!(statusStack[stackSize] & RIGHT_PROCESSED))
{
statusStack[stackSize] |= RIGHT_PROCESSED; // this line
++stackSize;
statusStack[stackSize] = 0; // and this line
nodeId = node.right;
node = tree[nodeId];
shouldContinue = 0;
}
// Neither child branch ran: pop one level and go to the parent, or stop
// when the current node has no parent.
if (shouldContinue)
{
if (node.parent != -1)
{
--stackSize;
nodeId = node.parent;
node = tree[nodeId];
}
else
return false;
}
}
// Reached when the depth limit in the loop condition ends the walk.
return false;
}
从反汇编输出来看,原始代码中被省略的那些操作似乎都已不再缺失。不过,这种写法可能会带来额外开销。
Link: http://shader-playground.timjones.io/6abdc64cdf98e1840a3b38c629b4e217