优化 WebGL 着色器?
Optimize WebGL shader?
我编写了以下着色器来渲染带有一堆同心圆的图案。最终我想让每个旋转的球体成为一个光发射器,以创建出类似这样的效果(along these lines)。
当然,现在我只是在做最基本的部分来渲染不同的对象。
不幸的是,着色器非常慢(在高端 macbook 上全屏 16fps)。我很确定这是由于我在着色器中有大量的 for 循环和分支。我想知道如何以更优化性能的方式实现我想要实现的几何形状:
编辑:您可以在此处运行着色器:https://www.shadertoy.com/view/lssyRH
我缺少的一个明显的优化是,目前所有的片段都是针对整个 24 个周围的圆圈进行检查的。通过检查片段是否与图表的外边界相交来完全放弃这些检查将非常快速和容易。我想我只是想弄清楚做这样的事情的最佳做法是什么。
// Number of diagram columns: bound of the outer (x) loop in mainImage.
#define N 10
// Number of diagram rows: bound of the inner (y) loop in mainImage.
#define M 5
// Number of small discs placed on the ring that encloses each diagram.
#define K 24
// Pi; mainImage multiplies it by 2 to spread the K ring discs over a full turn.
#define M_PI 3.1415926535897932384626433832795
// Shadertoy entry point (unoptimized version from the question).
// Draws an N x M grid of diagrams. Each diagram consists of a white center
// marker, one "light" disc orbiting the center over time, and a ring of K
// small discs. The fragment is tested against every shape of every diagram
// in turn and takes the colour of the first shape it falls inside.
//   fragColor - output colour. NOTE: it is never written when the fragment
//               hits no shape, so its value is undefined in that case.
//   fragCoord - coordinate of the fragment; divided by iResolution below.
void mainImage( out vec4 fragColor, in vec2 fragCoord )
{
float aspectRatio = iResolution.x / iResolution.y;
float h = 1.0;
float w = aspectRatio;
// Normalise by the resolution, then stretch x by the aspect ratio so that
// distances are measured in the same unit on both axes.
vec2 uv = vec2(fragCoord.x / iResolution.x * aspectRatio, fragCoord.y / iResolution.y);
// radius of the orbiting "light" disc
float radius = 0.01;
// distance of the orbiting disc from the diagram center
float orbitR = 0.02;
// declared but never read in this function
float orbiterRadius = 0.005;
// radius of the white center marker
float centerRadius = 0.002;
// distance of the ring discs from the diagram center
float encloseR = 2.0 * orbitR;
// radius of each ring disc
float encloserRadius = 0.002;
// Grid pitch: N (resp. M) centers are placed at multiples 1..N (1..M) of the pitch.
float spacingX = (w / (float(N) + 1.0));
float spacingY = h / (float(M) + 1.0);
// scratch coordinates, reused for the orbiting disc and for each ring disc
float x = 0.0;
float y = 0.0;
// declared but never read in this function
vec4 totalLight = vec4(0.0, 0.0, 0.0, 1.0);
for (int i = 0; i < N; i++) {
for (int j = 0; j < M; j++) {
// compute the center of the diagram
vec2 center = vec2(spacingX * (float(i) + 1.0), spacingY * (float(j) + 1.0));
// Position of the orbiting disc. cos/sin(iGlobalTime) do not depend on i or j
// but are evaluated again on every iteration.
x = center.x + orbitR * cos(iGlobalTime);
y = center.y + orbitR * sin(iGlobalTime);
vec2 bulb = vec2(x,y);
if (length(uv - center) < centerRadius) {
// frag intersects white center marker
fragColor = vec4(1.0);
return;
} else if (length(uv - bulb) < radius) {
// intersects rotating "light"
fragColor = vec4(uv,0.5+0.5*sin(iGlobalTime),1.0);
return;
} else {
// test the fragment against each of the K discs on the enclosing ring
for(int k = 0; k < K; k++) {
// angle of the k-th ring disc, evenly spaced over a full turn
float theta = M_PI * 2.0 * float(k)/ float(K);
x = center.x + cos(theta) * encloseR;
y = center.y + sin(theta) * encloseR;
vec2 encloser = vec2(x,y);
if (length(uv - encloser) < encloserRadius) {
fragColor = vec4(uv,0.5+0.5*sin(iGlobalTime),1.0);
return;
}
}
}
}
}
// Falling out of the loops means no shape was hit; fragColor is left unwritten.
}
请记住,您要优化片段着色器,并且只优化片段着色器:
- 将 sin(iGlobalTime) 和 cos(iGlobalTime) 移出循环:它们在整个绘制调用期间保持不变,因此无需在每次循环迭代时重新计算。
- GPU 尽可能使用矢量化指令集 (SIMD),充分利用它。你通过执行多个标量操作浪费了大量的周期,你可以在其中使用单个向量指令(参见带注释的代码)
[三年后的补充:我现在不确定这个说法对于现代 GPU 处理指令的方式是否仍然正确,但这样写确实有助于提高可读性,甚至可能给编译器提供一两个提示。]
- 用平方值做半径检查,把 sqrt(即 length)留到真正需要的时候再用。
- 将常量的浮点转换(您的循环限制)替换为浮点常量(智能着色器编译器已经会这样做,但不能指望)
- 不要让着色器中存在未定义行为(即存在不写入 gl_FragColor 的执行路径)。
这是您的着色器的优化和注释版本(仍然包含未定义的行为,就像您提供的行为一样)。注释形式为:
// annotation
// old code, if any
new code
// Number of diagram columns (integer loop bound).
#define N 10
// N as a float literal, so no int-to-float cast is needed; keep in sync with N.
#define fN 10.
// Number of diagram rows (integer loop bound).
#define M 5
// M as a float literal; keep in sync with M.
#define fM 5.
// Number of small discs on the ring around each diagram (integer loop bound).
#define K 24
// K as a float literal; keep in sync with K.
#define fK 24.
// Pi.
#define M_PI 3.1415926535897932384626433832795
// 2 * Pi, written out so the multiplication is not repeated in the shader.
#define M_PI2 6.28318531
// Shadertoy entry point (optimized, annotated version).
// Draws an N x M grid of diagrams. Each diagram consists of a white center
// marker, one "light" disc orbiting the center over time, and a ring of K
// small discs. The fragment takes the colour of the first shape it falls inside.
//   fragColor - output colour. NOTE: it is still not written when the fragment
//               hits no shape; that undefined behaviour is deliberately kept
//               as in the question's shader.
//   fragCoord - coordinate of the fragment; divided by iResolution below.
// Annotations keep the form: // annotation, // old code (if any), new code.
void mainImage( out vec4 fragColor, in vec2 fragCoord )
{
    float aspectRatio = iResolution.x / iResolution.y;
    // we dont need these separate
    // float h = 1.0;
    // float w = aspectRatio;
    // use vector ops(2 divs 1 mul => 1 div 1 mul)
    // vec2 uv = vec2(fragCoord.x / iResolution.x * aspectRatio, fragCoord.y / iResolution.y);
    vec2 uv = fragCoord.xy / iResolution.xy;
    uv.x *= aspectRatio;
    // the following values never change, so they are declared "const"
    // (the old declarations were plain, mutable floats)
    const float radius = 0.01;
    // precalc squared radius
    const float radius2 = radius * radius;
    const float orbitR = 0.02;
    // was never used
    // float orbiterRadius = 0.005;
    const float centerRadius = 0.002;
    // precalc squared center radius
    const float centerRadius2 = centerRadius * centerRadius;
    const float encloseR = 2.0 * orbitR;
    const float encloserRadius = 0.002;
    // precalc squared encloser radius
    const float encloserRadius2 = encloserRadius * encloserRadius;
    // Use float constants and vector ops here(2 casts 2 adds 2 divs => 1 add 1 div)
    // float spacingX = w / (float(N) + 1.0);
    // float spacingY = h / (float(M) + 1.0);
    vec2 spacing = vec2(aspectRatio, 1.0) / (vec2(fN, fM) + 1.);
    // calc sin and cos of global time
    // saves N*M(sin,cos,2 muls)
    vec2 stct = vec2(sin(iGlobalTime), cos(iGlobalTime));
    // stct is (sin, cos) but the orbit needs x = cos, y = sin as in the old
    // code; without the swizzle the bulb would circle in the opposite direction
    // vec2 orbit = orbitR * stct;
    vec2 orbit = orbitR * stct.yx;
    // not needed anymore
    // float x = 0.0;
    // float y = 0.0;
    // was never used
    // vec4 totalLight = vec4(0.0, 0.0, 0.0, 1.0);
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            // compute the center of the diagram
            // Use vector ops
            // vec2 center = vec2(spacingX * (float(i) + 1.0), spacingY * (float(j) + 1.0));
            vec2 center = spacing * (vec2(i,j) + 1.0);
            // Again use vector opts, use precalced time trig(orbit = orbitR * (cos, sin))
            // x = center.x + orbitR * cos(iGlobalTime);
            // y = center.y + orbitR * sin(iGlobalTime);
            // vec2 bulb = vec2(x,y);
            vec2 bulb = center + orbit;
            // calculate offsets
            vec2 centerOffset = uv - center;
            vec2 bulbOffset = uv - bulb;
            // use squared length check
            // if (length(uv - center) < centerRadius) {
            if (dot(centerOffset, centerOffset) < centerRadius2) {
                // frag intersects white center marker
                fragColor = vec4(1.0);
                return;
            // use squared length check
            // } else if (length(uv - bulb) < radius) {
            } else if (dot(bulbOffset, bulbOffset) < radius2) {
                // Use precalced sin global time in stct.x
                // intersects rotating "light"
                fragColor = vec4(uv, 0.5 + 0.5 * stct.x, 1.0);
                return;
            } else {
                // test the fragment against each of the K discs on the enclosing ring
                for (int k = 0; k < K; k++) {
                    // use predefined 2*PI and float K
                    float theta = M_PI2 * float(k) / fK;
                    // Use vector ops(2 muls 2 adds => 1 mul 1 add)
                    // x = center.x + cos(theta) * encloseR;
                    // y = center.y + sin(theta) * encloseR;
                    // vec2 encloser = vec2(x,y);
                    vec2 encloseOffset = uv - (center + vec2(cos(theta), sin(theta)) * encloseR);
                    if (dot(encloseOffset, encloseOffset) < encloserRadius2) {
                        fragColor = vec4(uv, 0.5 + 0.5 * stct.x, 1.0);
                        return;
                    }
                }
            }
        }
    }
    // No shape was hit: fragColor is intentionally left unwritten here, exactly
    // like the question's shader (undefined behaviour, not fixed in this version).
}
我又多想了一点……我意识到优化它的最佳方法是实际更改逻辑:在对小圆圈进行相交测试之前,先检查整组圆圈的边界。这样它就能以 60fps 运行:
我编写了以下着色器来渲染带有一堆同心圆的图案。最终我想让每个旋转的球体成为一个光发射器,以创建出类似这样的效果(along these lines)。
当然,现在我只是在做最基本的部分来渲染不同的对象。
不幸的是,着色器非常慢(在高端 macbook 上全屏 16fps)。我很确定这是由于我在着色器中有大量的 for 循环和分支。我想知道如何以更优化性能的方式实现我想要实现的几何形状:
编辑:您可以在此处运行着色器:https://www.shadertoy.com/view/lssyRH
我缺少的一个明显的优化是,目前所有的片段都是针对整个 24 个周围的圆圈进行检查的。通过检查片段是否与图表的外边界相交来完全放弃这些检查将非常快速和容易。我想我只是想弄清楚做这样的事情的最佳做法是什么。
// Number of diagram columns: bound of the outer (x) loop in mainImage.
#define N 10
// Number of diagram rows: bound of the inner (y) loop in mainImage.
#define M 5
// Number of small discs placed on the ring that encloses each diagram.
#define K 24
// Pi; mainImage multiplies it by 2 to spread the K ring discs over a full turn.
#define M_PI 3.1415926535897932384626433832795
// Shadertoy entry point (unoptimized version from the question).
// Draws an N x M grid of diagrams. Each diagram consists of a white center
// marker, one "light" disc orbiting the center over time, and a ring of K
// small discs. The fragment is tested against every shape of every diagram
// in turn and takes the colour of the first shape it falls inside.
//   fragColor - output colour. NOTE: it is never written when the fragment
//               hits no shape, so its value is undefined in that case.
//   fragCoord - coordinate of the fragment; divided by iResolution below.
void mainImage( out vec4 fragColor, in vec2 fragCoord )
{
float aspectRatio = iResolution.x / iResolution.y;
float h = 1.0;
float w = aspectRatio;
// Normalise by the resolution, then stretch x by the aspect ratio so that
// distances are measured in the same unit on both axes.
vec2 uv = vec2(fragCoord.x / iResolution.x * aspectRatio, fragCoord.y / iResolution.y);
// radius of the orbiting "light" disc
float radius = 0.01;
// distance of the orbiting disc from the diagram center
float orbitR = 0.02;
// declared but never read in this function
float orbiterRadius = 0.005;
// radius of the white center marker
float centerRadius = 0.002;
// distance of the ring discs from the diagram center
float encloseR = 2.0 * orbitR;
// radius of each ring disc
float encloserRadius = 0.002;
// Grid pitch: N (resp. M) centers are placed at multiples 1..N (1..M) of the pitch.
float spacingX = (w / (float(N) + 1.0));
float spacingY = h / (float(M) + 1.0);
// scratch coordinates, reused for the orbiting disc and for each ring disc
float x = 0.0;
float y = 0.0;
// declared but never read in this function
vec4 totalLight = vec4(0.0, 0.0, 0.0, 1.0);
for (int i = 0; i < N; i++) {
for (int j = 0; j < M; j++) {
// compute the center of the diagram
vec2 center = vec2(spacingX * (float(i) + 1.0), spacingY * (float(j) + 1.0));
// Position of the orbiting disc. cos/sin(iGlobalTime) do not depend on i or j
// but are evaluated again on every iteration.
x = center.x + orbitR * cos(iGlobalTime);
y = center.y + orbitR * sin(iGlobalTime);
vec2 bulb = vec2(x,y);
if (length(uv - center) < centerRadius) {
// frag intersects white center marker
fragColor = vec4(1.0);
return;
} else if (length(uv - bulb) < radius) {
// intersects rotating "light"
fragColor = vec4(uv,0.5+0.5*sin(iGlobalTime),1.0);
return;
} else {
// test the fragment against each of the K discs on the enclosing ring
for(int k = 0; k < K; k++) {
// angle of the k-th ring disc, evenly spaced over a full turn
float theta = M_PI * 2.0 * float(k)/ float(K);
x = center.x + cos(theta) * encloseR;
y = center.y + sin(theta) * encloseR;
vec2 encloser = vec2(x,y);
if (length(uv - encloser) < encloserRadius) {
fragColor = vec4(uv,0.5+0.5*sin(iGlobalTime),1.0);
return;
}
}
}
}
}
// Falling out of the loops means no shape was hit; fragColor is left unwritten.
}
请记住,您要优化片段着色器,并且只优化片段着色器:
- 将 sin(iGlobalTime) 和 cos(iGlobalTime) 移出循环:它们在整个绘制调用期间保持不变,因此无需在每次循环迭代时重新计算。
- GPU 尽可能使用矢量化指令集 (SIMD),充分利用它。你通过执行多个标量操作浪费了大量的周期,而这些地方本可以使用单个向量指令(参见带注释的代码)。
[三年后的补充:我现在不确定这个说法对于现代 GPU 处理指令的方式是否仍然正确,但这样写确实有助于提高可读性,甚至可能给编译器提供一两个提示。]
- 用平方值做半径检查,把 sqrt(即 length)留到真正需要的时候再用。
- 将常量的浮点转换(您的循环限制)替换为浮点常量(智能的着色器编译器已经会这样做,但不能指望)。
- 不要让着色器中存在未定义行为(即存在不写入 gl_FragColor 的执行路径)。
这是您的着色器的优化和注释版本(仍然包含未定义的行为,就像您提供的行为一样)。注释形式为:
// annotation
// old code, if any
new code
// Number of diagram columns (integer loop bound).
#define N 10
// N as a float literal, so no int-to-float cast is needed; keep in sync with N.
#define fN 10.
// Number of diagram rows (integer loop bound).
#define M 5
// M as a float literal; keep in sync with M.
#define fM 5.
// Number of small discs on the ring around each diagram (integer loop bound).
#define K 24
// K as a float literal; keep in sync with K.
#define fK 24.
// Pi.
#define M_PI 3.1415926535897932384626433832795
// 2 * Pi, written out so the multiplication is not repeated in the shader.
#define M_PI2 6.28318531
// Shadertoy entry point (optimized, annotated version).
// Draws an N x M grid of diagrams. Each diagram consists of a white center
// marker, one "light" disc orbiting the center over time, and a ring of K
// small discs. The fragment takes the colour of the first shape it falls inside.
//   fragColor - output colour. NOTE: it is still not written when the fragment
//               hits no shape; that undefined behaviour is deliberately kept
//               as in the question's shader.
//   fragCoord - coordinate of the fragment; divided by iResolution below.
// Annotations keep the form: // annotation, // old code (if any), new code.
void mainImage( out vec4 fragColor, in vec2 fragCoord )
{
    float aspectRatio = iResolution.x / iResolution.y;
    // we dont need these separate
    // float h = 1.0;
    // float w = aspectRatio;
    // use vector ops(2 divs 1 mul => 1 div 1 mul)
    // vec2 uv = vec2(fragCoord.x / iResolution.x * aspectRatio, fragCoord.y / iResolution.y);
    vec2 uv = fragCoord.xy / iResolution.xy;
    uv.x *= aspectRatio;
    // the following values never change, so they are declared "const"
    // (the old declarations were plain, mutable floats)
    const float radius = 0.01;
    // precalc squared radius
    const float radius2 = radius * radius;
    const float orbitR = 0.02;
    // was never used
    // float orbiterRadius = 0.005;
    const float centerRadius = 0.002;
    // precalc squared center radius
    const float centerRadius2 = centerRadius * centerRadius;
    const float encloseR = 2.0 * orbitR;
    const float encloserRadius = 0.002;
    // precalc squared encloser radius
    const float encloserRadius2 = encloserRadius * encloserRadius;
    // Use float constants and vector ops here(2 casts 2 adds 2 divs => 1 add 1 div)
    // float spacingX = w / (float(N) + 1.0);
    // float spacingY = h / (float(M) + 1.0);
    vec2 spacing = vec2(aspectRatio, 1.0) / (vec2(fN, fM) + 1.);
    // calc sin and cos of global time
    // saves N*M(sin,cos,2 muls)
    vec2 stct = vec2(sin(iGlobalTime), cos(iGlobalTime));
    // stct is (sin, cos) but the orbit needs x = cos, y = sin as in the old
    // code; without the swizzle the bulb would circle in the opposite direction
    // vec2 orbit = orbitR * stct;
    vec2 orbit = orbitR * stct.yx;
    // not needed anymore
    // float x = 0.0;
    // float y = 0.0;
    // was never used
    // vec4 totalLight = vec4(0.0, 0.0, 0.0, 1.0);
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            // compute the center of the diagram
            // Use vector ops
            // vec2 center = vec2(spacingX * (float(i) + 1.0), spacingY * (float(j) + 1.0));
            vec2 center = spacing * (vec2(i,j) + 1.0);
            // Again use vector opts, use precalced time trig(orbit = orbitR * (cos, sin))
            // x = center.x + orbitR * cos(iGlobalTime);
            // y = center.y + orbitR * sin(iGlobalTime);
            // vec2 bulb = vec2(x,y);
            vec2 bulb = center + orbit;
            // calculate offsets
            vec2 centerOffset = uv - center;
            vec2 bulbOffset = uv - bulb;
            // use squared length check
            // if (length(uv - center) < centerRadius) {
            if (dot(centerOffset, centerOffset) < centerRadius2) {
                // frag intersects white center marker
                fragColor = vec4(1.0);
                return;
            // use squared length check
            // } else if (length(uv - bulb) < radius) {
            } else if (dot(bulbOffset, bulbOffset) < radius2) {
                // Use precalced sin global time in stct.x
                // intersects rotating "light"
                fragColor = vec4(uv, 0.5 + 0.5 * stct.x, 1.0);
                return;
            } else {
                // test the fragment against each of the K discs on the enclosing ring
                for (int k = 0; k < K; k++) {
                    // use predefined 2*PI and float K
                    float theta = M_PI2 * float(k) / fK;
                    // Use vector ops(2 muls 2 adds => 1 mul 1 add)
                    // x = center.x + cos(theta) * encloseR;
                    // y = center.y + sin(theta) * encloseR;
                    // vec2 encloser = vec2(x,y);
                    vec2 encloseOffset = uv - (center + vec2(cos(theta), sin(theta)) * encloseR);
                    if (dot(encloseOffset, encloseOffset) < encloserRadius2) {
                        fragColor = vec4(uv, 0.5 + 0.5 * stct.x, 1.0);
                        return;
                    }
                }
            }
        }
    }
    // No shape was hit: fragColor is intentionally left unwritten here, exactly
    // like the question's shader (undefined behaviour, not fixed in this version).
}
我又多想了一点……我意识到优化它的最佳方法是实际更改逻辑:在对小圆圈进行相交测试之前,先检查整组圆圈的边界。这样它就能以 60fps 运行: