c 中的静态内联函数
static inline functions in c
我读了另一个 Stack Overflow 问答,觉得它讲得很有道理,但我还想在此基础上补充一个问题。
投票最多的答案是
For small functions that are called frequently that can make a big
performance difference.
好的,那什么才算"小函数"呢?
我之所以这样问,是因为我正在考虑使用一个数学库,即 Bullet 物理框架中的 vectormath(矢量数学)库。它的所有数学函数都声明为 static inline,但有些很短,有些很长。
以下是我认为的简短内容:
/*
 * Copy a three-column matrix, one column at a time.
 *
 * result: destination; its col0, col1 and col2 are overwritten.
 * mat:    source; only read.
 *
 * Each column is passed to vmathV3Copy in order col0, col1, col2.
 */
static inline void vmathM3Copy( VmathMatrix3 *result, const VmathMatrix3 *mat )
{
    VmathMatrix3 *const dst = result;
    const VmathMatrix3 *const src = mat;

    vmathV3Copy( &dst->col0, &src->col0 );
    vmathV3Copy( &dst->col1, &src->col1 );
    vmathV3Copy( &dst->col2, &src->col2 );
}
但即便如此,它也会把下面这个函数内联 3 次:
/*
 * Copy the x, y and z members of *vec into *result.
 *
 * result: destination; only its x, y and z members are assigned.
 * vec:    source; only read.
 */
static inline void vmathV3Copy( VmathVector3 *result, const VmathVector3 *vec )
{
    VmathVector3 *const dst = result;
    const VmathVector3 *const src = vec;

    /* Member-wise on purpose: exactly three assignments, in x, y, z order. */
    dst->x = src->x;
    dst->y = src->y;
    dst->z = src->z;
}
以下是我觉得比较长的内容:
/*
 * Compute the determinant of a matrix stored as four columns
 * (col0..col3), each with x, y, z, w members.
 *
 * mat: source matrix; only read.
 *
 * Returns mA*dx + mE*dy + mI*dz + mM*dw, where mA, mE, mI, mM are the x
 * members of the four columns and dx..dw are built from the remaining
 * twelve elements.
 */
static inline float vmathM4Determinant( const VmathMatrix4 *mat )
{
    /* Load all sixteen elements into locals, column by column. */
    const float mA = mat->col0.x;
    const float mB = mat->col0.y;
    const float mC = mat->col0.z;
    const float mD = mat->col0.w;
    const float mE = mat->col1.x;
    const float mF = mat->col1.y;
    const float mG = mat->col1.z;
    const float mH = mat->col1.w;
    const float mI = mat->col2.x;
    const float mJ = mat->col2.y;
    const float mK = mat->col2.z;
    const float mL = mat->col2.w;
    const float mM = mat->col3.x;
    const float mN = mat->col3.y;
    const float mO = mat->col3.z;
    const float mP = mat->col3.w;

    /* Six difference-of-products terms, each reused by two of dx..dw. */
    const float tmp0 = ( mK * mD ) - ( mC * mL );
    const float tmp1 = ( mO * mH ) - ( mG * mP );
    const float tmp2 = ( mB * mK ) - ( mJ * mC );
    const float tmp3 = ( mF * mO ) - ( mN * mG );
    const float tmp4 = ( mJ * mD ) - ( mB * mL );
    const float tmp5 = ( mN * mH ) - ( mF * mP );

    /* Factors that multiply the x member of each column in the final sum. */
    const float dx = ( ( mJ * tmp1 ) - ( mL * tmp3 ) ) - ( mK * tmp5 );
    const float dy = ( ( mN * tmp0 ) - ( mP * tmp2 ) ) - ( mO * tmp4 );
    const float dz = ( ( mD * tmp3 ) + ( mC * tmp5 ) ) - ( mB * tmp1 );
    const float dw = ( ( mH * tmp2 ) + ( mG * tmp4 ) ) - ( mF * tmp0 );

    /* Summation order is kept left-to-right so float rounding is unchanged. */
    return ( ( ( mA * dx ) + ( mE * dy ) ) + ( mI * dz ) ) + ( mM * dw );
}
甚至这个
/*
 * Compute a 4x4 matrix inverse: four vectors res0..res3 are built from the
 * elements of *mat, each is scaled by detInv, and the results are stored in
 * result->col0..col3.
 *
 * result: destination; col0..col3 are written at the very end.
 * mat:    source; all sixteen elements are copied into locals first and
 *         mat is not read again after that.
 *
 * detInv is a plain 1.0f / expression with no zero check in this function.
 * NOTE(review): presumably callers must pass an invertible matrix - confirm.
 */
static inline void vmathM4Inverse( VmathMatrix4 *result, const VmathMatrix4 *mat )
{
VmathVector4 res0, res1, res2, res3;
float mA, mB, mC, mD, mE, mF, mG, mH, mI, mJ, mK, mL, mM, mN, mO, mP, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, detInv;
/* Load the sixteen matrix elements, column by column. */
mA = mat->col0.x;
mB = mat->col0.y;
mC = mat->col0.z;
mD = mat->col0.w;
mE = mat->col1.x;
mF = mat->col1.y;
mG = mat->col1.z;
mH = mat->col1.w;
mI = mat->col2.x;
mJ = mat->col2.y;
mK = mat->col2.z;
mL = mat->col2.w;
mM = mat->col3.x;
mN = mat->col3.y;
mO = mat->col3.z;
mP = mat->col3.w;
/* First set of difference-of-products temporaries (overwritten further down). */
tmp0 = ( ( mK * mD ) - ( mC * mL ) );
tmp1 = ( ( mO * mH ) - ( mG * mP ) );
tmp2 = ( ( mB * mK ) - ( mJ * mC ) );
tmp3 = ( ( mF * mO ) - ( mN * mG ) );
tmp4 = ( ( mJ * mD ) - ( mB * mL ) );
tmp5 = ( ( mN * mH ) - ( mF * mP ) );
/* res0 is complete after these four calls; the vmathV4Set* helpers are
 * defined elsewhere and presumably store the value in the named member. */
vmathV4SetX( &res0, ( ( ( mJ * tmp1 ) - ( mL * tmp3 ) ) - ( mK * tmp5 ) ) );
vmathV4SetY( &res0, ( ( ( mN * tmp0 ) - ( mP * tmp2 ) ) - ( mO * tmp4 ) ) );
vmathV4SetZ( &res0, ( ( ( mD * tmp3 ) + ( mC * tmp5 ) ) - ( mB * tmp1 ) ) );
vmathV4SetW( &res0, ( ( ( mH * tmp2 ) + ( mG * tmp4 ) ) - ( mF * tmp0 ) ) );
/* Reciprocal of the x-members-times-res0 sum (same form as the value
 * vmathM4Determinant returns); not guarded against a zero denominator. */
detInv = ( 1.0f / ( ( ( ( mA * res0.x ) + ( mE * res0.y ) ) + ( mI * res0.z ) ) + ( mM * res0.w ) ) );
/* Partial products for res1, res3, res2 that need the FIRST tmp set;
 * they are folded into the final values after the temporaries are reused. */
vmathV4SetX( &res1, ( mI * tmp1 ) );
vmathV4SetY( &res1, ( mM * tmp0 ) );
vmathV4SetZ( &res1, ( mA * tmp1 ) );
vmathV4SetW( &res1, ( mE * tmp0 ) );
vmathV4SetX( &res3, ( mI * tmp3 ) );
vmathV4SetY( &res3, ( mM * tmp2 ) );
vmathV4SetZ( &res3, ( mA * tmp3 ) );
vmathV4SetW( &res3, ( mE * tmp2 ) );
vmathV4SetX( &res2, ( mI * tmp5 ) );
vmathV4SetY( &res2, ( mM * tmp4 ) );
vmathV4SetZ( &res2, ( mA * tmp5 ) );
vmathV4SetW( &res2, ( mE * tmp4 ) );
/* Second set of temporaries; from here on tmp0..tmp5 no longer hold the
 * values used above, so the statement order must not change. */
tmp0 = ( ( mI * mB ) - ( mA * mJ ) );
tmp1 = ( ( mM * mF ) - ( mE * mN ) );
tmp2 = ( ( mI * mD ) - ( mA * mL ) );
tmp3 = ( ( mM * mH ) - ( mE * mP ) );
tmp4 = ( ( mI * mC ) - ( mA * mK ) );
tmp5 = ( ( mM * mG ) - ( mE * mO ) );
/* Finish res2, res3, res1: each member combines new-tmp terms with the
 * partial product stored earlier (added for some members, subtracted for others). */
vmathV4SetX( &res2, ( ( ( mL * tmp1 ) - ( mJ * tmp3 ) ) + res2.x ) );
vmathV4SetY( &res2, ( ( ( mP * tmp0 ) - ( mN * tmp2 ) ) + res2.y ) );
vmathV4SetZ( &res2, ( ( ( mB * tmp3 ) - ( mD * tmp1 ) ) - res2.z ) );
vmathV4SetW( &res2, ( ( ( mF * tmp2 ) - ( mH * tmp0 ) ) - res2.w ) );
vmathV4SetX( &res3, ( ( ( mJ * tmp5 ) - ( mK * tmp1 ) ) + res3.x ) );
vmathV4SetY( &res3, ( ( ( mN * tmp4 ) - ( mO * tmp0 ) ) + res3.y ) );
vmathV4SetZ( &res3, ( ( ( mC * tmp1 ) - ( mB * tmp5 ) ) - res3.z ) );
vmathV4SetW( &res3, ( ( ( mG * tmp0 ) - ( mF * tmp4 ) ) - res3.w ) );
vmathV4SetX( &res1, ( ( ( mK * tmp3 ) - ( mL * tmp5 ) ) - res1.x ) );
vmathV4SetY( &res1, ( ( ( mO * tmp2 ) - ( mP * tmp4 ) ) - res1.y ) );
vmathV4SetZ( &res1, ( ( ( mD * tmp5 ) - ( mC * tmp3 ) ) + res1.z ) );
vmathV4SetW( &res1, ( ( ( mH * tmp4 ) - ( mG * tmp2 ) ) + res1.w ) );
/* Scale each vector by detInv and store it as the matching result column
 * (vmathV4ScalarMul is defined elsewhere; presumably dst = src * scalar). */
vmathV4ScalarMul( &result->col0, &res0, detInv );
vmathV4ScalarMul( &result->col1, &res1, detInv );
vmathV4ScalarMul( &result->col2, &res2, detInv );
vmathV4ScalarMul( &result->col3, &res3, detInv );
}
编写库的人显然非常了解数学,但是如果您进行大量数学运算并且编译器可能内联所有这些函数,您不会得到更大的文件吗?
您很可能会得到一个更大的文件,因为要内联的代码将在整个程序中多次出现,而不是只出现一次。
如果您想提高性能,更大的文件并不意味着太多,特别是在 TB 磁盘时代。最好拥有一个更大的文件,而不是招致不必要的多个函数调用的开销。
首先,编译器不会内联每个标有 static 的函数,这并不是 static 关键字的目的。inline 关键字才是为此而设的,但现在很多编译器都会忽略它。
编译器会仔细决定是否内联函数更好。但基本上你的观察是正确的:为最大速度优化的程序往往更大。例如,查看 GCC 的优化级别(Optimization Levels)就可以看到这一点。
For small functions that are called frequently that can make a big performance difference.
如果函数小到为调用它而压栈(建立栈帧)所花的时间比实际执行函数体的时间还长,那么函数调用的开销就可能成为性能问题。在这种情况下,一个好的编译器会将该函数内联。但是,如果压栈只是执行该函数时开销最小的部分,则不太可能发生内联。
内联替换的主要缺点是它通常会使程序代码变大。在极端情况下,这会增加页面错误和缓存未命中,从而降低程序性能。从磁盘读取一页可能需要执行数十万条指令。较差的缓存性能可能会使程序速度减慢两倍。内联时必须合理小心,不要使程序太大以至于分页或缓存问题主导执行时间。
我读了另一个 Stack Overflow 问答,觉得它讲得很有道理,但我还想在此基础上补充一个问题。
投票最多的答案是
For small functions that are called frequently that can make a big performance difference.
好的,那什么才算"小函数"呢?
我之所以这样问,是因为我正在考虑使用一个数学库,即 Bullet 物理框架中的 vectormath(矢量数学)库。它的所有数学函数都声明为 static inline,但有些很短,有些很长。
以下是我认为的简短内容:
/*
 * Copy a three-column matrix, one column at a time.
 *
 * result: destination; its col0, col1 and col2 are overwritten.
 * mat:    source; only read.
 *
 * Each column is passed to vmathV3Copy in order col0, col1, col2.
 */
static inline void vmathM3Copy( VmathMatrix3 *result, const VmathMatrix3 *mat )
{
    VmathMatrix3 *const dst = result;
    const VmathMatrix3 *const src = mat;

    vmathV3Copy( &dst->col0, &src->col0 );
    vmathV3Copy( &dst->col1, &src->col1 );
    vmathV3Copy( &dst->col2, &src->col2 );
}
但即便如此,它也会把下面这个函数内联 3 次:
/*
 * Copy the x, y and z members of *vec into *result.
 *
 * result: destination; only its x, y and z members are assigned.
 * vec:    source; only read.
 */
static inline void vmathV3Copy( VmathVector3 *result, const VmathVector3 *vec )
{
    VmathVector3 *const dst = result;
    const VmathVector3 *const src = vec;

    /* Member-wise on purpose: exactly three assignments, in x, y, z order. */
    dst->x = src->x;
    dst->y = src->y;
    dst->z = src->z;
}
以下是我觉得比较长的内容:
/*
 * Compute the determinant of a matrix stored as four columns
 * (col0..col3), each with x, y, z, w members.
 *
 * mat: source matrix; only read.
 *
 * Returns mA*dx + mE*dy + mI*dz + mM*dw, where mA, mE, mI, mM are the x
 * members of the four columns and dx..dw are built from the remaining
 * twelve elements.
 */
static inline float vmathM4Determinant( const VmathMatrix4 *mat )
{
    /* Load all sixteen elements into locals, column by column. */
    const float mA = mat->col0.x;
    const float mB = mat->col0.y;
    const float mC = mat->col0.z;
    const float mD = mat->col0.w;
    const float mE = mat->col1.x;
    const float mF = mat->col1.y;
    const float mG = mat->col1.z;
    const float mH = mat->col1.w;
    const float mI = mat->col2.x;
    const float mJ = mat->col2.y;
    const float mK = mat->col2.z;
    const float mL = mat->col2.w;
    const float mM = mat->col3.x;
    const float mN = mat->col3.y;
    const float mO = mat->col3.z;
    const float mP = mat->col3.w;

    /* Six difference-of-products terms, each reused by two of dx..dw. */
    const float tmp0 = ( mK * mD ) - ( mC * mL );
    const float tmp1 = ( mO * mH ) - ( mG * mP );
    const float tmp2 = ( mB * mK ) - ( mJ * mC );
    const float tmp3 = ( mF * mO ) - ( mN * mG );
    const float tmp4 = ( mJ * mD ) - ( mB * mL );
    const float tmp5 = ( mN * mH ) - ( mF * mP );

    /* Factors that multiply the x member of each column in the final sum. */
    const float dx = ( ( mJ * tmp1 ) - ( mL * tmp3 ) ) - ( mK * tmp5 );
    const float dy = ( ( mN * tmp0 ) - ( mP * tmp2 ) ) - ( mO * tmp4 );
    const float dz = ( ( mD * tmp3 ) + ( mC * tmp5 ) ) - ( mB * tmp1 );
    const float dw = ( ( mH * tmp2 ) + ( mG * tmp4 ) ) - ( mF * tmp0 );

    /* Summation order is kept left-to-right so float rounding is unchanged. */
    return ( ( ( mA * dx ) + ( mE * dy ) ) + ( mI * dz ) ) + ( mM * dw );
}
甚至这个
/*
 * Compute a 4x4 matrix inverse: four vectors res0..res3 are built from the
 * elements of *mat, each is scaled by detInv, and the results are stored in
 * result->col0..col3.
 *
 * result: destination; col0..col3 are written at the very end.
 * mat:    source; all sixteen elements are copied into locals first and
 *         mat is not read again after that.
 *
 * detInv is a plain 1.0f / expression with no zero check in this function.
 * NOTE(review): presumably callers must pass an invertible matrix - confirm.
 */
static inline void vmathM4Inverse( VmathMatrix4 *result, const VmathMatrix4 *mat )
{
VmathVector4 res0, res1, res2, res3;
float mA, mB, mC, mD, mE, mF, mG, mH, mI, mJ, mK, mL, mM, mN, mO, mP, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, detInv;
/* Load the sixteen matrix elements, column by column. */
mA = mat->col0.x;
mB = mat->col0.y;
mC = mat->col0.z;
mD = mat->col0.w;
mE = mat->col1.x;
mF = mat->col1.y;
mG = mat->col1.z;
mH = mat->col1.w;
mI = mat->col2.x;
mJ = mat->col2.y;
mK = mat->col2.z;
mL = mat->col2.w;
mM = mat->col3.x;
mN = mat->col3.y;
mO = mat->col3.z;
mP = mat->col3.w;
/* First set of difference-of-products temporaries (overwritten further down). */
tmp0 = ( ( mK * mD ) - ( mC * mL ) );
tmp1 = ( ( mO * mH ) - ( mG * mP ) );
tmp2 = ( ( mB * mK ) - ( mJ * mC ) );
tmp3 = ( ( mF * mO ) - ( mN * mG ) );
tmp4 = ( ( mJ * mD ) - ( mB * mL ) );
tmp5 = ( ( mN * mH ) - ( mF * mP ) );
/* res0 is complete after these four calls; the vmathV4Set* helpers are
 * defined elsewhere and presumably store the value in the named member. */
vmathV4SetX( &res0, ( ( ( mJ * tmp1 ) - ( mL * tmp3 ) ) - ( mK * tmp5 ) ) );
vmathV4SetY( &res0, ( ( ( mN * tmp0 ) - ( mP * tmp2 ) ) - ( mO * tmp4 ) ) );
vmathV4SetZ( &res0, ( ( ( mD * tmp3 ) + ( mC * tmp5 ) ) - ( mB * tmp1 ) ) );
vmathV4SetW( &res0, ( ( ( mH * tmp2 ) + ( mG * tmp4 ) ) - ( mF * tmp0 ) ) );
/* Reciprocal of the x-members-times-res0 sum (same form as the value
 * vmathM4Determinant returns); not guarded against a zero denominator. */
detInv = ( 1.0f / ( ( ( ( mA * res0.x ) + ( mE * res0.y ) ) + ( mI * res0.z ) ) + ( mM * res0.w ) ) );
/* Partial products for res1, res3, res2 that need the FIRST tmp set;
 * they are folded into the final values after the temporaries are reused. */
vmathV4SetX( &res1, ( mI * tmp1 ) );
vmathV4SetY( &res1, ( mM * tmp0 ) );
vmathV4SetZ( &res1, ( mA * tmp1 ) );
vmathV4SetW( &res1, ( mE * tmp0 ) );
vmathV4SetX( &res3, ( mI * tmp3 ) );
vmathV4SetY( &res3, ( mM * tmp2 ) );
vmathV4SetZ( &res3, ( mA * tmp3 ) );
vmathV4SetW( &res3, ( mE * tmp2 ) );
vmathV4SetX( &res2, ( mI * tmp5 ) );
vmathV4SetY( &res2, ( mM * tmp4 ) );
vmathV4SetZ( &res2, ( mA * tmp5 ) );
vmathV4SetW( &res2, ( mE * tmp4 ) );
/* Second set of temporaries; from here on tmp0..tmp5 no longer hold the
 * values used above, so the statement order must not change. */
tmp0 = ( ( mI * mB ) - ( mA * mJ ) );
tmp1 = ( ( mM * mF ) - ( mE * mN ) );
tmp2 = ( ( mI * mD ) - ( mA * mL ) );
tmp3 = ( ( mM * mH ) - ( mE * mP ) );
tmp4 = ( ( mI * mC ) - ( mA * mK ) );
tmp5 = ( ( mM * mG ) - ( mE * mO ) );
/* Finish res2, res3, res1: each member combines new-tmp terms with the
 * partial product stored earlier (added for some members, subtracted for others). */
vmathV4SetX( &res2, ( ( ( mL * tmp1 ) - ( mJ * tmp3 ) ) + res2.x ) );
vmathV4SetY( &res2, ( ( ( mP * tmp0 ) - ( mN * tmp2 ) ) + res2.y ) );
vmathV4SetZ( &res2, ( ( ( mB * tmp3 ) - ( mD * tmp1 ) ) - res2.z ) );
vmathV4SetW( &res2, ( ( ( mF * tmp2 ) - ( mH * tmp0 ) ) - res2.w ) );
vmathV4SetX( &res3, ( ( ( mJ * tmp5 ) - ( mK * tmp1 ) ) + res3.x ) );
vmathV4SetY( &res3, ( ( ( mN * tmp4 ) - ( mO * tmp0 ) ) + res3.y ) );
vmathV4SetZ( &res3, ( ( ( mC * tmp1 ) - ( mB * tmp5 ) ) - res3.z ) );
vmathV4SetW( &res3, ( ( ( mG * tmp0 ) - ( mF * tmp4 ) ) - res3.w ) );
vmathV4SetX( &res1, ( ( ( mK * tmp3 ) - ( mL * tmp5 ) ) - res1.x ) );
vmathV4SetY( &res1, ( ( ( mO * tmp2 ) - ( mP * tmp4 ) ) - res1.y ) );
vmathV4SetZ( &res1, ( ( ( mD * tmp5 ) - ( mC * tmp3 ) ) + res1.z ) );
vmathV4SetW( &res1, ( ( ( mH * tmp4 ) - ( mG * tmp2 ) ) + res1.w ) );
/* Scale each vector by detInv and store it as the matching result column
 * (vmathV4ScalarMul is defined elsewhere; presumably dst = src * scalar). */
vmathV4ScalarMul( &result->col0, &res0, detInv );
vmathV4ScalarMul( &result->col1, &res1, detInv );
vmathV4ScalarMul( &result->col2, &res2, detInv );
vmathV4ScalarMul( &result->col3, &res3, detInv );
}
编写库的人显然非常了解数学,但是如果您进行大量数学运算并且编译器可能内联所有这些函数,您不会得到更大的文件吗?
您很可能会得到一个更大的文件,因为要内联的代码将在整个程序中多次出现,而不是只出现一次。
如果您想提高性能,更大的文件并不意味着太多,特别是在 TB 磁盘时代。最好拥有一个更大的文件,而不是招致不必要的多个函数调用的开销。
首先,编译器不会内联每个标有 static 的函数,这并不是 static 关键字的目的。inline 关键字才是为此而设的,但现在很多编译器都会忽略它。
编译器会仔细决定是否内联函数更好。但基本上你的观察是正确的:为最大速度优化的程序往往更大。例如,查看 GCC 的优化级别(Optimization Levels)就可以看到这一点。
For small functions that are called frequently that can make a big performance difference.
如果函数小到为调用它而压栈(建立栈帧)所花的时间比实际执行函数体的时间还长,那么函数调用的开销就可能成为性能问题。在这种情况下,一个好的编译器会将该函数内联。但是,如果压栈只是执行该函数时开销最小的部分,则不太可能发生内联。
内联替换的主要缺点是它通常会使程序代码变大。在极端情况下,这会增加页面错误和缓存未命中,从而降低程序性能。从磁盘读取一页可能需要执行数十万条指令。较差的缓存性能可能会使程序速度减慢两倍。内联时必须合理小心,不要使程序太大以至于分页或缓存问题主导执行时间。