为什么 C# 的耗时是 C++ 的两倍,即使生成的机器代码几乎相同?
Why C# is twice as slow as C++ even though the generated machine code is nearly identical?
此代码由 .NET Core 3.0 JIT 生成,用于我手动矢量化的 C# 代码:
00007FFE6C7D2103 vmovdqu xmm5,xmmword ptr [rcx]
00007FFE6C7D2107 vmovdqu xmm6,xmmword ptr [rcx+10h]
00007FFE6C7D210C vmovdqu xmm7,xmmword ptr [rcx+20h]
00007FFE6C7D2111 vmovdqu xmm8,xmmword ptr [rcx+30h]
00007FFE6C7D2116 vpand xmm9,xmm5,xmm0
00007FFE6C7D211A vpand xmm10,xmm6,xmm0
00007FFE6C7D211E vpackusdw xmm9,xmm9,xmm10
00007FFE6C7D2123 vpslldq xmm9,xmm9,1
00007FFE6C7D2129 vpand xmm10,xmm5,xmm1
00007FFE6C7D212D vpand xmm11,xmm6,xmm1
00007FFE6C7D2131 vpackusdw xmm10,xmm10,xmm11
00007FFE6C7D2136 vpsrldq xmm5,xmm5,1
00007FFE6C7D213B vpsrldq xmm6,xmm6,1
00007FFE6C7D2140 vpand xmm5,xmm5,xmm1
00007FFE6C7D2144 vpand xmm6,xmm6,xmm1
00007FFE6C7D2148 vpackusdw xmm5,xmm5,xmm6
var low = brightness( r, g, b, redMul, greenMul, blueMul );
00007FFE6C7D214D vpmulhuw xmm9,xmm9,xmm2
00007FFE6C7D2151 vpmulhuw xmm10,xmm10,xmm3
00007FFE6C7D2155 vpmulhuw xmm5,xmm5,xmm4
00007FFE6C7D2159 vpaddusw xmm6,xmm9,xmm10
00007FFE6C7D215E vpaddusw xmm5,xmm6,xmm5
00007FFE6C7D2162 vpsrlw xmm5,xmm5,8
00007FFE6C7D2167 vpand xmm6,xmm7,xmm0
00007FFE6C7D216B vpand xmm9,xmm8,xmm0
00007FFE6C7D216F vpackusdw xmm6,xmm6,xmm9
00007FFE6C7D2174 vpslldq xmm9,xmm6,1
00007FFE6C7D2179 vpand xmm6,xmm7,xmm1
00007FFE6C7D217D vpand xmm10,xmm8,xmm1
00007FFE6C7D2181 vpackusdw xmm10,xmm6,xmm10
00007FFE6C7D2186 vpsrldq xmm6,xmm7,1
00007FFE6C7D218B vpsrldq xmm7,xmm8,1
00007FFE6C7D2191 vpand xmm6,xmm6,xmm1
00007FFE6C7D2195 vpand xmm7,xmm7,xmm1
00007FFE6C7D2199 vpackusdw xmm6,xmm6,xmm7
var hi = brightness( r, g, b, redMul, greenMul, blueMul );
00007FFE6C7D219E vpmulhuw xmm7,xmm9,xmm2
00007FFE6C7D21A2 vpmulhuw xmm8,xmm10,xmm3
00007FFE6C7D21A6 vpmulhuw xmm6,xmm6,xmm4
00007FFE6C7D21AA vpaddusw xmm7,xmm7,xmm8
00007FFE6C7D21AF vpaddusw xmm6,xmm7,xmm6
00007FFE6C7D21B3 vpsrlw xmm6,xmm6,8
00007FFE6C7D21B8 vpackuswb xmm5,xmm5,xmm6
Sse2.Store( dst, bytes );
00007FFE6C7D21BC vmovdqu xmmword ptr [rdx],xmm5
src += 64;
00007FFE6C7D21C0 add rcx,40h
dst += 16;
00007FFE6C7D21C4 add rdx,10h
while( src < srcEnd )
00007FFE6C7D21C8 cmp rcx,rax
00007FFE6C7D21CB jb 00007FFE6C7D2103
此代码由 VC++ 2015 在编译我手动矢量化的 C++ 代码时生成:
{
VecInteger r, g, b;
loadRgb( src, r, g, b );
00007FF735AD11C0 vmovdqu xmm6,xmmword ptr [rcx-10h]
00007FF735AD11C5 vmovdqu xmm7,xmmword ptr [rcx-20h]
loadRgb( src + 2, r, g, b );
00007FF735AD11CA vmovdqu xmm9,xmmword ptr [rcx]
00007FF735AD11CE vmovdqu xmm8,xmmword ptr [rcx+10h]
{
VecInteger r, g, b;
loadRgb( src, r, g, b );
00007FF735AD11D3 vpand xmm3,xmm10,xmm6
00007FF735AD11D7 vpand xmm1,xmm11,xmm6
00007FF735AD11DB vpand xmm0,xmm11,xmm7
00007FF735AD11DF vpackusdw xmm1,xmm0,xmm1
00007FF735AD11E4 vpslldq xmm2,xmm1,1
const auto low = brightness( r, g, b );
00007FF735AD11E9 vpmulhuw xmm4,xmm2,xmm12
00007FF735AD11EE vpand xmm0,xmm10,xmm7
00007FF735AD11F2 vpackusdw xmm1,xmm0,xmm3
const auto low = brightness( r, g, b );
00007FF735AD11F7 vpmulhuw xmm2,xmm1,xmm13
00007FF735AD11FC vpaddusw xmm5,xmm4,xmm2
{
VecInteger r, g, b;
loadRgb( src, r, g, b );
00007FF735AD1200 vpsrldq xmm0,xmm6,1
00007FF735AD1205 vpand xmm3,xmm0,xmm10
00007FF735AD120A vpsrldq xmm1,xmm7,1
00007FF735AD120F vpand xmm2,xmm1,xmm10
00007FF735AD1214 vpackusdw xmm0,xmm2,xmm3
const auto low = brightness( r, g, b );
00007FF735AD1219 vpmulhuw xmm3,xmm0,xmm14
00007FF735AD121E vpaddusw xmm1,xmm5,xmm3
00007FF735AD1222 vpsrlw xmm6,xmm1,8
loadRgb( src + 2, r, g, b );
00007FF735AD1227 vpand xmm2,xmm11,xmm8
00007FF735AD122C vpand xmm0,xmm11,xmm9
00007FF735AD1231 vpackusdw xmm1,xmm0,xmm2
00007FF735AD1236 vpslldq xmm2,xmm1,1
const auto hi = brightness( r, g, b );
00007FF735AD123B vpmulhuw xmm4,xmm2,xmm12
loadRgb( src + 2, r, g, b );
00007FF735AD1240 vpand xmm0,xmm10,xmm9
00007FF735AD1245 vpand xmm3,xmm10,xmm8
00007FF735AD124A vpackusdw xmm1,xmm0,xmm3
const auto hi = brightness( r, g, b );
00007FF735AD124F vpmulhuw xmm2,xmm1,xmm13
00007FF735AD1254 vpaddusw xmm5,xmm4,xmm2
loadRgb( src + 2, r, g, b );
00007FF735AD1258 vpsrldq xmm1,xmm9,1
00007FF735AD125E vpand xmm2,xmm1,xmm10
00007FF735AD1263 vpsrldq xmm0,xmm8,1
00007FF735AD1269 vpand xmm3,xmm0,xmm10
00007FF735AD126E vpackusdw xmm0,xmm2,xmm3
const auto hi = brightness( r, g, b );
00007FF735AD1273 vpmulhuw xmm3,xmm0,xmm14
00007FF735AD1278 vpaddusw xmm1,xmm5,xmm3
00007FF735AD127C vpsrlw xmm2,xmm1,8
src += 4;
00007FF735AD1281 lea rcx,[rcx+40h]
const auto bytes = packus_epi16( low, hi );
00007FF735AD1285 vpackuswb xmm0,xmm6,xmm2
VecInteger* dest = (VecInteger*)destinationBytes;
while( src < srcEnd )
00007FF735AD1289 lea rax,[rcx-20h]
storeu_all( dest, bytes );
00007FF735AD128D vmovdqu xmmword ptr [rdx],xmm0
dest++;
00007FF735AD1291 lea rdx,[rdx+10h]
00007FF735AD1295 cmp rax,r8
00007FF735AD1298 jb Sse::convertToGrayscale+80h (07FF735AD11C0h)
上面的两个片段都只包括程序的主循环。如您所见,它们的指令几乎相同,但 C# 的耗时却是 C++ 的两倍。
具体来说,使用 5.11 亿像素进行测试时,在我的 PC(AMD Ryzen 5 3600)上,C++ 代码需要 221 毫秒,而 C# 代码需要 410 毫秒。
为什么?
有关 C# 源代码,请参阅 Why is C# twice as slow as C++ even though the generated machine code is nearly identical?。
C++ 源代码:https://github.com/Const-me/IntelIntrinsics/blob/master/CppDemo/brightness.cpp https://github.com/Const-me/IntelIntrinsics/blob/master/CppDemo/brightness.inl
原因是 JIT 开销。在对 .NET 代码进行基准测试时,您应该始终丢弃第一次测量的结果,因为它包含了运行时从 IL 生成 x86 代码所花费的时间。
这是我测量 3 次而不是 1 次(对于 5.11 亿像素)后测试应用打印的内容:
#1 391.1885 ms, #2 216.985 ms, #3 235.5549 ms
源代码:https://gist.github.com/Const-me/0f0c283a0b998aa9977550d85fa33958
这些 ~220 毫秒非常接近等效 C++ 代码的性能。所以 C# SIMD 并没有那么糟糕。
此代码由 .NET Core 3.0 JIT 生成,用于我手动矢量化的 C# 代码:
00007FFE6C7D2103 vmovdqu xmm5,xmmword ptr [rcx]
00007FFE6C7D2107 vmovdqu xmm6,xmmword ptr [rcx+10h]
00007FFE6C7D210C vmovdqu xmm7,xmmword ptr [rcx+20h]
00007FFE6C7D2111 vmovdqu xmm8,xmmword ptr [rcx+30h]
00007FFE6C7D2116 vpand xmm9,xmm5,xmm0
00007FFE6C7D211A vpand xmm10,xmm6,xmm0
00007FFE6C7D211E vpackusdw xmm9,xmm9,xmm10
00007FFE6C7D2123 vpslldq xmm9,xmm9,1
00007FFE6C7D2129 vpand xmm10,xmm5,xmm1
00007FFE6C7D212D vpand xmm11,xmm6,xmm1
00007FFE6C7D2131 vpackusdw xmm10,xmm10,xmm11
00007FFE6C7D2136 vpsrldq xmm5,xmm5,1
00007FFE6C7D213B vpsrldq xmm6,xmm6,1
00007FFE6C7D2140 vpand xmm5,xmm5,xmm1
00007FFE6C7D2144 vpand xmm6,xmm6,xmm1
00007FFE6C7D2148 vpackusdw xmm5,xmm5,xmm6
var low = brightness( r, g, b, redMul, greenMul, blueMul );
00007FFE6C7D214D vpmulhuw xmm9,xmm9,xmm2
00007FFE6C7D2151 vpmulhuw xmm10,xmm10,xmm3
00007FFE6C7D2155 vpmulhuw xmm5,xmm5,xmm4
00007FFE6C7D2159 vpaddusw xmm6,xmm9,xmm10
00007FFE6C7D215E vpaddusw xmm5,xmm6,xmm5
00007FFE6C7D2162 vpsrlw xmm5,xmm5,8
00007FFE6C7D2167 vpand xmm6,xmm7,xmm0
00007FFE6C7D216B vpand xmm9,xmm8,xmm0
00007FFE6C7D216F vpackusdw xmm6,xmm6,xmm9
00007FFE6C7D2174 vpslldq xmm9,xmm6,1
00007FFE6C7D2179 vpand xmm6,xmm7,xmm1
00007FFE6C7D217D vpand xmm10,xmm8,xmm1
00007FFE6C7D2181 vpackusdw xmm10,xmm6,xmm10
00007FFE6C7D2186 vpsrldq xmm6,xmm7,1
00007FFE6C7D218B vpsrldq xmm7,xmm8,1
00007FFE6C7D2191 vpand xmm6,xmm6,xmm1
00007FFE6C7D2195 vpand xmm7,xmm7,xmm1
00007FFE6C7D2199 vpackusdw xmm6,xmm6,xmm7
var hi = brightness( r, g, b, redMul, greenMul, blueMul );
00007FFE6C7D219E vpmulhuw xmm7,xmm9,xmm2
00007FFE6C7D21A2 vpmulhuw xmm8,xmm10,xmm3
00007FFE6C7D21A6 vpmulhuw xmm6,xmm6,xmm4
00007FFE6C7D21AA vpaddusw xmm7,xmm7,xmm8
00007FFE6C7D21AF vpaddusw xmm6,xmm7,xmm6
00007FFE6C7D21B3 vpsrlw xmm6,xmm6,8
00007FFE6C7D21B8 vpackuswb xmm5,xmm5,xmm6
Sse2.Store( dst, bytes );
00007FFE6C7D21BC vmovdqu xmmword ptr [rdx],xmm5
src += 64;
00007FFE6C7D21C0 add rcx,40h
dst += 16;
00007FFE6C7D21C4 add rdx,10h
while( src < srcEnd )
00007FFE6C7D21C8 cmp rcx,rax
00007FFE6C7D21CB jb 00007FFE6C7D2103
此代码由 VC++ 2015 在编译我手动矢量化的 C++ 代码时生成:
{
VecInteger r, g, b;
loadRgb( src, r, g, b );
00007FF735AD11C0 vmovdqu xmm6,xmmword ptr [rcx-10h]
00007FF735AD11C5 vmovdqu xmm7,xmmword ptr [rcx-20h]
loadRgb( src + 2, r, g, b );
00007FF735AD11CA vmovdqu xmm9,xmmword ptr [rcx]
00007FF735AD11CE vmovdqu xmm8,xmmword ptr [rcx+10h]
{
VecInteger r, g, b;
loadRgb( src, r, g, b );
00007FF735AD11D3 vpand xmm3,xmm10,xmm6
00007FF735AD11D7 vpand xmm1,xmm11,xmm6
00007FF735AD11DB vpand xmm0,xmm11,xmm7
00007FF735AD11DF vpackusdw xmm1,xmm0,xmm1
00007FF735AD11E4 vpslldq xmm2,xmm1,1
const auto low = brightness( r, g, b );
00007FF735AD11E9 vpmulhuw xmm4,xmm2,xmm12
00007FF735AD11EE vpand xmm0,xmm10,xmm7
00007FF735AD11F2 vpackusdw xmm1,xmm0,xmm3
const auto low = brightness( r, g, b );
00007FF735AD11F7 vpmulhuw xmm2,xmm1,xmm13
00007FF735AD11FC vpaddusw xmm5,xmm4,xmm2
{
VecInteger r, g, b;
loadRgb( src, r, g, b );
00007FF735AD1200 vpsrldq xmm0,xmm6,1
00007FF735AD1205 vpand xmm3,xmm0,xmm10
00007FF735AD120A vpsrldq xmm1,xmm7,1
00007FF735AD120F vpand xmm2,xmm1,xmm10
00007FF735AD1214 vpackusdw xmm0,xmm2,xmm3
const auto low = brightness( r, g, b );
00007FF735AD1219 vpmulhuw xmm3,xmm0,xmm14
00007FF735AD121E vpaddusw xmm1,xmm5,xmm3
00007FF735AD1222 vpsrlw xmm6,xmm1,8
loadRgb( src + 2, r, g, b );
00007FF735AD1227 vpand xmm2,xmm11,xmm8
00007FF735AD122C vpand xmm0,xmm11,xmm9
00007FF735AD1231 vpackusdw xmm1,xmm0,xmm2
00007FF735AD1236 vpslldq xmm2,xmm1,1
const auto hi = brightness( r, g, b );
00007FF735AD123B vpmulhuw xmm4,xmm2,xmm12
loadRgb( src + 2, r, g, b );
00007FF735AD1240 vpand xmm0,xmm10,xmm9
00007FF735AD1245 vpand xmm3,xmm10,xmm8
00007FF735AD124A vpackusdw xmm1,xmm0,xmm3
const auto hi = brightness( r, g, b );
00007FF735AD124F vpmulhuw xmm2,xmm1,xmm13
00007FF735AD1254 vpaddusw xmm5,xmm4,xmm2
loadRgb( src + 2, r, g, b );
00007FF735AD1258 vpsrldq xmm1,xmm9,1
00007FF735AD125E vpand xmm2,xmm1,xmm10
00007FF735AD1263 vpsrldq xmm0,xmm8,1
00007FF735AD1269 vpand xmm3,xmm0,xmm10
00007FF735AD126E vpackusdw xmm0,xmm2,xmm3
const auto hi = brightness( r, g, b );
00007FF735AD1273 vpmulhuw xmm3,xmm0,xmm14
00007FF735AD1278 vpaddusw xmm1,xmm5,xmm3
00007FF735AD127C vpsrlw xmm2,xmm1,8
src += 4;
00007FF735AD1281 lea rcx,[rcx+40h]
const auto bytes = packus_epi16( low, hi );
00007FF735AD1285 vpackuswb xmm0,xmm6,xmm2
VecInteger* dest = (VecInteger*)destinationBytes;
while( src < srcEnd )
00007FF735AD1289 lea rax,[rcx-20h]
storeu_all( dest, bytes );
00007FF735AD128D vmovdqu xmmword ptr [rdx],xmm0
dest++;
00007FF735AD1291 lea rdx,[rdx+10h]
00007FF735AD1295 cmp rax,r8
00007FF735AD1298 jb Sse::convertToGrayscale+80h (07FF735AD11C0h)
上面的两个片段都只包括程序的主循环。如您所见,它们的指令几乎相同,但 C# 的耗时却是 C++ 的两倍。
具体来说,使用 5.11 亿像素进行测试时,在我的 PC(AMD Ryzen 5 3600)上,C++ 代码需要 221 毫秒,而 C# 代码需要 410 毫秒。
为什么?
有关 C# 源代码,请参阅 Why is C# twice as slow as C++ even though the generated machine code is nearly identical?。
C++ 源代码:https://github.com/Const-me/IntelIntrinsics/blob/master/CppDemo/brightness.cpp https://github.com/Const-me/IntelIntrinsics/blob/master/CppDemo/brightness.inl
原因是 JIT 开销。在对 .NET 代码进行基准测试时,您应该始终丢弃第一次测量的结果,因为它包含了运行时从 IL 生成 x86 代码所花费的时间。
这是我测量 3 次而不是 1 次(对于 5.11 亿像素)后测试应用打印的内容:
#1 391.1885 ms, #2 216.985 ms, #3 235.5549 ms
源代码:https://gist.github.com/Const-me/0f0c283a0b998aa9977550d85fa33958
这些 ~220 毫秒非常接近等效 C++ 代码的性能。所以 C# SIMD 并没有那么糟糕。