为什么编译器以不同的方式优化这些情况?
Why is the compiler optimizing these cases differently?
下面的代码片段演示了一种情况,其中以两种不同的方式在 7 字节 数据上调用 CRC32 编译器内部函数(例如 case0() & case1()) 导致不同的编译器优化。编译器优化的这些差异会产生截然不同的执行时间(例如 [Test_Construction,案例:0,字节:7])。
作为参考,我包含了以相同方式对 6 字节数据调用 CRC32 的逻辑。但是,正如您从生成的输出中看到的那样,所产生的执行时间不会受到与处理 7 字节数据时相同的性能影响。
单遍的生成输出
- 对每个感兴趣的数据大小(6 和 7 字节)进行 4 次独特测试:
Test_Construction <Case: 0, Bytes: 7>: 139.5543 ms
Test_Construction <Case: 1, Bytes: 7>: 38.6545 ms
Test_Reference <Case: 0, Bytes: 7>: 26.2616 ms
Test_Reference <Case: 1, Bytes: 7>: 38.8118 ms
Test_Construction <Case: 0, Bytes: 6>: 26.2925 ms
Test_Construction <Case: 1, Bytes: 6>: 29.5819 ms
Test_Reference <Case: 0, Bytes: 6>: 25.3754 ms
Test_Reference <Case: 1, Bytes: 6>: 28.7829 ms
我有两个问题:
- Why is the compiler producing different optimizations
(e.g. specifically in the case of [Test_Construction, Case: 0, Bytes: 7]?
- It looks like when [Test_Construction, Case: 0, Bytes: 7] is translated into machine code it contains additional instructions which move data from the stack into the registers and then back out on the stack. This does not seem to occur in any other scenario. CRC is then called once on data found within a register and once on data on the stack. Why would it do this?
- Why is the performance dropping in the first place?
- Is it due to the additional stack logic (memory operations) found in [Test_Construction, Case: 0, Bytes: 7] machine code?
- Could the order of the operations contribute?
- Is there a way to stop the optimizer from producing this suboptimal machine code?
更新 1 - 2017 年 4 月 7 日:
- @1201ProgramAlarm,johnnycrash
- 我只是想澄清一下,我想optimize/reduce生成机器码。我故意重叠 [Case: 0, Bytes: 7] 中的第 4 个字节,以便调用 CRC32_u32 两次以避免必须进行以下 3 次调用:CRC32_u32 + CRC32_u16 + CRC32_u8.
- 作为对 johnnycrash 的建议的跟进,我试图在 CFunc 的构造函数中完全删除对 memcpy 的调用,特别是在数据大小为 7 字节的情况下。直接看下面的代码。但是,这对执行时间没有影响。
.
template<int N>
void MemCpy(char* szDst, const char* szSrc) {
memcpy(szDst, szSrc, N);
}
// I tried both of these alternatives to memcpy, no luck.
template<> void MemCpy<7>(char* szDst, const char* szSrc) {
//AS4(szDst) = AS4(szSrc), AS2(szDst+4) = AS2(szSrc+4), AS1(szDst+6) = AS1(szSrc+6);
AS4(szDst) = AS4(szSrc), AS4(szDst+3) = AS4(szSrc+3);
}
Environment Details:
Windows Server 2012 R2 x64
Intel Xeon X5670
Assembly Reference:
-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 7>: 139.5543 ms
-------------------------------------------------------
00007FF62D7911CC call CBench::CBench (07FF62D791000h)
00007FF62D7911D1 xor r8d,r8d
00007FF62D7911D4 lea r10,[_a (07FF62D794630h)]
00007FF62D7911DB mov r9d,1312D00h
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7911E1 mov rax,r8
00007FF62D7911E4 inc r8
00007FF62D7911E7 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7911EC lea rcx,[rax+rax*2]
00007FF62D7911F0 movzx eax,word ptr [r10+rcx*8+4]
00007FF62D7911F6 mov edx,dword ptr [r10+rcx*8]
00007FF62D7911FA mov word ptr [rsp+44h],ax
00007FF62D7911FF movzx eax,byte ptr [r10+rcx*8+6]
00007FF62D791205 mov byte ptr [rsp+46h],al
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791209 mov eax,7
00007FF62D79120E crc32 eax,edx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D791213 mov dword ptr [buf],edx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791217 crc32 eax,dword ptr [rsp+43h]
00007FF62D79121E add ebx,eax
00007FF62D791220 sub r9,1
00007FF62D791224 jne Test_Func<0,7,0>+71h (07FF62D7911E1h)
}
return ii;
00007FF62D791226 lea rcx,[Bench]
00007FF62D79122B call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 7>: 38.6545 ms
-------------------------------------------------------
00007FF62D7912A9 call CBench::CBench (07FF62D791000h)
00007FF62D7912AE xor r8d,r8d
00007FF62D7912B1 lea r10,[_a (07FF62D794630h)]
00007FF62D7912B8 mov r9d,1312D00h
00007FF62D7912BE xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7912C0 mov rax,r8
00007FF62D7912C3 inc r8
00007FF62D7912C6 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7912CB lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7912CF movzx eax,word ptr [r10+rcx*8+4]
00007FF62D7912D5 movzx edx,byte ptr [r10+rcx*8+6]
00007FF62D7912DB shl rdx,10h
00007FF62D7912DF or rdx,rax
00007FF62D7912E2 mov eax,dword ptr [r10+rcx*8]
00007FF62D7912E6 shl rdx,20h
00007FF62D7912EA or rdx,rax
00007FF62D7912ED mov eax,7
00007FF62D7912F2 crc32 rax,rdx
00007FF62D7912F8 add ebx,eax
00007FF62D7912FA sub r9,1
00007FF62D7912FE jne Test_Func<1,7,0>+70h (07FF62D7912C0h)
}
return ii;
00007FF62D791300 lea rcx,[Bench]
00007FF62D791305 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 0, Bytes: 7>: 26.2616 ms
-------------------------------------------------------
00007FF62D791386 call CBench::CBench (07FF62D791000h)
00007FF62D79138B xor edx,edx
00007FF62D79138D lea r9,[_a (07FF62D794630h)]
00007FF62D791394 mov r8d,1312D00h
00007FF62D79139A nop word ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A0 mov rax,rdx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913A3 mov ecx,7
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A8 and eax,3FFh
00007FF62D7913AD inc rdx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7913B0 lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913B4 crc32 ecx,dword ptr [r9+rax*8]
00007FF62D7913BB crc32 ecx,dword ptr [r9+rax*8+3]
00007FF62D7913C3 add ebx,ecx
00007FF62D7913C5 sub r8,1
00007FF62D7913C9 jne Test_Func<0,7,1>+70h (07FF62D7913A0h)
}
return ii;
00007FF62D7913CB lea rcx,[Bench]
00007FF62D7913D0 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 1, Bytes: 7>: 38.8118 ms
-------------------------------------------------------
00007FF62D791449 call CBench::CBench (07FF62D791000h)
00007FF62D79144E xor r8d,r8d
00007FF62D791451 lea r10,[_a (07FF62D794630h)]
00007FF62D791458 mov r9d,1312D00h
00007FF62D79145E xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791460 mov rax,r8
00007FF62D791463 inc r8
00007FF62D791466 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79146B lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79146F movzx edx,byte ptr [r10+rax*8+6]
00007FF62D791475 lea rcx,[r10+rax*8]
00007FF62D791479 movzx eax,word ptr [r10+rax*8+4]
00007FF62D79147F shl rdx,10h
00007FF62D791483 or rdx,rax
00007FF62D791486 mov eax,dword ptr [rcx]
00007FF62D791488 shl rdx,20h
00007FF62D79148C or rdx,rax
00007FF62D79148F mov eax,7
00007FF62D791494 crc32 rax,rdx
00007FF62D79149A add ebx,eax
00007FF62D79149C sub r9,1
00007FF62D7914A0 jne Test_Func<1,7,1>+70h (07FF62D791460h)
}
return ii;
00007FF62D7914A2 lea rcx,[Bench]
00007FF62D7914A7 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 6>: 26.2925 ms
-------------------------------------------------------
00007FF62D791526 call CBench::CBench (07FF62D791000h)
00007FF62D79152B xor r8d,r8d
00007FF62D79152E lea r10,[_a (07FF62D794630h)]
00007FF62D791535 mov r9d,1312D00h
00007FF62D79153B nop dword ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791540 mov rax,r8
00007FF62D791543 inc r8
00007FF62D791546 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79154B lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79154F mov eax,6
00007FF62D791554 crc32 eax,dword ptr [r10+rcx*8]
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79155B movzx edx,word ptr [r10+rcx*8+4]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791561 crc32 eax,dx
00007FF62D791567 add ebx,eax
00007FF62D791569 sub r9,1
00007FF62D79156D jne Test_Func<0,6,0>+70h (07FF62D791540h)
}
return ii;
00007FF62D79156F lea rcx,[Bench]
00007FF62D791574 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 6>: 29.5819 ms
-------------------------------------------------------
00007FF62D7915F9 call CBench::CBench (07FF62D791000h)
00007FF62D7915FE xor r8d,r8d
00007FF62D791601 lea r10,[_a (07FF62D794630h)]
00007FF62D791608 mov r9d,1312D00h
00007FF62D79160E xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791610 mov rax,r8
00007FF62D791613 inc r8
00007FF62D791616 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79161B lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79161F mov eax,dword ptr [r10+rcx*8]
00007FF62D791623 movzx edx,word ptr [r10+rcx*8+4]
00007FF62D791629 shl rdx,20h
00007FF62D79162D or rdx,rax
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791630 mov eax,6
00007FF62D791635 crc32 rax,rdx
00007FF62D79163B add ebx,eax
00007FF62D79163D sub r9,1
00007FF62D791641 jne Test_Func<1,6,0>+70h (07FF62D791610h)
}
return ii;
00007FF62D791643 lea rcx,[Bench]
00007FF62D791648 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 0, Bytes: 6>: 25.3754 ms
-------------------------------------------------------
00007FF62D7916C6 call CBench::CBench (07FF62D791000h)
00007FF62D7916CB xor edx,edx
00007FF62D7916CD lea r9,[_a (07FF62D794630h)]
00007FF62D7916D4 mov r8d,1312D00h
00007FF62D7916DA nop word ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E0 mov rax,rdx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916E3 mov ecx,6
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E8 and eax,3FFh
00007FF62D7916ED inc rdx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7916F0 lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916F4 crc32 ecx,dword ptr [r9+rax*8]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916FB crc32 ecx,word ptr [r9+rax*8+4]
00007FF62D791704 add ebx,ecx
00007FF62D791706 sub r8,1
00007FF62D79170A jne Test_Func<0,6,1>+70h (07FF62D7916E0h)
}
return ii;
00007FF62D79170C lea rcx,[Bench]
00007FF62D791711 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 1, Bytes: 6>: 28.7829 ms
-------------------------------------------------------
00007FF62D791799 call CBench::CBench (07FF62D791000h)
00007FF62D79179E xor edx,edx
00007FF62D7917A0 lea r9,[_a (07FF62D794630h)]
00007FF62D7917A7 mov r8d,1312D00h
00007FF62D7917AD nop dword ptr [rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7917B0 mov rax,rdx
00007FF62D7917B3 inc rdx
00007FF62D7917B6 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7917BB lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7917BF movzx ecx,word ptr [r9+rax*8+4]
00007FF62D7917C5 mov eax,dword ptr [r9+rax*8]
00007FF62D7917C9 shl rcx,20h
00007FF62D7917CD or rcx,rax
00007FF62D7917D0 mov eax,6
00007FF62D7917D5 crc32 rax,rcx
00007FF62D7917DB add ebx,eax
00007FF62D7917DD sub r8,1
00007FF62D7917E1 jne Test_Func<1,6,1>+70h (07FF62D7917B0h)
}
return ii;
00007FF62D7917E3 lea rcx,[Bench]
00007FF62D7917E8 call CBench::~CBench (07FF62D791030h)
Source Code:
#include <Windows.h>
#include "new"
#include <cstdio>
#include <intrin.h>
#define DimensionOf(x) (sizeof(x)/sizeof(*(x)))
#define INL __forceinline
#define NOINL __declspec(noinline)
#define PASSES 20000000
#define AS1(a_) (*(U1*)(a_))
#define AS2(a_) (*(U2*)(a_))
#define AS3(a_) ((U4(AS1((char*)(a_) + 2))<<16) | AS2(a_))
#define AS4(a_) (*(U4*)(a_))
#define AS6(a_) ((U8(AS2((char*)(a_) + 4))<<32) | AS4(a_))
#define AS7(a_) ((U8(AS3((char*)(a_) + 4))<<32) | AS4(a_))
typedef unsigned char U1;
typedef unsigned short U2;
typedef unsigned int U4;
typedef unsigned long long U8;
typedef char TData[24];
TData _a[0x400];
// CBench is for benchmarking code
class CBench {
__int64 m_nStart;
const char* m_desc;
public:
// No inline declared
// Reasoning: Simplifies the assembly code.
// Easier to see how the optimizer optimizes different variations of an algorithm.
NOINL CBench(const char *szDesc)
: m_desc(szDesc), m_nStart(GetBenchMark()) { }
NOINL ~CBench() {
__int64 cpuFreq, deltaTime(GetBenchMark() - m_nStart);
QueryPerformanceFrequency((LARGE_INTEGER*) &cpuFreq);
double execTimeInMS = ((double) deltaTime * 1000) / cpuFreq;
printf("%s:\t%10.4f ms\n", m_desc, execTimeInMS);
}
NOINL static __int64 GetBenchMark(void) {
__int64 nBenchMark;
QueryPerformanceCounter((LARGE_INTEGER*) &nBenchMark);
return nBenchMark;
}
};
// CFunc executes CRC32 intrinsics on 6 & 7 bytes in two different ways
template <int N>
struct CFunc {
char m_ach[N];
INL CFunc(const char* sz) {
memcpy(m_ach, sz, N);
}
INL U4 Case0() {
return (N == 7) ? _mm_crc32_u32(_mm_crc32_u32(N, AS4(m_ach)), AS4(m_ach + 3))
: _mm_crc32_u16(_mm_crc32_u32(N, AS4(m_ach)), AS2(m_ach + 4));
}
INL U4 Case1() {
return (N == 7) ? (U4) _mm_crc32_u64(N, AS7(m_ach))
: (U4) _mm_crc32_u64(N, AS6(m_ach));
}
};
// Evaluates performance dependent on:
// - CASE : CRC procedure
// - N : Number of bytes
// - USEREF : True, reference to pre-existing CFunc object
// False, constructing new CFunc object
template<U4 CASE, int N, bool USEREF>
NOINL int Test_Func(int ii) {
char szDesc[64], buf[64];
(USEREF) ? sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Reference", CASE, N)
: sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Construction", CASE, N);
CBench Bench(szDesc);
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
ii += (CASE == 1) ? x.Case1() : x.Case0();
}
return ii;
}
int main(int argc, char* argv[]) {
for (int i = 0; i < 10; ++i) {
printf("\n>>>>\tPass %d:\n", i);
// Execute CRC on 7 bytes
// Construct CFunc Object
argc = Test_Func<0, 7, false>(argc);
argc = Test_Func<1, 7, false>(argc);
// Reference pre-existing CFunc Object
argc = Test_Func<0, 7, true>(argc);
argc = Test_Func<1, 7, true>(argc);
// Execute CRC on 6 bytes
// Construct CFunc Object
argc = Test_Func<0, 6, false>(argc);
argc = Test_Func<1, 6, false>(argc);
// Reference pre-existing CFunc Object
argc = Test_Func<0, 6, true>(argc);
argc = Test_Func<1, 6, true>(argc);
}
printf("\n\nDone\n");
return argc;
}
编译器用于将数据复制到 7 字节缓冲区的操作以不同于 crc32 调用要求的方式填充寄存器。编译器必须去堆栈获取 crc32 调用所需的寄存器。没有不需要完全写入堆栈的 1、2、4 字节读取和写入的组合。当我将 7 个字节复制到 8 字节缓冲区,用第二个未对齐的 4 字节 mov 复制中间字节时,编译器看到 2 个寄存器已经为 crc32 调用填充并消除了堆栈 read/write。
125.997 毫秒:
使用 memcpy,它执行对齐复制和未对齐的 crc32:
memcpy(buf, _a[i], 7);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
movzx eax,word ptr [_a[i]+4]
mov edx,dword ptr [_a[i]]
mov word ptr [buf+4],ax
movzx eax,byte ptr [_a[i]+6]
mov byte ptr [buf+6],al
xor eax,eax
crc32 eax,edx
mov dword ptr [buf],edx
crc32 eax,dword ptr [buf+3]
第一次调用crc32可以使用副本中的寄存器edx,但是第二次调用没有准备好寄存器。它需要 DWORD、WORD 和 BYTE movs 到 buf 的结果。最重要的是,我怀疑编译器看到这里发生了一堆别名并变得保守。编译器别无选择,只能在堆栈上构建buf,然后访问它。
137.044 毫秒:
memcpy<7>,未对齐的重叠复制到 7 个 char buf,遇到同样的问题。复制步骤涉及的寄存器不是crc32步骤需要的寄存器。它有更多未对齐的访问,所以它慢了一点:
AS4(buf) = AS4(_a[i]), AS4(buf + 3) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
mov eax,dword ptr [_a[i]]
mov ecx,dword ptr [_a[i]+3]
mov dword ptr [buf],eax
xor eax,eax
mov dword ptr [buf+3],ecx
crc32 eax,dword ptr [buf]
crc32 eax,ecx
16.733 毫秒:
未对齐的重叠访问源但未重叠到 8 字节的目标缓冲区中,看到了巨大的改进!在这种情况下,我们复制中间字节两次,但我们从不为 buf 中的 DWORDS 设置别名。如果 _a[i] = "1234567",则 buf 将为 "12344567":
AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 4));
xor eax,eax
crc32 eax,dword ptr [_a[i]]
crc32 eax,dword ptr [_a[i]+3]
将第一个DWORD复制到buf的调用和将第二个DWORD复制到buf + 4的调用使用2个独立的寄存器,可以直接传递给crc32,所以不需要使用buf。后续传递中的优化器会注意到未使用的数据已移至堆栈并删除相关操作。
121.500 毫秒:
然后我在 8 个字符的 buf 上尝试了 64 位 crc,以与上面相同的方式构建,但损失很大。编译器没有使用单个 8 字节寄存器来移动到 buf。
AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u64(0, AS8(buf));
mov eax,dword ptr [_a[i]]
mov dword ptr [buf],eax
mov eax,dword ptr [_a[i]+3]
mov dword ptr [buf+3],eax
xor eax,eax
crc32 rax,qword ptr [buf]
20.799 毫秒:
我将 buf 的移动更改为 8 字节而不是 2 x 4 字节。这停止使用堆栈,但仍然低于上面的第三种方法:
AS8(buf) = AS4(_a[i]) | ((U8)AS4(_a[i] + 3) << 32);
ii += _mm_crc32_u64(0, AS8(buf));
mov ecx,dword ptr [_a[i]+3]
mov eax,dword ptr [_a[i]]
shl rcx,20h
or rcx,rax
xor eax,eax
crc32 rax,rcx
1 耗时:125.997 毫秒
2 花费:137.044 毫秒
3 花费:16.733 毫秒
4 花费:121.500 毫秒
5 花费:20.799 毫秒
下面的代码片段演示了一种情况,其中以两种不同的方式在 7 字节 数据上调用 CRC32 编译器内部函数(例如 case0() & case1()) 导致不同的编译器优化。编译器优化的这些差异会产生截然不同的执行时间(例如 [Test_Construction,案例:0,字节:7])。
作为参考,我包含了以相同方式对 6 字节数据调用 CRC32 的逻辑。但是,正如您从生成的输出中看到的那样,所产生的执行时间不会受到与处理 7 字节数据时相同的性能影响。
单遍的生成输出 - 对每个感兴趣的数据大小(6 和 7 字节)进行 4 次独特测试:
Test_Construction <Case: 0, Bytes: 7>: 139.5543 ms
Test_Construction <Case: 1, Bytes: 7>: 38.6545 ms
Test_Reference <Case: 0, Bytes: 7>: 26.2616 ms
Test_Reference <Case: 1, Bytes: 7>: 38.8118 ms
Test_Construction <Case: 0, Bytes: 6>: 26.2925 ms
Test_Construction <Case: 1, Bytes: 6>: 29.5819 ms
Test_Reference <Case: 0, Bytes: 6>: 25.3754 ms
Test_Reference <Case: 1, Bytes: 6>: 28.7829 ms
我有两个问题:
- Why is the compiler producing different optimizations (e.g. specifically in the case of [Test_Construction, Case: 0, Bytes: 7]?
- It looks like when [Test_Construction, Case: 0, Bytes: 7] is translated into machine code it contains additional instructions which move data from the stack into the registers and then back out on the stack. This does not seem to occur in any other scenario. CRC is then called once on data found within a register and once on data on the stack. Why would it do this?
- Why is the performance dropping in the first place?
- Is it due to the additional stack logic (memory operations) found in [Test_Construction, Case: 0, Bytes: 7] machine code?
- Could the order of the operations contribute?
- Is there a way to stop the optimizer from producing this suboptimal machine code?
更新 1 - 2017 年 4 月 7 日:
- @1201ProgramAlarm,johnnycrash
- 我只是想澄清一下,我想optimize/reduce生成机器码。我故意重叠 [Case: 0, Bytes: 7] 中的第 4 个字节,以便调用 CRC32_u32 两次以避免必须进行以下 3 次调用:CRC32_u32 + CRC32_u16 + CRC32_u8.
- 作为对 johnnycrash 的建议的跟进,我试图在 CFunc 的构造函数中完全删除对 memcpy 的调用,特别是在数据大小为 7 字节的情况下。直接看下面的代码。但是,这对执行时间没有影响。
.
template<int N>
void MemCpy(char* szDst, const char* szSrc) {
memcpy(szDst, szSrc, N);
}
// I tried both of these alternatives to memcpy, no luck.
template<> void MemCpy<7>(char* szDst, const char* szSrc) {
//AS4(szDst) = AS4(szSrc), AS2(szDst+4) = AS2(szSrc+4), AS1(szDst+6) = AS1(szSrc+6);
AS4(szDst) = AS4(szSrc), AS4(szDst+3) = AS4(szSrc+3);
}
Environment Details:
Windows Server 2012 R2 x64
Intel Xeon X5670
Assembly Reference:
-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 7>: 139.5543 ms
-------------------------------------------------------
00007FF62D7911CC call CBench::CBench (07FF62D791000h)
00007FF62D7911D1 xor r8d,r8d
00007FF62D7911D4 lea r10,[_a (07FF62D794630h)]
00007FF62D7911DB mov r9d,1312D00h
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7911E1 mov rax,r8
00007FF62D7911E4 inc r8
00007FF62D7911E7 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7911EC lea rcx,[rax+rax*2]
00007FF62D7911F0 movzx eax,word ptr [r10+rcx*8+4]
00007FF62D7911F6 mov edx,dword ptr [r10+rcx*8]
00007FF62D7911FA mov word ptr [rsp+44h],ax
00007FF62D7911FF movzx eax,byte ptr [r10+rcx*8+6]
00007FF62D791205 mov byte ptr [rsp+46h],al
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791209 mov eax,7
00007FF62D79120E crc32 eax,edx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D791213 mov dword ptr [buf],edx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791217 crc32 eax,dword ptr [rsp+43h]
00007FF62D79121E add ebx,eax
00007FF62D791220 sub r9,1
00007FF62D791224 jne Test_Func<0,7,0>+71h (07FF62D7911E1h)
}
return ii;
00007FF62D791226 lea rcx,[Bench]
00007FF62D79122B call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 7>: 38.6545 ms
-------------------------------------------------------
00007FF62D7912A9 call CBench::CBench (07FF62D791000h)
00007FF62D7912AE xor r8d,r8d
00007FF62D7912B1 lea r10,[_a (07FF62D794630h)]
00007FF62D7912B8 mov r9d,1312D00h
00007FF62D7912BE xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7912C0 mov rax,r8
00007FF62D7912C3 inc r8
00007FF62D7912C6 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7912CB lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7912CF movzx eax,word ptr [r10+rcx*8+4]
00007FF62D7912D5 movzx edx,byte ptr [r10+rcx*8+6]
00007FF62D7912DB shl rdx,10h
00007FF62D7912DF or rdx,rax
00007FF62D7912E2 mov eax,dword ptr [r10+rcx*8]
00007FF62D7912E6 shl rdx,20h
00007FF62D7912EA or rdx,rax
00007FF62D7912ED mov eax,7
00007FF62D7912F2 crc32 rax,rdx
00007FF62D7912F8 add ebx,eax
00007FF62D7912FA sub r9,1
00007FF62D7912FE jne Test_Func<1,7,0>+70h (07FF62D7912C0h)
}
return ii;
00007FF62D791300 lea rcx,[Bench]
00007FF62D791305 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 0, Bytes: 7>: 26.2616 ms
-------------------------------------------------------
00007FF62D791386 call CBench::CBench (07FF62D791000h)
00007FF62D79138B xor edx,edx
00007FF62D79138D lea r9,[_a (07FF62D794630h)]
00007FF62D791394 mov r8d,1312D00h
00007FF62D79139A nop word ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A0 mov rax,rdx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913A3 mov ecx,7
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A8 and eax,3FFh
00007FF62D7913AD inc rdx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7913B0 lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913B4 crc32 ecx,dword ptr [r9+rax*8]
00007FF62D7913BB crc32 ecx,dword ptr [r9+rax*8+3]
00007FF62D7913C3 add ebx,ecx
00007FF62D7913C5 sub r8,1
00007FF62D7913C9 jne Test_Func<0,7,1>+70h (07FF62D7913A0h)
}
return ii;
00007FF62D7913CB lea rcx,[Bench]
00007FF62D7913D0 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 1, Bytes: 7>: 38.8118 ms
-------------------------------------------------------
00007FF62D791449 call CBench::CBench (07FF62D791000h)
00007FF62D79144E xor r8d,r8d
00007FF62D791451 lea r10,[_a (07FF62D794630h)]
00007FF62D791458 mov r9d,1312D00h
00007FF62D79145E xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791460 mov rax,r8
00007FF62D791463 inc r8
00007FF62D791466 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79146B lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79146F movzx edx,byte ptr [r10+rax*8+6]
00007FF62D791475 lea rcx,[r10+rax*8]
00007FF62D791479 movzx eax,word ptr [r10+rax*8+4]
00007FF62D79147F shl rdx,10h
00007FF62D791483 or rdx,rax
00007FF62D791486 mov eax,dword ptr [rcx]
00007FF62D791488 shl rdx,20h
00007FF62D79148C or rdx,rax
00007FF62D79148F mov eax,7
00007FF62D791494 crc32 rax,rdx
00007FF62D79149A add ebx,eax
00007FF62D79149C sub r9,1
00007FF62D7914A0 jne Test_Func<1,7,1>+70h (07FF62D791460h)
}
return ii;
00007FF62D7914A2 lea rcx,[Bench]
00007FF62D7914A7 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 6>: 26.2925 ms
-------------------------------------------------------
00007FF62D791526 call CBench::CBench (07FF62D791000h)
00007FF62D79152B xor r8d,r8d
00007FF62D79152E lea r10,[_a (07FF62D794630h)]
00007FF62D791535 mov r9d,1312D00h
00007FF62D79153B nop dword ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791540 mov rax,r8
00007FF62D791543 inc r8
00007FF62D791546 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79154B lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79154F mov eax,6
00007FF62D791554 crc32 eax,dword ptr [r10+rcx*8]
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79155B movzx edx,word ptr [r10+rcx*8+4]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791561 crc32 eax,dx
00007FF62D791567 add ebx,eax
00007FF62D791569 sub r9,1
00007FF62D79156D jne Test_Func<0,6,0>+70h (07FF62D791540h)
}
return ii;
00007FF62D79156F lea rcx,[Bench]
00007FF62D791574 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 6>: 29.5819 ms
-------------------------------------------------------
00007FF62D7915F9 call CBench::CBench (07FF62D791000h)
00007FF62D7915FE xor r8d,r8d
00007FF62D791601 lea r10,[_a (07FF62D794630h)]
00007FF62D791608 mov r9d,1312D00h
00007FF62D79160E xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791610 mov rax,r8
00007FF62D791613 inc r8
00007FF62D791616 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79161B lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79161F mov eax,dword ptr [r10+rcx*8]
00007FF62D791623 movzx edx,word ptr [r10+rcx*8+4]
00007FF62D791629 shl rdx,20h
00007FF62D79162D or rdx,rax
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791630 mov eax,6
00007FF62D791635 crc32 rax,rdx
00007FF62D79163B add ebx,eax
00007FF62D79163D sub r9,1
00007FF62D791641 jne Test_Func<1,6,0>+70h (07FF62D791610h)
}
return ii;
00007FF62D791643 lea rcx,[Bench]
00007FF62D791648 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 0, Bytes: 6>: 25.3754 ms
-------------------------------------------------------
00007FF62D7916C6 call CBench::CBench (07FF62D791000h)
00007FF62D7916CB xor edx,edx
00007FF62D7916CD lea r9,[_a (07FF62D794630h)]
00007FF62D7916D4 mov r8d,1312D00h
00007FF62D7916DA nop word ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E0 mov rax,rdx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916E3 mov ecx,6
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E8 and eax,3FFh
00007FF62D7916ED inc rdx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7916F0 lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916F4 crc32 ecx,dword ptr [r9+rax*8]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916FB crc32 ecx,word ptr [r9+rax*8+4]
00007FF62D791704 add ebx,ecx
00007FF62D791706 sub r8,1
00007FF62D79170A jne Test_Func<0,6,1>+70h (07FF62D7916E0h)
}
return ii;
00007FF62D79170C lea rcx,[Bench]
00007FF62D791711 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 1, Bytes: 6>: 28.7829 ms
-------------------------------------------------------
00007FF62D791799 call CBench::CBench (07FF62D791000h)
00007FF62D79179E xor edx,edx
00007FF62D7917A0 lea r9,[_a (07FF62D794630h)]
00007FF62D7917A7 mov r8d,1312D00h
00007FF62D7917AD nop dword ptr [rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7917B0 mov rax,rdx
00007FF62D7917B3 inc rdx
00007FF62D7917B6 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7917BB lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7917BF movzx ecx,word ptr [r9+rax*8+4]
00007FF62D7917C5 mov eax,dword ptr [r9+rax*8]
00007FF62D7917C9 shl rcx,20h
00007FF62D7917CD or rcx,rax
00007FF62D7917D0 mov eax,6
00007FF62D7917D5 crc32 rax,rcx
00007FF62D7917DB add ebx,eax
00007FF62D7917DD sub r8,1
00007FF62D7917E1 jne Test_Func<1,6,1>+70h (07FF62D7917B0h)
}
return ii;
00007FF62D7917E3 lea rcx,[Bench]
00007FF62D7917E8 call CBench::~CBench (07FF62D791030h)
Source Code:
#include <Windows.h>
#include "new"
#include <cstdio>
#include <intrin.h>
#define DimensionOf(x) (sizeof(x)/sizeof(*(x)))
#define INL __forceinline
#define NOINL __declspec(noinline)
#define PASSES 20000000
#define AS1(a_) (*(U1*)(a_))
#define AS2(a_) (*(U2*)(a_))
#define AS3(a_) ((U4(AS1((char*)(a_) + 2))<<16) | AS2(a_))
#define AS4(a_) (*(U4*)(a_))
#define AS6(a_) ((U8(AS2((char*)(a_) + 4))<<32) | AS4(a_))
#define AS7(a_) ((U8(AS3((char*)(a_) + 4))<<32) | AS4(a_))
typedef unsigned char U1;
typedef unsigned short U2;
typedef unsigned int U4;
typedef unsigned long long U8;
typedef char TData[24];
TData _a[0x400];
// CBench is for benchmarking code
class CBench {
__int64 m_nStart;
const char* m_desc;
public:
// No inline declared
// Reasoning: Simplifies the assembly code.
// Easier to see how the optimizer optimizes different variations of an algorithm.
NOINL CBench(const char *szDesc)
: m_desc(szDesc), m_nStart(GetBenchMark()) { }
NOINL ~CBench() {
__int64 cpuFreq, deltaTime(GetBenchMark() - m_nStart);
QueryPerformanceFrequency((LARGE_INTEGER*) &cpuFreq);
double execTimeInMS = ((double) deltaTime * 1000) / cpuFreq;
printf("%s:\t%10.4f ms\n", m_desc, execTimeInMS);
}
NOINL static __int64 GetBenchMark(void) {
__int64 nBenchMark;
QueryPerformanceCounter((LARGE_INTEGER*) &nBenchMark);
return nBenchMark;
}
};
// CFunc executes CRC32 intrinsics on 6 & 7 bytes in two different ways
template <int N>
struct CFunc {
char m_ach[N];
INL CFunc(const char* sz) {
memcpy(m_ach, sz, N);
}
INL U4 Case0() {
return (N == 7) ? _mm_crc32_u32(_mm_crc32_u32(N, AS4(m_ach)), AS4(m_ach + 3))
: _mm_crc32_u16(_mm_crc32_u32(N, AS4(m_ach)), AS2(m_ach + 4));
}
INL U4 Case1() {
return (N == 7) ? (U4) _mm_crc32_u64(N, AS7(m_ach))
: (U4) _mm_crc32_u64(N, AS6(m_ach));
}
};
// Evaluates performance dependent on:
// - CASE : CRC procedure
// - N : Number of bytes
// - USEREF : True, reference to pre-existing CFunc object
// False, constructing new CFunc object
template<U4 CASE, int N, bool USEREF>
NOINL int Test_Func(int ii) {
char szDesc[64], buf[64];
(USEREF) ? sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Reference", CASE, N)
: sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Construction", CASE, N);
CBench Bench(szDesc);
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
ii += (CASE == 1) ? x.Case1() : x.Case0();
}
return ii;
}
int main(int argc, char* argv[]) {
for (int i = 0; i < 10; ++i) {
printf("\n>>>>\tPass %d:\n", i);
// Execute CRC on 7 bytes
// Construct CFunc Object
argc = Test_Func<0, 7, false>(argc);
argc = Test_Func<1, 7, false>(argc);
// Reference pre-existing CFunc Object
argc = Test_Func<0, 7, true>(argc);
argc = Test_Func<1, 7, true>(argc);
// Execute CRC on 6 bytes
// Construct CFunc Object
argc = Test_Func<0, 6, false>(argc);
argc = Test_Func<1, 6, false>(argc);
// Reference pre-existing CFunc Object
argc = Test_Func<0, 6, true>(argc);
argc = Test_Func<1, 6, true>(argc);
}
printf("\n\nDone\n");
return argc;
}
编译器用于将数据复制到 7 字节缓冲区的操作以不同于 crc32 调用要求的方式填充寄存器。编译器必须去堆栈获取 crc32 调用所需的寄存器。没有不需要完全写入堆栈的 1、2、4 字节读取和写入的组合。当我将 7 个字节复制到 8 字节缓冲区,用第二个未对齐的 4 字节 mov 复制中间字节时,编译器看到 2 个寄存器已经为 crc32 调用填充并消除了堆栈 read/write。
125.997 毫秒: 使用 memcpy,它执行对齐复制和未对齐的 crc32:
memcpy(buf, _a[i], 7);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
movzx eax,word ptr [_a[i]+4]
mov edx,dword ptr [_a[i]]
mov word ptr [buf+4],ax
movzx eax,byte ptr [_a[i]+6]
mov byte ptr [buf+6],al
xor eax,eax
crc32 eax,edx
mov dword ptr [buf],edx
crc32 eax,dword ptr [buf+3]
第一次调用crc32可以使用副本中的寄存器edx,但是第二次调用没有准备好寄存器。它需要 DWORD、WORD 和 BYTE movs 到 buf 的结果。最重要的是,我怀疑编译器看到这里发生了一堆别名并变得保守。编译器别无选择,只能在堆栈上构建buf,然后访问它。
137.044 毫秒: memcpy<7>,未对齐的重叠复制到 7 个 char buf,遇到同样的问题。复制步骤涉及的寄存器不是crc32步骤需要的寄存器。它有更多未对齐的访问,所以它慢了一点:
AS4(buf) = AS4(_a[i]), AS4(buf + 3) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
mov eax,dword ptr [_a[i]]
mov ecx,dword ptr [_a[i]+3]
mov dword ptr [buf],eax
xor eax,eax
mov dword ptr [buf+3],ecx
crc32 eax,dword ptr [buf]
crc32 eax,ecx
16.733 毫秒: 未对齐的重叠访问源但未重叠到 8 字节的目标缓冲区中,看到了巨大的改进!在这种情况下,我们复制中间字节两次,但我们从不为 buf 中的 DWORDS 设置别名。如果 _a[i] = "1234567",则 buf 将为 "12344567":
AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 4));
xor eax,eax
crc32 eax,dword ptr [_a[i]]
crc32 eax,dword ptr [_a[i]+3]
将第一个DWORD复制到buf的调用和将第二个DWORD复制到buf + 4的调用使用2个独立的寄存器,可以直接传递给crc32,所以不需要使用buf。后续传递中的优化器会注意到未使用的数据已移至堆栈并删除相关操作。
121.500 毫秒: 然后我在 8 个字符的 buf 上尝试了 64 位 crc,以与上面相同的方式构建,但损失很大。编译器没有使用单个 8 字节寄存器来移动到 buf。
AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u64(0, AS8(buf));
mov eax,dword ptr [_a[i]]
mov dword ptr [buf],eax
mov eax,dword ptr [_a[i]+3]
mov dword ptr [buf+3],eax
xor eax,eax
crc32 rax,qword ptr [buf]
20.799 毫秒: 我将 buf 的移动更改为 8 字节而不是 2 x 4 字节。这停止使用堆栈,但仍然低于上面的第三种方法:
AS8(buf) = AS4(_a[i]) | ((U8)AS4(_a[i] + 3) << 32);
ii += _mm_crc32_u64(0, AS8(buf));
mov ecx,dword ptr [_a[i]+3]
mov eax,dword ptr [_a[i]]
shl rcx,20h
or rcx,rax
xor eax,eax
crc32 rax,rcx
1 耗时:125.997 毫秒 2 花费:137.044 毫秒 3 花费:16.733 毫秒 4 花费:121.500 毫秒 5 花费:20.799 毫秒