为什么编译器以不同的方式优化这些情况?

Why is the compiler optimizing these cases differently?

下面的代码片段演示了一种情况,其中以两种不同的方式在 7 字节 数据上调用 CRC32 编译器内部函数(例如 case0() & case1()) 导致不同的编译器优化。编译器优化的这些差异会产生截然不同的执行时间(例如 [Test_Construction,案例:0,字节:7])。

作为参考,我包含了以相同方式对 6 字节数据调用 CRC32 的逻辑。但是,正如您从生成的输出中看到的那样,所产生的执行时间不会受到与处理 7 字节数据时相同的性能影响。

单遍的生成输出 - 对每个感兴趣的数据大小(6 和 7 字节)进行 4 次独特测试:

Test_Construction <Case: 0, Bytes: 7>:    139.5543 ms
Test_Construction <Case: 1, Bytes: 7>:     38.6545 ms
Test_Reference    <Case: 0, Bytes: 7>:     26.2616 ms
Test_Reference    <Case: 1, Bytes: 7>:     38.8118 ms
Test_Construction <Case: 0, Bytes: 6>:     26.2925 ms
Test_Construction <Case: 1, Bytes: 6>:     29.5819 ms
Test_Reference    <Case: 0, Bytes: 6>:     25.3754 ms
Test_Reference    <Case: 1, Bytes: 6>:     28.7829 ms

我有两个问题:

  1. Why is the compiler producing different optimizations (e.g. specifically in the case of [Test_Construction, Case: 0, Bytes: 7]?
    • It looks like when [Test_Construction, Case: 0, Bytes: 7] is translated into machine code it contains additional instructions which move data from the stack into the registers and then back out on the stack. This does not seem to occur in any other scenario. CRC is then called once on data found within a register and once on data on the stack. Why would it do this?
  2. Why is the performance dropping in the first place?
    • Is it due to the additional stack logic (memory operations) found in [Test_Construction, Case: 0, Bytes: 7] machine code?
    • Could the order of the operations contribute?
    • Is there a way to stop the optimizer from producing this suboptimal machine code?

更新 1 - 2017 年 4 月 7 日:

.

template<int N>
void MemCpy(char* szDst, const char* szSrc) {
    memcpy(szDst, szSrc, N);
}

// I tried both of these alternatives to memcpy, no luck.
template<> void MemCpy<7>(char* szDst, const char* szSrc) {
    //AS4(szDst) = AS4(szSrc), AS2(szDst+4) = AS2(szSrc+4),  AS1(szDst+6) = AS1(szSrc+6);
    AS4(szDst) = AS4(szSrc), AS4(szDst+3) = AS4(szSrc+3);
}

Environment Details:

Windows Server 2012 R2 x64
Intel Xeon X5670

Assembly Reference:

-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 7>:    139.5543 ms
-------------------------------------------------------
00007FF62D7911CC  call        CBench::CBench (07FF62D791000h)  
00007FF62D7911D1  xor         r8d,r8d  
00007FF62D7911D4  lea         r10,[_a (07FF62D794630h)]  
00007FF62D7911DB  mov         r9d,1312D00h  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7911E1  mov         rax,r8  
00007FF62D7911E4  inc         r8  
00007FF62D7911E7  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7911EC  lea         rcx,[rax+rax*2]  
00007FF62D7911F0  movzx       eax,word ptr [r10+rcx*8+4]  
00007FF62D7911F6  mov         edx,dword ptr [r10+rcx*8]  
00007FF62D7911FA  mov         word ptr [rsp+44h],ax  
00007FF62D7911FF  movzx       eax,byte ptr [r10+rcx*8+6]  
00007FF62D791205  mov         byte ptr [rsp+46h],al  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791209  mov         eax,7  
00007FF62D79120E  crc32       eax,edx  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D791213  mov         dword ptr [buf],edx  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791217  crc32       eax,dword ptr [rsp+43h]  
00007FF62D79121E  add         ebx,eax  
00007FF62D791220  sub         r9,1  
00007FF62D791224  jne         Test_Func<0,7,0>+71h (07FF62D7911E1h)  
                }
                return ii;
00007FF62D791226  lea         rcx,[Bench]  
00007FF62D79122B  call        CBench::~CBench (07FF62D791030h)


-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 7>:     38.6545 ms
-------------------------------------------------------
00007FF62D7912A9  call        CBench::CBench (07FF62D791000h)  
00007FF62D7912AE  xor         r8d,r8d  
00007FF62D7912B1  lea         r10,[_a (07FF62D794630h)]  
00007FF62D7912B8  mov         r9d,1312D00h  
00007FF62D7912BE  xchg        ax,ax  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7912C0  mov         rax,r8  
00007FF62D7912C3  inc         r8  
00007FF62D7912C6  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7912CB  lea         rcx,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7912CF  movzx       eax,word ptr [r10+rcx*8+4]  
00007FF62D7912D5  movzx       edx,byte ptr [r10+rcx*8+6]  
00007FF62D7912DB  shl         rdx,10h  
00007FF62D7912DF  or          rdx,rax  
00007FF62D7912E2  mov         eax,dword ptr [r10+rcx*8]  
00007FF62D7912E6  shl         rdx,20h  
00007FF62D7912EA  or          rdx,rax  
00007FF62D7912ED  mov         eax,7  
00007FF62D7912F2  crc32       rax,rdx  
00007FF62D7912F8  add         ebx,eax  
00007FF62D7912FA  sub         r9,1  
00007FF62D7912FE  jne         Test_Func<1,7,0>+70h (07FF62D7912C0h)  
                }
                return ii;
00007FF62D791300  lea         rcx,[Bench]  
00007FF62D791305  call        CBench::~CBench (07FF62D791030h)


-------------------------------------------------------
Test_Reference    <Case: 0, Bytes: 7>:     26.2616 ms
-------------------------------------------------------
00007FF62D791386  call        CBench::CBench (07FF62D791000h)  
00007FF62D79138B  xor         edx,edx  
00007FF62D79138D  lea         r9,[_a (07FF62D794630h)]  
00007FF62D791394  mov         r8d,1312D00h  
00007FF62D79139A  nop         word ptr [rax+rax]  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A0  mov         rax,rdx  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913A3  mov         ecx,7  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A8  and         eax,3FFh  
00007FF62D7913AD  inc         rdx  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7913B0  lea         rax,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913B4  crc32       ecx,dword ptr [r9+rax*8]  
00007FF62D7913BB  crc32       ecx,dword ptr [r9+rax*8+3]  
00007FF62D7913C3  add         ebx,ecx  
00007FF62D7913C5  sub         r8,1  
00007FF62D7913C9  jne         Test_Func<0,7,1>+70h (07FF62D7913A0h)  
                }
                return ii;
00007FF62D7913CB  lea         rcx,[Bench]  
00007FF62D7913D0  call        CBench::~CBench (07FF62D791030h)  


-------------------------------------------------------    
Test_Reference    <Case: 1, Bytes: 7>:     38.8118 ms
-------------------------------------------------------
00007FF62D791449  call        CBench::CBench (07FF62D791000h)  
00007FF62D79144E  xor         r8d,r8d  
00007FF62D791451  lea         r10,[_a (07FF62D794630h)]  
00007FF62D791458  mov         r9d,1312D00h  
00007FF62D79145E  xchg        ax,ax  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791460  mov         rax,r8  
00007FF62D791463  inc         r8  
00007FF62D791466  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79146B  lea         rax,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79146F  movzx       edx,byte ptr [r10+rax*8+6]  
00007FF62D791475  lea         rcx,[r10+rax*8]  
00007FF62D791479  movzx       eax,word ptr [r10+rax*8+4]  
00007FF62D79147F  shl         rdx,10h  
00007FF62D791483  or          rdx,rax  
00007FF62D791486  mov         eax,dword ptr [rcx]  
00007FF62D791488  shl         rdx,20h  
00007FF62D79148C  or          rdx,rax  
00007FF62D79148F  mov         eax,7  
00007FF62D791494  crc32       rax,rdx  
00007FF62D79149A  add         ebx,eax  
00007FF62D79149C  sub         r9,1  
00007FF62D7914A0  jne         Test_Func<1,7,1>+70h (07FF62D791460h)  
                }
                return ii;
00007FF62D7914A2  lea         rcx,[Bench]  
00007FF62D7914A7  call        CBench::~CBench (07FF62D791030h) 


-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 6>:     26.2925 ms
-------------------------------------------------------
00007FF62D791526  call        CBench::CBench (07FF62D791000h)  
00007FF62D79152B  xor         r8d,r8d  
00007FF62D79152E  lea         r10,[_a (07FF62D794630h)]  
00007FF62D791535  mov         r9d,1312D00h  
00007FF62D79153B  nop         dword ptr [rax+rax]  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791540  mov         rax,r8  
00007FF62D791543  inc         r8  
00007FF62D791546  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79154B  lea         rcx,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79154F  mov         eax,6  
00007FF62D791554  crc32       eax,dword ptr [r10+rcx*8]  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79155B  movzx       edx,word ptr [r10+rcx*8+4]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791561  crc32       eax,dx  
00007FF62D791567  add         ebx,eax  
00007FF62D791569  sub         r9,1  
00007FF62D79156D  jne         Test_Func<0,6,0>+70h (07FF62D791540h)  
                }
                return ii;
00007FF62D79156F  lea         rcx,[Bench]  
00007FF62D791574  call        CBench::~CBench (07FF62D791030h)


-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 6>:     29.5819 ms
-------------------------------------------------------
00007FF62D7915F9  call        CBench::CBench (07FF62D791000h)  
00007FF62D7915FE  xor         r8d,r8d  
00007FF62D791601  lea         r10,[_a (07FF62D794630h)]  
00007FF62D791608  mov         r9d,1312D00h  
00007FF62D79160E  xchg        ax,ax  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791610  mov         rax,r8  
00007FF62D791613  inc         r8  
00007FF62D791616  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79161B  lea         rcx,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79161F  mov         eax,dword ptr [r10+rcx*8]  
00007FF62D791623  movzx       edx,word ptr [r10+rcx*8+4]  
00007FF62D791629  shl         rdx,20h  
00007FF62D79162D  or          rdx,rax  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791630  mov         eax,6  
00007FF62D791635  crc32       rax,rdx  
00007FF62D79163B  add         ebx,eax  
00007FF62D79163D  sub         r9,1  
00007FF62D791641  jne         Test_Func<1,6,0>+70h (07FF62D791610h)  
                }
                return ii;
00007FF62D791643  lea         rcx,[Bench]  
00007FF62D791648  call        CBench::~CBench (07FF62D791030h) 


-------------------------------------------------------
Test_Reference    <Case: 0, Bytes: 6>:     25.3754 ms
-------------------------------------------------------
00007FF62D7916C6  call        CBench::CBench (07FF62D791000h)  
00007FF62D7916CB  xor         edx,edx  
00007FF62D7916CD  lea         r9,[_a (07FF62D794630h)]  
00007FF62D7916D4  mov         r8d,1312D00h  
00007FF62D7916DA  nop         word ptr [rax+rax]  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E0  mov         rax,rdx  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916E3  mov         ecx,6  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E8  and         eax,3FFh  
00007FF62D7916ED  inc         rdx  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7916F0  lea         rax,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916F4  crc32       ecx,dword ptr [r9+rax*8]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916FB  crc32       ecx,word ptr [r9+rax*8+4]  
00007FF62D791704  add         ebx,ecx  
00007FF62D791706  sub         r8,1  
00007FF62D79170A  jne         Test_Func<0,6,1>+70h (07FF62D7916E0h)  
                }
                return ii;
00007FF62D79170C  lea         rcx,[Bench]  
00007FF62D791711  call        CBench::~CBench (07FF62D791030h)


-------------------------------------------------------
Test_Reference    <Case: 1, Bytes: 6>:     28.7829 ms
-------------------------------------------------------
00007FF62D791799  call        CBench::CBench (07FF62D791000h)  
00007FF62D79179E  xor         edx,edx  
00007FF62D7917A0  lea         r9,[_a (07FF62D794630h)]  
00007FF62D7917A7  mov         r8d,1312D00h  
00007FF62D7917AD  nop         dword ptr [rax]  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7917B0  mov         rax,rdx  
00007FF62D7917B3  inc         rdx  
00007FF62D7917B6  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7917BB  lea         rax,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7917BF  movzx       ecx,word ptr [r9+rax*8+4]  
00007FF62D7917C5  mov         eax,dword ptr [r9+rax*8]  
00007FF62D7917C9  shl         rcx,20h  
00007FF62D7917CD  or          rcx,rax  
00007FF62D7917D0  mov         eax,6  
00007FF62D7917D5  crc32       rax,rcx  
00007FF62D7917DB  add         ebx,eax  
00007FF62D7917DD  sub         r8,1  
00007FF62D7917E1  jne         Test_Func<1,6,1>+70h (07FF62D7917B0h)  
                }
                return ii;
00007FF62D7917E3  lea         rcx,[Bench]  
00007FF62D7917E8  call        CBench::~CBench (07FF62D791030h)

Source Code:

#include <Windows.h>
#include "new"
#include <cstdio>
#include <intrin.h>

#define DimensionOf(x)      (sizeof(x)/sizeof(*(x)))
#define INL                 __forceinline
#define NOINL               __declspec(noinline)
#define PASSES              20000000
#define AS1(a_)             (*(U1*)(a_))
#define AS2(a_)             (*(U2*)(a_))
#define AS3(a_)             ((U4(AS1((char*)(a_) + 2))<<16) | AS2(a_))
#define AS4(a_)             (*(U4*)(a_))
#define AS6(a_)             ((U8(AS2((char*)(a_) + 4))<<32) | AS4(a_))
#define AS7(a_)             ((U8(AS3((char*)(a_) + 4))<<32) | AS4(a_))

typedef unsigned char       U1;
typedef unsigned short      U2;
typedef unsigned int        U4;
typedef unsigned long long  U8;

typedef char TData[24];
TData _a[0x400];

// CBench is for benchmarking code
class CBench {
    __int64     m_nStart;
    const char* m_desc;
public:
    // No inline declared 
    // Reasoning:   Simplifies the assembly code. 
    //              Easier to see how the optimizer optimizes different variations of an algorithm.
    NOINL CBench(const char *szDesc) 
        : m_desc(szDesc), m_nStart(GetBenchMark()) { }

    NOINL ~CBench() {
        __int64 cpuFreq, deltaTime(GetBenchMark() - m_nStart);
        QueryPerformanceFrequency((LARGE_INTEGER*) &cpuFreq);
        double execTimeInMS = ((double) deltaTime * 1000) / cpuFreq;
        printf("%s:\t%10.4f ms\n", m_desc, execTimeInMS);
    }

    NOINL static __int64 GetBenchMark(void) {
        __int64 nBenchMark;
        QueryPerformanceCounter((LARGE_INTEGER*) &nBenchMark);
        return nBenchMark;
    }
};

// CFunc executes CRC32 intrinsics on 6 & 7 bytes in two different ways
template <int N>
struct CFunc {
    char m_ach[N];
    INL CFunc(const char* sz) {
        memcpy(m_ach, sz, N);
    }
    INL U4 Case0() {
        return (N == 7) ? _mm_crc32_u32(_mm_crc32_u32(N, AS4(m_ach)), AS4(m_ach + 3))
                        : _mm_crc32_u16(_mm_crc32_u32(N, AS4(m_ach)), AS2(m_ach + 4));
    }
    INL U4 Case1() {
        return (N == 7) ? (U4) _mm_crc32_u64(N, AS7(m_ach))
                        : (U4) _mm_crc32_u64(N, AS6(m_ach));
    }

};

// Evaluates performance dependent on:
//  -   CASE    :   CRC procedure
//  -   N       :   Number of bytes
//  -   USEREF  :   True,   reference to pre-existing CFunc object
//                  False,  constructing new CFunc object
template<U4 CASE, int N, bool USEREF>
NOINL int Test_Func(int ii) {
    char szDesc[64], buf[64];
    (USEREF) ? sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Reference", CASE, N) 
             : sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Construction", CASE, N);
    CBench Bench(szDesc);
    for (int iPass = 0; iPass < PASSES; ++iPass) {
        int i = iPass & (DimensionOf(_a) - 1);
        auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
        ii += (CASE == 1) ? x.Case1() : x.Case0();
    }
    return ii;
}

int main(int argc, char* argv[]) {
    for (int i = 0; i < 10; ++i) {
        printf("\n>>>>\tPass %d:\n", i);
        // Execute CRC on 7 bytes
        // Construct CFunc Object
        argc = Test_Func<0, 7, false>(argc);
        argc = Test_Func<1, 7, false>(argc);
        // Reference pre-existing CFunc Object
        argc = Test_Func<0, 7, true>(argc);
        argc = Test_Func<1, 7, true>(argc);

        // Execute CRC on 6 bytes
        // Construct CFunc Object
        argc = Test_Func<0, 6, false>(argc);
        argc = Test_Func<1, 6, false>(argc);
        // Reference pre-existing CFunc Object
        argc = Test_Func<0, 6, true>(argc);
        argc = Test_Func<1, 6, true>(argc);
    }
    printf("\n\nDone\n");
    return argc;
}

编译器用于将数据复制到 7 字节缓冲区的操作以不同于 crc32 调用要求的方式填充寄存器。编译器必须去堆栈获取 crc32 调用所需的寄存器。没有不需要完全写入堆栈的 1、2、4 字节读取和写入的组合。当我将 7 个字节复制到 8 字节缓冲区,用第二个未对齐的 4 字节 mov 复制中间字节时,编译器看到 2 个寄存器已经为 crc32 调用填充并消除了堆栈 read/write。

125.997 毫秒: 使用 memcpy,它执行对齐复制和未对齐的 crc32:

memcpy(buf, _a[i], 7);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
    movzx       eax,word ptr [_a[i]+4]  
    mov         edx,dword ptr [_a[i]]  
    mov         word ptr [buf+4],ax  
    movzx       eax,byte ptr [_a[i]+6]  
    mov         byte ptr [buf+6],al  
    xor         eax,eax  
    crc32       eax,edx  
    mov         dword ptr [buf],edx  
    crc32       eax,dword ptr [buf+3]  

第一次调用crc32可以使用副本中的寄存器edx,但是第二次调用没有准备好寄存器。它需要 DWORD、WORD 和 BYTE movs 到 buf 的结果。最重要的是,我怀疑编译器看到这里发生了一堆别名并变得保守。编译器别无选择,只能在堆栈上构建buf,然后访问它。

137.044 毫秒: memcpy<7>,未对齐的重叠复制到 7 个 char buf,遇到同样的问题。复制步骤涉及的寄存器不是crc32步骤需要的寄存器。它有更多未对齐的访问,所以它慢了一点:

AS4(buf) = AS4(_a[i]), AS4(buf + 3) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
    mov         eax,dword ptr [_a[i]]  
    mov         ecx,dword ptr [_a[i]+3]  
    mov         dword ptr [buf],eax  
    xor         eax,eax  
    mov         dword ptr [buf+3],ecx  
    crc32       eax,dword ptr [buf]  
    crc32       eax,ecx 

16.733 毫秒: 未对齐的重叠访问源但未重叠到 8 字节的目标缓冲区中,看到了巨大的改进!在这种情况下,我们复制中间字节两次,但我们从不为 buf 中的 DWORDS 设置别名。如果 _a[i] = "1234567",则 buf 将为 "12344567":

AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 4));
    xor         eax,eax  
    crc32       eax,dword ptr [_a[i]]  
    crc32       eax,dword ptr [_a[i]+3] 

将第一个DWORD复制到buf的调用和将第二个DWORD复制到buf + 4的调用使用2个独立的寄存器,可以直接传递给crc32,所以不需要使用buf。后续传递中的优化器会注意到未使用的数据已移至堆栈并删除相关操作。

121.500 毫秒: 然后我在 8 个字符的 buf 上尝试了 64 位 crc,以与上面相同的方式构建,但损失很大。编译器没有使用单个 8 字节寄存器来移动到 buf。

AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u64(0, AS8(buf));
    mov         eax,dword ptr [_a[i]]  
    mov         dword ptr [buf],eax  
    mov         eax,dword ptr [_a[i]+3]  
    mov         dword ptr [buf+3],eax  
    xor         eax,eax  
    crc32       rax,qword ptr [buf]

20.799 毫秒: 我将 buf 的移动更改为 8 字节而不是 2 x 4 字节。这停止使用堆栈,但仍然低于上面的第三种方法:

AS8(buf) = AS4(_a[i]) | ((U8)AS4(_a[i] + 3) << 32);
ii += _mm_crc32_u64(0, AS8(buf));
    mov         ecx,dword ptr [_a[i]+3]  
    mov         eax,dword ptr [_a[i]]  
    shl         rcx,20h  
    or          rcx,rax  
    xor         eax,eax  
    crc32       rax,rcx 

1 耗时:125.997 毫秒 2 花费:137.044 毫秒 3 花费:16.733 毫秒 4 花费:121.500 毫秒 5 花费:20.799 毫秒