C 编译器循环展开说明

C compiler loop unrolling clarification

我无法理解 MSVC 编译器如何展开以下循环(抱歉我对汇编语言的理解很差):

#define NUM_ITERATIONS (1000 * 1000 * 1000)
double dummySum = 0;

for (int x = 0; x < NUM_ITERATIONS; x++) {
    if (x & 1) 
       dummySum += x;
}

这是生成的汇编代码:

00007FF7B4511070  xorps       xmm1,xmm1  
        double dummySum = 0;
00007FF7B4511073  mov         ecx,2  
00007FF7B4511078  nop         dword ptr [rax+rax]  
        if (x & 1) 
00007FF7B4511080  lea         eax,[rcx-2]  
00007FF7B4511083  mov         r8d,eax  
00007FF7B4511086  and         r8d,1  
00007FF7B451108A  je          someTest+28h (07FF7B4511098h)  
            dummySum += x;
00007FF7B451108C  movd        xmm0,eax  
00007FF7B4511090  cvtdq2pd    xmm0,xmm0  
00007FF7B4511094  addsd       xmm1,xmm0  
        if (x & 1) 
00007FF7B4511098  lea         edx,[rcx-1]  
00007FF7B451109B  and         edx,1  
00007FF7B451109E  je          someTest+3Fh (07FF7B45110AFh)  
            dummySum += x;
00007FF7B45110A0  lea         eax,[rcx-1]  
00007FF7B45110A3  movd        xmm0,eax  
00007FF7B45110A7  cvtdq2pd    xmm0,xmm0  
00007FF7B45110AB  addsd       xmm1,xmm0  
00007FF7B45110AF  test        r8d,r8d  
        if (x & 1) 
00007FF7B45110B2  je          someTest+50h (07FF7B45110C0h)  
            dummySum += x;
00007FF7B45110B4  movd        xmm0,ecx  
00007FF7B45110B8  cvtdq2pd    xmm0,xmm0  
00007FF7B45110BC  addsd       xmm1,xmm0  
00007FF7B45110C0  test        edx,edx  
        if (x & 1) 
00007FF7B45110C2  je          someTest+63h (07FF7B45110D3h)  
            dummySum += x;
00007FF7B45110C4  lea         eax,[rcx+1]  
00007FF7B45110C7  movd        xmm0,eax  
00007FF7B45110CB  cvtdq2pd    xmm0,xmm0  
00007FF7B45110CF  addsd       xmm1,xmm0  
00007FF7B45110D3  test        r8d,r8d  
        if (x & 1) 
00007FF7B45110D6  je          someTest+77h (07FF7B45110E7h)  
            dummySum += x;
00007FF7B45110D8  lea         eax,[rcx+2]  
00007FF7B45110DB  movd        xmm0,eax  
00007FF7B45110DF  cvtdq2pd    xmm0,xmm0  
00007FF7B45110E3  addsd       xmm1,xmm0  
00007FF7B45110E7  test        edx,edx  
        if (x & 1) 
00007FF7B45110E9  je          someTest+8Ah (07FF7B45110FAh)  
            dummySum += x;
00007FF7B45110EB  lea         eax,[rcx+3]  
00007FF7B45110EE  movd        xmm0,eax  
00007FF7B45110F2  cvtdq2pd    xmm0,xmm0  
00007FF7B45110F6  addsd       xmm1,xmm0  
00007FF7B45110FA  test        r8d,r8d  
        if (x & 1) 
00007FF7B45110FD  je          someTest+9Eh (07FF7B451110Eh)  
            dummySum += x;
00007FF7B45110FF  lea         eax,[rcx+4]  
00007FF7B4511102  movd        xmm0,eax  
00007FF7B4511106  cvtdq2pd    xmm0,xmm0  
00007FF7B451110A  addsd       xmm1,xmm0  
00007FF7B451110E  test        edx,edx  
        if (x & 1) 
00007FF7B4511110  je          someTest+0B1h (07FF7B4511121h)  
            dummySum += x;
00007FF7B4511112  lea         eax,[rcx+5]  
00007FF7B4511115  movd        xmm0,eax  
00007FF7B4511119  cvtdq2pd    xmm0,xmm0  
00007FF7B451111D  addsd       xmm1,xmm0  
00007FF7B4511121  test        r8d,r8d  
        if (x & 1) 
00007FF7B4511124  je          someTest+0C5h (07FF7B4511135h)  
            dummySum += x;
00007FF7B4511126  lea         eax,[rcx+6]  
00007FF7B4511129  movd        xmm0,eax  
00007FF7B451112D  cvtdq2pd    xmm0,xmm0  
00007FF7B4511131  addsd       xmm1,xmm0  
00007FF7B4511135  test        edx,edx  
        if (x & 1) 
00007FF7B4511137  je          someTest+0D8h (07FF7B4511148h)  
            dummySum += x;
00007FF7B4511139  lea         eax,[rcx+7]  
00007FF7B451113C  movd        xmm0,eax  
00007FF7B4511140  cvtdq2pd    xmm0,xmm0  
00007FF7B4511144  addsd       xmm1,xmm0  

    for (int x = 0; x < NUM_ITERATIONS; x++) {
00007FF7B4511148  add         ecx,0Ah  
00007FF7B451114B  lea         eax,[rcx-2]  
00007FF7B451114E  cmp         eax,3B9ACA00h  
00007FF7B4511153  jl          someTest+10h (07FF7B4511080h)  
    }

我理解这部分(循环的开始):

// if (x % 2 == 0) jump over the summation

00007FF7B4511073  mov         ecx,2                          // ecx/rcx = 2
00007FF7B4511080  lea         eax,[rcx-2]                    // eax = rcx - 2
00007FF7B4511083  mov         r8d,eax                        // r8d = eax
00007FF7B4511086  and         r8d,1                          // r8d &= 1
00007FF7B451108A  je          someTest+28h (07FF7B4511098h)  // jump if zero

// add double 

00007FF7B451108C  movd        xmm0,eax  
00007FF7B4511090  cvtdq2pd    xmm0,xmm0  
00007FF7B4511094  addsd       xmm1,xmm0  

但我不明白后续的跳转指令为何会跳过下一条 lea 指令。如果按跳转的目标地址来看(假设每次跳转都发生),执行顺序如下 - 注意我在下面的列表中省略了跳转之间的指令:

00007FF7B45110C0  test        edx,edx  
00007FF7B45110C2  je          someTest+63h (07FF7B45110D3h) 

... addresses in between omitted ...

00007FF7B45110D3  test        r8d,r8d  
00007FF7B45110D6  je          someTest+77h (07FF7B45110E7h)  

... addresses in between omitted ...

00007FF7B45110E7  test        edx,edx  
00007FF7B45110E9  je          someTest+8Ah (07FF7B45110FAh)  

... addresses in between omitted ...

00007FF7B45110FA  test        r8d,r8d  
00007FF7B45110FD  je          someTest+9Eh (07FF7B451110Eh)  

... addresses in between omitted ...

00007FF7B451110E  test        edx,edx  
00007FF7B4511110  je          someTest+0B1h (07FF7B4511121h)  

如果每次跳转都发生,似乎只是在 test r8d,r8d 和 test edx,edx 两条指令之间交替,而没有加载下一个值。

我在这里理解错了什么?

好的,知道了,我一步一步地拆解了;编译器相当聪明。循环被展开为每次迭代处理 10 个 x 值,这些指令的排列方式使得 r8d 和 edx 在每次迭代中各只计算一次:

lea         eax,[rcx-2]  
mov         r8d,eax  
and         r8d,1        // r8d is 0 here
...
lea         edx,[rcx-1]  
and         edx,1        // edx is 1 here

在那之后,这些寄存器在本次迭代的其余部分不会再被重新计算,因为编译器显然意识到 x & 1 的结果随 x 的奇偶交替出现:r8d 保存偶数步的结果(为 0),edx 保存奇数步的结果(为 1),后续各步只需用 test 复用这两个值:

00007FF7B45110C0  test        edx,edx  // always 1
00007FF7B45110C2  je          someTest+63h (07FF7B45110D3h) 

... addresses in between omitted ...

00007FF7B45110D3  test        r8d,r8d  // always 0
00007FF7B45110D6  je          someTest+77h (07FF7B45110E7h)  

... addresses in between omitted ...

00007FF7B45110E7  test        edx,edx  // always 1
00007FF7B45110E9  je          someTest+8Ah (07FF7B45110FAh)  

... addresses in between omitted ...

00007FF7B45110FA  test        r8d,r8d  // always 0
00007FF7B45110FD  je          someTest+9Eh (07FF7B451110Eh)  

... addresses in between omitted ...

00007FF7B451110E  test        edx,edx  // always 1
00007FF7B4511110  je          someTest+0B1h (07FF7B4511121h)