C 编译器循环展开说明
C compiler loop unrolling clarification
我无法理解 MSVC 编译器如何展开以下循环(抱歉我对汇编语言的理解很差):
#define NUM_ITERATIONS (1000 * 1000 * 1000)
double dummySum = 0;
for (int x = 0; x < NUM_ITERATIONS; x++) {
if (x & 1)
dummySum += x;
}
这是生成的汇编代码:
00007FF7B4511070 xorps xmm1,xmm1
double dummySum = 0;
00007FF7B4511073 mov ecx,2
00007FF7B4511078 nop dword ptr [rax+rax]
if (x & 1)
00007FF7B4511080 lea eax,[rcx-2]
00007FF7B4511083 mov r8d,eax
00007FF7B4511086 and r8d,1
00007FF7B451108A je someTest+28h (07FF7B4511098h)
dummySum += x;
00007FF7B451108C movd xmm0,eax
00007FF7B4511090 cvtdq2pd xmm0,xmm0
00007FF7B4511094 addsd xmm1,xmm0
if (x & 1)
00007FF7B4511098 lea edx,[rcx-1]
00007FF7B451109B and edx,1
00007FF7B451109E je someTest+3Fh (07FF7B45110AFh)
dummySum += x;
00007FF7B45110A0 lea eax,[rcx-1]
00007FF7B45110A3 movd xmm0,eax
00007FF7B45110A7 cvtdq2pd xmm0,xmm0
00007FF7B45110AB addsd xmm1,xmm0
00007FF7B45110AF test r8d,r8d
if (x & 1)
00007FF7B45110B2 je someTest+50h (07FF7B45110C0h)
dummySum += x;
00007FF7B45110B4 movd xmm0,ecx
00007FF7B45110B8 cvtdq2pd xmm0,xmm0
00007FF7B45110BC addsd xmm1,xmm0
00007FF7B45110C0 test edx,edx
if (x & 1)
00007FF7B45110C2 je someTest+63h (07FF7B45110D3h)
dummySum += x;
00007FF7B45110C4 lea eax,[rcx+1]
00007FF7B45110C7 movd xmm0,eax
00007FF7B45110CB cvtdq2pd xmm0,xmm0
00007FF7B45110CF addsd xmm1,xmm0
00007FF7B45110D3 test r8d,r8d
if (x & 1)
00007FF7B45110D6 je someTest+77h (07FF7B45110E7h)
dummySum += x;
00007FF7B45110D8 lea eax,[rcx+2]
00007FF7B45110DB movd xmm0,eax
00007FF7B45110DF cvtdq2pd xmm0,xmm0
00007FF7B45110E3 addsd xmm1,xmm0
00007FF7B45110E7 test edx,edx
if (x & 1)
00007FF7B45110E9 je someTest+8Ah (07FF7B45110FAh)
dummySum += x;
00007FF7B45110EB lea eax,[rcx+3]
00007FF7B45110EE movd xmm0,eax
00007FF7B45110F2 cvtdq2pd xmm0,xmm0
00007FF7B45110F6 addsd xmm1,xmm0
00007FF7B45110FA test r8d,r8d
if (x & 1)
00007FF7B45110FD je someTest+9Eh (07FF7B451110Eh)
dummySum += x;
00007FF7B45110FF lea eax,[rcx+4]
00007FF7B4511102 movd xmm0,eax
00007FF7B4511106 cvtdq2pd xmm0,xmm0
00007FF7B451110A addsd xmm1,xmm0
00007FF7B451110E test edx,edx
if (x & 1)
00007FF7B4511110 je someTest+0B1h (07FF7B4511121h)
dummySum += x;
00007FF7B4511112 lea eax,[rcx+5]
00007FF7B4511115 movd xmm0,eax
00007FF7B4511119 cvtdq2pd xmm0,xmm0
00007FF7B451111D addsd xmm1,xmm0
00007FF7B4511121 test r8d,r8d
if (x & 1)
00007FF7B4511124 je someTest+0C5h (07FF7B4511135h)
dummySum += x;
00007FF7B4511126 lea eax,[rcx+6]
00007FF7B4511129 movd xmm0,eax
00007FF7B451112D cvtdq2pd xmm0,xmm0
00007FF7B4511131 addsd xmm1,xmm0
00007FF7B4511135 test edx,edx
if (x & 1)
00007FF7B4511137 je someTest+0D8h (07FF7B4511148h)
dummySum += x;
00007FF7B4511139 lea eax,[rcx+7]
00007FF7B451113C movd xmm0,eax
00007FF7B4511140 cvtdq2pd xmm0,xmm0
00007FF7B4511144 addsd xmm1,xmm0
for (int x = 0; x < NUM_ITERATIONS; x++) {
00007FF7B4511148 add ecx,0Ah
00007FF7B451114B lea eax,[rcx-2]
00007FF7B451114E cmp eax,3B9ACA00h
00007FF7B4511153 jl someTest+10h (07FF7B4511080h)
}
我理解这部分(循环的开始):
// if (x % 2 == 0) jump over the summation
00007FF7B4511073 mov ecx,2 // ecx/rcx = 2
00007FF7B4511080 lea eax,[rcx-2] // eax = rcx - 2
00007FF7B4511083 mov r8d,eax // r8d = eax
00007FF7B4511086 and r8d,1 // r8d = r8d & 1
00007FF7B451108A je someTest+28h (07FF7B4511098h) // jump if zero
// add double
00007FF7B451108C movd xmm0,eax
00007FF7B4511090 cvtdq2pd xmm0,xmm0
00007FF7B4511094 addsd xmm1,xmm0
但我不明白后续的跳转指令为什么会跳过下一条 lea
指令。如果只看跳转的目标地址(假设每次跳转都发生),就会得到下面的序列 - 注意我在下面的列表中省略了跳转之间的指令:
00007FF7B45110C0 test edx,edx
00007FF7B45110C2 je someTest+63h (07FF7B45110D3h)
... addresses in between omitted ...
00007FF7B45110D3 test r8d,r8d
00007FF7B45110D6 je someTest+77h (07FF7B45110E7h)
... addresses in between omitted ...
00007FF7B45110E7 test edx,edx
00007FF7B45110E9 je someTest+8Ah (07FF7B45110FAh)
... addresses in between omitted ...
00007FF7B45110FA test r8d,r8d
00007FF7B45110FD je someTest+9Eh (07FF7B451110Eh)
... addresses in between omitted ...
00007FF7B451110E test edx,edx
00007FF7B4511110 je someTest+0B1h (07FF7B4511121h)
如果每次跳转都发生,似乎只是交替 test r8d,r8d
和 test edx,edx
指令,而不加载下一个值。
我在这里解释错了什么?
好的,我明白了,我一步一步地拆解了这段代码;编译器相当聪明。循环被展开为每次迭代执行 10 个原始循环体(x 从 rcx-2 到 rcx+7),而指令的排列方式使得 r8d
和 edx
在每次迭代中只各计算一次:
lea eax,[rcx-2]
mov r8d,eax
and r8d,1 // r8d is 0 here
...
lea edx,[rcx-1]
and edx,1 // edx is 1 here
在那之后,这些寄存器在本次迭代的其余部分不会被重新计算,因为编译器显然意识到 x & 1
的结果随 x 的奇偶交替出现:偶数步为 0(复用 r8d),奇数步为 1(复用 edx):
00007FF7B45110C0 test edx,edx // always 1
00007FF7B45110C2 je someTest+63h (07FF7B45110D3h)
... addresses in between omitted ...
00007FF7B45110D3 test r8d,r8d // always 0
00007FF7B45110D6 je someTest+77h (07FF7B45110E7h)
... addresses in between omitted ...
00007FF7B45110E7 test edx,edx // always 1
00007FF7B45110E9 je someTest+8Ah (07FF7B45110FAh)
... addresses in between omitted ...
00007FF7B45110FA test r8d,r8d // always 0
00007FF7B45110FD je someTest+9Eh (07FF7B451110Eh)
... addresses in between omitted ...
00007FF7B451110E test edx,edx // always 1
00007FF7B4511110 je someTest+0B1h (07FF7B4511121h)
我无法理解 MSVC 编译器如何展开以下循环(抱歉我对汇编语言的理解很差):
#define NUM_ITERATIONS (1000 * 1000 * 1000)
double dummySum = 0;
for (int x = 0; x < NUM_ITERATIONS; x++) {
if (x & 1)
dummySum += x;
}
这是生成的汇编代码:
00007FF7B4511070 xorps xmm1,xmm1
double dummySum = 0;
00007FF7B4511073 mov ecx,2
00007FF7B4511078 nop dword ptr [rax+rax]
if (x & 1)
00007FF7B4511080 lea eax,[rcx-2]
00007FF7B4511083 mov r8d,eax
00007FF7B4511086 and r8d,1
00007FF7B451108A je someTest+28h (07FF7B4511098h)
dummySum += x;
00007FF7B451108C movd xmm0,eax
00007FF7B4511090 cvtdq2pd xmm0,xmm0
00007FF7B4511094 addsd xmm1,xmm0
if (x & 1)
00007FF7B4511098 lea edx,[rcx-1]
00007FF7B451109B and edx,1
00007FF7B451109E je someTest+3Fh (07FF7B45110AFh)
dummySum += x;
00007FF7B45110A0 lea eax,[rcx-1]
00007FF7B45110A3 movd xmm0,eax
00007FF7B45110A7 cvtdq2pd xmm0,xmm0
00007FF7B45110AB addsd xmm1,xmm0
00007FF7B45110AF test r8d,r8d
if (x & 1)
00007FF7B45110B2 je someTest+50h (07FF7B45110C0h)
dummySum += x;
00007FF7B45110B4 movd xmm0,ecx
00007FF7B45110B8 cvtdq2pd xmm0,xmm0
00007FF7B45110BC addsd xmm1,xmm0
00007FF7B45110C0 test edx,edx
if (x & 1)
00007FF7B45110C2 je someTest+63h (07FF7B45110D3h)
dummySum += x;
00007FF7B45110C4 lea eax,[rcx+1]
00007FF7B45110C7 movd xmm0,eax
00007FF7B45110CB cvtdq2pd xmm0,xmm0
00007FF7B45110CF addsd xmm1,xmm0
00007FF7B45110D3 test r8d,r8d
if (x & 1)
00007FF7B45110D6 je someTest+77h (07FF7B45110E7h)
dummySum += x;
00007FF7B45110D8 lea eax,[rcx+2]
00007FF7B45110DB movd xmm0,eax
00007FF7B45110DF cvtdq2pd xmm0,xmm0
00007FF7B45110E3 addsd xmm1,xmm0
00007FF7B45110E7 test edx,edx
if (x & 1)
00007FF7B45110E9 je someTest+8Ah (07FF7B45110FAh)
dummySum += x;
00007FF7B45110EB lea eax,[rcx+3]
00007FF7B45110EE movd xmm0,eax
00007FF7B45110F2 cvtdq2pd xmm0,xmm0
00007FF7B45110F6 addsd xmm1,xmm0
00007FF7B45110FA test r8d,r8d
if (x & 1)
00007FF7B45110FD je someTest+9Eh (07FF7B451110Eh)
dummySum += x;
00007FF7B45110FF lea eax,[rcx+4]
00007FF7B4511102 movd xmm0,eax
00007FF7B4511106 cvtdq2pd xmm0,xmm0
00007FF7B451110A addsd xmm1,xmm0
00007FF7B451110E test edx,edx
if (x & 1)
00007FF7B4511110 je someTest+0B1h (07FF7B4511121h)
dummySum += x;
00007FF7B4511112 lea eax,[rcx+5]
00007FF7B4511115 movd xmm0,eax
00007FF7B4511119 cvtdq2pd xmm0,xmm0
00007FF7B451111D addsd xmm1,xmm0
00007FF7B4511121 test r8d,r8d
if (x & 1)
00007FF7B4511124 je someTest+0C5h (07FF7B4511135h)
dummySum += x;
00007FF7B4511126 lea eax,[rcx+6]
00007FF7B4511129 movd xmm0,eax
00007FF7B451112D cvtdq2pd xmm0,xmm0
00007FF7B4511131 addsd xmm1,xmm0
00007FF7B4511135 test edx,edx
if (x & 1)
00007FF7B4511137 je someTest+0D8h (07FF7B4511148h)
dummySum += x;
00007FF7B4511139 lea eax,[rcx+7]
00007FF7B451113C movd xmm0,eax
00007FF7B4511140 cvtdq2pd xmm0,xmm0
00007FF7B4511144 addsd xmm1,xmm0
for (int x = 0; x < NUM_ITERATIONS; x++) {
00007FF7B4511148 add ecx,0Ah
00007FF7B451114B lea eax,[rcx-2]
00007FF7B451114E cmp eax,3B9ACA00h
00007FF7B4511153 jl someTest+10h (07FF7B4511080h)
}
我理解这部分(循环的开始):
// if (x % 2 == 0) jump over the summation
00007FF7B4511073 mov ecx,2 // ecx/rcx = 2
00007FF7B4511080 lea eax,[rcx-2] // eax = rcx - 2
00007FF7B4511083 mov r8d,eax // r8d = eax
00007FF7B4511086 and r8d,1 // r8d = r8d & 1
00007FF7B451108A je someTest+28h (07FF7B4511098h) // jump if zero
// add double
00007FF7B451108C movd xmm0,eax
00007FF7B4511090 cvtdq2pd xmm0,xmm0
00007FF7B4511094 addsd xmm1,xmm0
但我不明白后续的跳转指令为什么会跳过下一条 lea
指令。如果只看跳转的目标地址(假设每次跳转都发生),就会得到下面的序列 - 注意我在下面的列表中省略了跳转之间的指令:
00007FF7B45110C0 test edx,edx
00007FF7B45110C2 je someTest+63h (07FF7B45110D3h)
... addresses in between omitted ...
00007FF7B45110D3 test r8d,r8d
00007FF7B45110D6 je someTest+77h (07FF7B45110E7h)
... addresses in between omitted ...
00007FF7B45110E7 test edx,edx
00007FF7B45110E9 je someTest+8Ah (07FF7B45110FAh)
... addresses in between omitted ...
00007FF7B45110FA test r8d,r8d
00007FF7B45110FD je someTest+9Eh (07FF7B451110Eh)
... addresses in between omitted ...
00007FF7B451110E test edx,edx
00007FF7B4511110 je someTest+0B1h (07FF7B4511121h)
如果每次跳转都发生,似乎只是交替 test r8d,r8d
和 test edx,edx
指令,而不加载下一个值。
我在这里解释错了什么?
好的,我明白了,我一步一步地拆解了这段代码;编译器相当聪明。循环被展开为每次迭代执行 10 个原始循环体(x 从 rcx-2 到 rcx+7),而指令的排列方式使得 r8d
和 edx
在每次迭代中只各计算一次:
lea eax,[rcx-2]
mov r8d,eax
and r8d,1 // r8d is 0 here
...
lea edx,[rcx-1]
and edx,1 // edx is 1 here
在那之后,这些寄存器在本次迭代的其余部分不会被重新计算,因为编译器显然意识到 x & 1
的结果随 x 的奇偶交替出现:偶数步为 0(复用 r8d),奇数步为 1(复用 edx):
00007FF7B45110C0 test edx,edx // always 1
00007FF7B45110C2 je someTest+63h (07FF7B45110D3h)
... addresses in between omitted ...
00007FF7B45110D3 test r8d,r8d // always 0
00007FF7B45110D6 je someTest+77h (07FF7B45110E7h)
... addresses in between omitted ...
00007FF7B45110E7 test edx,edx // always 1
00007FF7B45110E9 je someTest+8Ah (07FF7B45110FAh)
... addresses in between omitted ...
00007FF7B45110FA test r8d,r8d // always 0
00007FF7B45110FD je someTest+9Eh (07FF7B451110Eh)
... addresses in between omitted ...
00007FF7B451110E test edx,edx // always 1
00007FF7B4511110 je someTest+0B1h (07FF7B4511121h)