编译器是否只完全展开外循环?
Does compiler only unroll the outer loop completely?
我尝试编译此代码并使用特定于循环的编译指示告诉编译器展开计数循环的次数。
#include <vector>
// Minimal test case: fills a 16 x 512 grid (stored flat in v) with i*j so the
// effect of `#pragma GCC unroll N` on the outer loop can be seen in the asm.
int main() {
// 8192 == 16 * 512, so every index i*512+j written below is in range.
std::vector<int> v(8192);
// Unroll request for the outer loop that follows; the post compares 8 with 16.
#pragma GCC unroll 8 // 16
for (int i = 0; i < 16; i++) {
for (int j = 0; j < 512; j++) {
v[i*512+j] = i*j;
}
}
return 0;
}
当我将 #pragma GCC unroll 8
放在外部 for
循环之前时,编译器不会展开。
# Excerpt of GCC output (GAS, Intel syntax) for the `#pragma GCC unroll 8` build.
# The function entry is not part of this excerpt: the initial values of ecx,
# rsi, rdx, rbp, xmm5 and xmm6 are set before .L3 and are not visible here.
#
# Outer loop (.L3): a single copy of the body that runs until ecx == 16 with
# ecx incremented by 1 per pass, i.e. the outer loop is NOT unrolled.
.L3:
# Broadcast the outer counter (ecx) into all four dword lanes of xmm3.
movd xmm7, ecx
# rax = store cursor for this pass.
mov rax, rsi
# Restart the per-lane vector from xmm6 (presumably {0,1,2,3} -- set outside
# this excerpt, confirm).
movdqa xmm2, xmm6
pshufd xmm3, xmm7, 0
# xmm4 = broadcast shifted right by 32 bits per qword, so the odd dword lanes
# sit in the even positions that pmuludq reads.
movdqa xmm4, xmm3
psrlq xmm4, 32
# Inner loop (.L4): one 16-byte store (four dwords) per iteration.
.L4:
movdqa xmm0, xmm2
movdqa xmm1, xmm3
# Advance the per-lane vector by xmm5 (presumably {4,4,4,4} -- confirm).
paddd xmm2, xmm5
add rax, 16
# 32-bit lane-wise multiply built from pmuludq, which only multiplies dword
# lanes 0 and 2: xmm1 gets the products of the even lanes ...
pmuludq xmm1, xmm0
# ... and xmm0 gets the products of the odd lanes (moved down by 32 bits first).
psrlq xmm0, 32
pmuludq xmm0, xmm4
# Keep the low dword of each 64-bit product (dwords 0 and 2) in the low half,
# then interleave even/odd results back into lane order 0,1,2,3.
pshufd xmm1, xmm1, 8
pshufd xmm0, xmm0, 8
punpckldq xmm1, xmm0
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L4
# Next outer pass: bump the counter, move the start and end pointers 2048 bytes on.
add ecx, 1
add rsi, 2048
lea rdx, [rax+2048]
cmp ecx, 16
jne .L3
# Epilogue: _ZdlPvm is the mangled name of operator delete(void*, unsigned long),
# called here with rdi = rbp and esi = 16384; then return 0.
# NOTE(review): 16384 bytes is half of what std::vector<int>(8192) would need
# (8192 * 4 = 32768) -- confirm which source version produced this listing.
mov rdi, rbp
mov esi, 16384
call _ZdlPvm
xor eax, eax
pop rbp
ret
但是当我将 #pragma GCC unroll 16
放在外部 for
循环之前时,编译器成功展开了外部循环。
# Excerpt of GCC output (GAS, Intel syntax) for the `#pragma GCC unroll 16` build.
# The function entry is not part of this excerpt; rbp and rbx are set before .L2.
# Here the outer loop is gone: there is one zero-fill followed by fifteen
# separate inner loops (.L3 .. .L17). Each loop stores 16 bytes per iteration
# and multiplies the per-lane vector by a fixed constant using shifts/adds/subs.
#
# xmm2 = .LC0 and xmm0 = .LC1 are constants defined outside this excerpt
# (presumably the start vector {0,1,2,3} and the step {4,4,4,4} -- confirm).
#
# .L2: zero-fill of the 2048 bytes starting at rbp (matches the i == 0 row of
# the C++ loop, where i*j is always 0). The first and last qwords are stored
# explicitly; rep stosq (rax = 0) fills from the 8-byte-aligned address rdi
# with count (rbp - rdi + 2048) >> 3.
.L2:
lea rdi, [rbp+8]
mov rcx, rbp
movdqa xmm2, XMMWORD PTR .LC0[rip]
xor eax, eax
and rdi, -8
movdqa xmm0, XMMWORD PTR .LC1[rip]
mov QWORD PTR [rbp+0], 0
lea rdx, [rbp+4096]
sub rcx, rdi
movdqa xmm1, xmm2
mov QWORD PTR [rbp+2040], 0
add ecx, 2048
shr ecx, 3
rep stosq
lea rax, [rbp+2048]
# x1: store the vector unchanged; range [rbp+2048, rbp+4096).
.L3:
movdqa xmm3, xmm1
add rax, 16
paddd xmm1, xmm0
movups XMMWORD PTR [rax-16], xmm3
cmp rax, rdx
jne .L3
lea rdx, [rbp+6144]
movdqa xmm3, xmm2
# x2: x << 1; up to rbp+6144.
.L4:
movdqa xmm1, xmm3
add rax, 16
paddd xmm3, xmm0
pslld xmm1, 1
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L4
lea rdx, [rbp+8192]
movdqa xmm3, xmm2
# x3: (x << 1) + x; up to rbp+8192.
.L5:
movdqa xmm4, xmm3
add rax, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 1
paddd xmm1, xmm4
movups XMMWORD PTR [rax-16], xmm1
cmp rdx, rax
jne .L5
mov rax, rdx
movdqa xmm3, xmm2
lea rdx, [rbp+10240]
# x4: x << 2; up to rbp+10240.
.L6:
movdqa xmm1, xmm3
add rax, 16
paddd xmm3, xmm0
pslld xmm1, 2
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L6
mov rdx, rax
movdqa xmm3, xmm2
lea rax, [rbp+12288]
# x5: (x << 2) + x; the store cursor is rdx in this loop, end is rbp+12288.
.L7:
movdqa xmm4, xmm3
add rdx, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 2
paddd xmm1, xmm4
movups XMMWORD PTR [rdx-16], xmm1
cmp rax, rdx
jne .L7
lea rdx, [rbp+14336]
movdqa xmm3, xmm2
# x6: ((x << 1) + x) << 1; up to rbp+14336.
.L8:
movdqa xmm4, xmm3
add rax, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 1
paddd xmm1, xmm4
pslld xmm1, 1
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L8
movdqa xmm3, xmm2
# x7: (x << 3) - x; the end pointer is rbx, which is set outside this excerpt
# (presumably rbp+16384 -- confirm).
.L9:
movdqa xmm4, xmm3
add rax, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 3
psubd xmm1, xmm4
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rbx
jne .L9
lea rdx, [rbp+18432]
movdqa xmm3, xmm2
# x8: x << 3; up to rbp+18432.
.L10:
movdqa xmm1, xmm3
add rax, 16
paddd xmm3, xmm0
pslld xmm1, 3
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L10
lea rdx, [rbp+20480]
movdqa xmm3, xmm2
# x9: (x << 3) + x; up to rbp+20480.
.L11:
movdqa xmm4, xmm3
add rax, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 3
paddd xmm1, xmm4
movups XMMWORD PTR [rax-16], xmm1
cmp rdx, rax
jne .L11
lea rax, [rbp+22528]
movdqa xmm3, xmm2
# x10: ((x << 2) + x) << 1; the store cursor is rdx in this loop, end is rbp+22528.
.L12:
movdqa xmm4, xmm3
add rdx, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 2
paddd xmm1, xmm4
pslld xmm1, 1
movups XMMWORD PTR [rdx-16], xmm1
cmp rax, rdx
jne .L12
lea rdx, [rbp+24576]
movdqa xmm4, xmm2
# x11: (((x << 1) + x) << 2) - x; up to rbp+24576.
.L13:
movdqa xmm3, xmm4
add rax, 16
paddd xmm4, xmm0
movdqa xmm1, xmm3
pslld xmm1, 1
paddd xmm1, xmm3
pslld xmm1, 2
psubd xmm1, xmm3
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L13
lea rdx, [rbp+26624]
movdqa xmm3, xmm2
# x12: ((x << 1) + x) << 2; up to rbp+26624.
.L14:
movdqa xmm4, xmm3
add rax, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 1
paddd xmm1, xmm4
pslld xmm1, 2
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L14
lea rdx, [rbp+28672]
movdqa xmm4, xmm2
# x13: (((x << 1) + x) << 2) + x; up to rbp+28672.
.L15:
movdqa xmm3, xmm4
add rax, 16
paddd xmm4, xmm0
movdqa xmm1, xmm3
pslld xmm1, 1
paddd xmm1, xmm3
pslld xmm1, 2
paddd xmm1, xmm3
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L15
lea rdx, [rbp+30720]
movdqa xmm3, xmm2
# x14: ((x << 3) - x) << 1; up to rbp+30720.
.L16:
movdqa xmm4, xmm3
add rax, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 3
psubd xmm1, xmm4
pslld xmm1, 1
movups XMMWORD PTR [rax-16], xmm1
cmp rdx, rax
jne .L16
mov rax, rdx
lea rdx, [rbp+32768]
# x15: (x << 4) - x; up to rbp+32768. This last loop steps xmm2 itself.
.L17:
movdqa xmm3, xmm2
add rax, 16
paddd xmm2, xmm0
movdqa xmm1, xmm3
pslld xmm1, 4
psubd xmm1, xmm3
movups XMMWORD PTR [rax-16], xmm1
cmp rdx, rax
jne .L17
# Epilogue: _ZdlPvm is the mangled name of operator delete(void*, unsigned long),
# called here with rdi = rbp and esi = 16384; then return 0.
# NOTE(review): the loops above store up to rbp+32768 but the size passed to
# operator delete is 16384 -- the listing may come from a source version with a
# smaller vector than std::vector<int>(8192); confirm.
mov rdi, rbp
mov esi, 16384
call _ZdlPvm
add rsp, 8
xor eax, eax
pop rbx
pop rbp
ret
那么编译器是否只完全展开外循环?
GCC 版本:g++(Compiler-Explorer-Build-gcc-b8ef019ab938471f7f877a1eee3a6374fd8a6ae9-binutils-2.36.1)12.0.0 20211029(实验)
选项:-O2
https://godbolt.org/z/PT6T1691W 似乎 -O2 -funroll-loops
可以解决问题,显然需要启用该选项才能让 pragma 告诉 GCC 展开多少。 (更新: 或者至少使它具有 一些 效果。查看评论,这似乎还不是一个完整的答案。)
(-funroll-loops
默认情况下不打开,除非你使用 -fprofile-use
,即先用 -fprofile-generate
构建,并用具有代表性的输入运行程序之后。很久以前它曾在 -O3 下默认打开,但代码膨胀带来的 I-cache 压力往往会让不热的循环变得更糟。这导致了一种本末倒置的情况:GCC 花费大部分时间的循环只是几条 SIMD 指令,而完全展开的标量序言/尾声的指令数量却是它的 10 倍,尤其是使用更宽的向量时。即使使用 AVX-512,GCC 通常也只是用标量代码处理剩余的零散元素,而不是创建掩码。:/)
完全展开循环是 GCC 即使在 -O2
也会做的事情,至少对于非常小的行程计数是这样。(例如,对 int
数组执行 p[i] += 1;
时,行程计数最多为 3,使用 -O2 -fno-tree-vectorize
)。 https://godbolt.org/z/P5rvjYj1b
完全展开更大的循环或行程计数更高的循环(此时静态代码大小很可能因此增加)似乎在 -O2
下默认未启用。(GCC 在其文档中把这称为对循环进行“剥离”(peeling),即把所有迭代都从循环中剥离出来,使循环本身消失。-fpeel-loops
在 -O3
下启用,但在 -O2
下不启用。从 GCC11 开始,-fverbose-asm
不再以 asm 注释的形式打印已启用的优化选项列表。)
顺便说一句,在 GCC trunk 中,自动矢量化似乎在 -O2
下默认开启。以前它只在 -O3
下开启,所以这很有趣。
我尝试编译此代码并使用特定于循环的编译指示告诉编译器展开计数循环的次数。
#include <vector>
// Minimal test case: fills a 16 x 512 grid (stored flat in v) with i*j so the
// effect of `#pragma GCC unroll N` on the outer loop can be seen in the asm.
int main() {
// 8192 == 16 * 512, so every index i*512+j written below is in range.
std::vector<int> v(8192);
// Unroll request for the outer loop that follows; the post compares 8 with 16.
#pragma GCC unroll 8 // 16
for (int i = 0; i < 16; i++) {
for (int j = 0; j < 512; j++) {
v[i*512+j] = i*j;
}
}
return 0;
}
当我将 #pragma GCC unroll 8
放在外部 for
循环之前时,编译器不会展开。
# Excerpt of GCC output (GAS, Intel syntax) for the `#pragma GCC unroll 8` build.
# The function entry is not part of this excerpt: the initial values of ecx,
# rsi, rdx, rbp, xmm5 and xmm6 are set before .L3 and are not visible here.
#
# Outer loop (.L3): a single copy of the body that runs until ecx == 16 with
# ecx incremented by 1 per pass, i.e. the outer loop is NOT unrolled.
.L3:
# Broadcast the outer counter (ecx) into all four dword lanes of xmm3.
movd xmm7, ecx
# rax = store cursor for this pass.
mov rax, rsi
# Restart the per-lane vector from xmm6 (presumably {0,1,2,3} -- set outside
# this excerpt, confirm).
movdqa xmm2, xmm6
pshufd xmm3, xmm7, 0
# xmm4 = broadcast shifted right by 32 bits per qword, so the odd dword lanes
# sit in the even positions that pmuludq reads.
movdqa xmm4, xmm3
psrlq xmm4, 32
# Inner loop (.L4): one 16-byte store (four dwords) per iteration.
.L4:
movdqa xmm0, xmm2
movdqa xmm1, xmm3
# Advance the per-lane vector by xmm5 (presumably {4,4,4,4} -- confirm).
paddd xmm2, xmm5
add rax, 16
# 32-bit lane-wise multiply built from pmuludq, which only multiplies dword
# lanes 0 and 2: xmm1 gets the products of the even lanes ...
pmuludq xmm1, xmm0
# ... and xmm0 gets the products of the odd lanes (moved down by 32 bits first).
psrlq xmm0, 32
pmuludq xmm0, xmm4
# Keep the low dword of each 64-bit product (dwords 0 and 2) in the low half,
# then interleave even/odd results back into lane order 0,1,2,3.
pshufd xmm1, xmm1, 8
pshufd xmm0, xmm0, 8
punpckldq xmm1, xmm0
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L4
# Next outer pass: bump the counter, move the start and end pointers 2048 bytes on.
add ecx, 1
add rsi, 2048
lea rdx, [rax+2048]
cmp ecx, 16
jne .L3
# Epilogue: _ZdlPvm is the mangled name of operator delete(void*, unsigned long),
# called here with rdi = rbp and esi = 16384; then return 0.
# NOTE(review): 16384 bytes is half of what std::vector<int>(8192) would need
# (8192 * 4 = 32768) -- confirm which source version produced this listing.
mov rdi, rbp
mov esi, 16384
call _ZdlPvm
xor eax, eax
pop rbp
ret
但是当我将 #pragma GCC unroll 16
放在外部 for
循环之前时,编译器成功展开了外部循环。
# Excerpt of GCC output (GAS, Intel syntax) for the `#pragma GCC unroll 16` build.
# The function entry is not part of this excerpt; rbp and rbx are set before .L2.
# Here the outer loop is gone: there is one zero-fill followed by fifteen
# separate inner loops (.L3 .. .L17). Each loop stores 16 bytes per iteration
# and multiplies the per-lane vector by a fixed constant using shifts/adds/subs.
#
# xmm2 = .LC0 and xmm0 = .LC1 are constants defined outside this excerpt
# (presumably the start vector {0,1,2,3} and the step {4,4,4,4} -- confirm).
#
# .L2: zero-fill of the 2048 bytes starting at rbp (matches the i == 0 row of
# the C++ loop, where i*j is always 0). The first and last qwords are stored
# explicitly; rep stosq (rax = 0) fills from the 8-byte-aligned address rdi
# with count (rbp - rdi + 2048) >> 3.
.L2:
lea rdi, [rbp+8]
mov rcx, rbp
movdqa xmm2, XMMWORD PTR .LC0[rip]
xor eax, eax
and rdi, -8
movdqa xmm0, XMMWORD PTR .LC1[rip]
mov QWORD PTR [rbp+0], 0
lea rdx, [rbp+4096]
sub rcx, rdi
movdqa xmm1, xmm2
mov QWORD PTR [rbp+2040], 0
add ecx, 2048
shr ecx, 3
rep stosq
lea rax, [rbp+2048]
# x1: store the vector unchanged; range [rbp+2048, rbp+4096).
.L3:
movdqa xmm3, xmm1
add rax, 16
paddd xmm1, xmm0
movups XMMWORD PTR [rax-16], xmm3
cmp rax, rdx
jne .L3
lea rdx, [rbp+6144]
movdqa xmm3, xmm2
# x2: x << 1; up to rbp+6144.
.L4:
movdqa xmm1, xmm3
add rax, 16
paddd xmm3, xmm0
pslld xmm1, 1
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L4
lea rdx, [rbp+8192]
movdqa xmm3, xmm2
# x3: (x << 1) + x; up to rbp+8192.
.L5:
movdqa xmm4, xmm3
add rax, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 1
paddd xmm1, xmm4
movups XMMWORD PTR [rax-16], xmm1
cmp rdx, rax
jne .L5
mov rax, rdx
movdqa xmm3, xmm2
lea rdx, [rbp+10240]
# x4: x << 2; up to rbp+10240.
.L6:
movdqa xmm1, xmm3
add rax, 16
paddd xmm3, xmm0
pslld xmm1, 2
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L6
mov rdx, rax
movdqa xmm3, xmm2
lea rax, [rbp+12288]
# x5: (x << 2) + x; the store cursor is rdx in this loop, end is rbp+12288.
.L7:
movdqa xmm4, xmm3
add rdx, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 2
paddd xmm1, xmm4
movups XMMWORD PTR [rdx-16], xmm1
cmp rax, rdx
jne .L7
lea rdx, [rbp+14336]
movdqa xmm3, xmm2
# x6: ((x << 1) + x) << 1; up to rbp+14336.
.L8:
movdqa xmm4, xmm3
add rax, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 1
paddd xmm1, xmm4
pslld xmm1, 1
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L8
movdqa xmm3, xmm2
# x7: (x << 3) - x; the end pointer is rbx, which is set outside this excerpt
# (presumably rbp+16384 -- confirm).
.L9:
movdqa xmm4, xmm3
add rax, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 3
psubd xmm1, xmm4
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rbx
jne .L9
lea rdx, [rbp+18432]
movdqa xmm3, xmm2
# x8: x << 3; up to rbp+18432.
.L10:
movdqa xmm1, xmm3
add rax, 16
paddd xmm3, xmm0
pslld xmm1, 3
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L10
lea rdx, [rbp+20480]
movdqa xmm3, xmm2
# x9: (x << 3) + x; up to rbp+20480.
.L11:
movdqa xmm4, xmm3
add rax, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 3
paddd xmm1, xmm4
movups XMMWORD PTR [rax-16], xmm1
cmp rdx, rax
jne .L11
lea rax, [rbp+22528]
movdqa xmm3, xmm2
# x10: ((x << 2) + x) << 1; the store cursor is rdx in this loop, end is rbp+22528.
.L12:
movdqa xmm4, xmm3
add rdx, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 2
paddd xmm1, xmm4
pslld xmm1, 1
movups XMMWORD PTR [rdx-16], xmm1
cmp rax, rdx
jne .L12
lea rdx, [rbp+24576]
movdqa xmm4, xmm2
# x11: (((x << 1) + x) << 2) - x; up to rbp+24576.
.L13:
movdqa xmm3, xmm4
add rax, 16
paddd xmm4, xmm0
movdqa xmm1, xmm3
pslld xmm1, 1
paddd xmm1, xmm3
pslld xmm1, 2
psubd xmm1, xmm3
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L13
lea rdx, [rbp+26624]
movdqa xmm3, xmm2
# x12: ((x << 1) + x) << 2; up to rbp+26624.
.L14:
movdqa xmm4, xmm3
add rax, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 1
paddd xmm1, xmm4
pslld xmm1, 2
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L14
lea rdx, [rbp+28672]
movdqa xmm4, xmm2
# x13: (((x << 1) + x) << 2) + x; up to rbp+28672.
.L15:
movdqa xmm3, xmm4
add rax, 16
paddd xmm4, xmm0
movdqa xmm1, xmm3
pslld xmm1, 1
paddd xmm1, xmm3
pslld xmm1, 2
paddd xmm1, xmm3
movups XMMWORD PTR [rax-16], xmm1
cmp rax, rdx
jne .L15
lea rdx, [rbp+30720]
movdqa xmm3, xmm2
# x14: ((x << 3) - x) << 1; up to rbp+30720.
.L16:
movdqa xmm4, xmm3
add rax, 16
paddd xmm3, xmm0
movdqa xmm1, xmm4
pslld xmm1, 3
psubd xmm1, xmm4
pslld xmm1, 1
movups XMMWORD PTR [rax-16], xmm1
cmp rdx, rax
jne .L16
mov rax, rdx
lea rdx, [rbp+32768]
# x15: (x << 4) - x; up to rbp+32768. This last loop steps xmm2 itself.
.L17:
movdqa xmm3, xmm2
add rax, 16
paddd xmm2, xmm0
movdqa xmm1, xmm3
pslld xmm1, 4
psubd xmm1, xmm3
movups XMMWORD PTR [rax-16], xmm1
cmp rdx, rax
jne .L17
# Epilogue: _ZdlPvm is the mangled name of operator delete(void*, unsigned long),
# called here with rdi = rbp and esi = 16384; then return 0.
# NOTE(review): the loops above store up to rbp+32768 but the size passed to
# operator delete is 16384 -- the listing may come from a source version with a
# smaller vector than std::vector<int>(8192); confirm.
mov rdi, rbp
mov esi, 16384
call _ZdlPvm
add rsp, 8
xor eax, eax
pop rbx
pop rbp
ret
那么编译器是否只完全展开外循环?
GCC 版本:g++(Compiler-Explorer-Build-gcc-b8ef019ab938471f7f877a1eee3a6374fd8a6ae9-binutils-2.36.1)12.0.0 20211029(实验)
选项:-O2
https://godbolt.org/z/PT6T1691W 似乎 -O2 -funroll-loops
可以解决问题,显然需要启用该选项才能让 pragma 告诉 GCC 展开多少。 (更新: 或者至少使它具有 一些 效果。查看评论,这似乎还不是一个完整的答案。)
(-funroll-loops
默认情况下不打开,除非你使用 -fprofile-use
,即先用 -fprofile-generate
构建,并用具有代表性的输入运行程序之后。很久以前它曾在 -O3 下默认打开,但代码膨胀带来的 I-cache 压力往往会让不热的循环变得更糟。这导致了一种本末倒置的情况:GCC 花费大部分时间的循环只是几条 SIMD 指令,而完全展开的标量序言/尾声的指令数量却是它的 10 倍,尤其是使用更宽的向量时。即使使用 AVX-512,GCC 通常也只是用标量代码处理剩余的零散元素,而不是创建掩码。:/)
完全展开循环是 GCC 即使在 -O2
也会做的事情,至少对于非常小的行程计数是这样。(例如,对 int
数组执行 p[i] += 1;
时,行程计数最多为 3,使用 -O2 -fno-tree-vectorize
)。 https://godbolt.org/z/P5rvjYj1b
完全展开更大的循环或行程计数更高的循环(此时静态代码大小很可能因此增加)似乎在 -O2
下默认未启用。(GCC 在其文档中把这称为对循环进行“剥离”(peeling),即把所有迭代都从循环中剥离出来,使循环本身消失。-fpeel-loops
在 -O3
下启用,但在 -O2
下不启用。从 GCC11 开始,-fverbose-asm
不再以 asm 注释的形式打印已启用的优化选项列表。)
顺便说一句,在 GCC trunk 中,自动矢量化似乎在 -O2
下默认开启。以前它只在 -O3
下开启,所以这很有趣。