为什么 -O1 比 -O2 快
Why -O1 is faster than -O2
我写了一个这样的C代码:
#include <stdio.h>
#define N 19
/*
 * Counts every length-N sequence over the digits {0, 1, 2} by stepping a
 * base-3 odometer stored in a[], then prints the count and returns 0.
 */
int main(void){
int a[N];
int ans = 0;
/* Start from the all-zero sequence. */
for(int i = 0; i < N; ++i){
a[i] = 0;
}
for(;;){
int i;
/* Count the sequence currently held in a[]. */
++ans;
/* Carry: scan from the last digit toward a[0] while the digit is 2. */
for(i = N - 1; a[i] == 2; --i){
if(i == 0){
/* The carry reached a[0]: the all-2 sequence was the last one. */
printf("%d\n", ans);
return 0;
}else{
/* Reset this digit and let the carry move one position left. */
a[i] = 0;
}
}
/* a[i] is the rightmost digit that was not 2; advance it. */
++a[i];
}
}
这里计算从0到2中选择N(=19)个数的方式,并打印出方式数(=3^19=1,162,261,467)。
我用 gcc 编译了这段代码。 -O1 比 -O2 快。为什么 -O2 优化比 -O1 差?
- CPU: Intel(R) Core(TM) i7-8565U, x86_64
- OS: Arch Linux (5.9.1-arch1-1)
- 编译器:gcc (GCC) 10.2.0
编辑:
运行 带有 -S 选项的 gcc 生成了以下汇编代码:
-O1
.file "a.c"
.text
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "%d\n"
.text
.globl main
.type main, @function
main:
.LFB11:
.cfi_startproc
subq $104, %rsp
.cfi_def_cfa_offset 112
movq %fs:40, %rax
movq %rax, 88(%rsp)
xorl %eax, %eax
movq %rsp, %rax
leaq 76(%rsp), %rdx
.L2:
movl $0, (%rax)
addq $4, %rax
cmpq %rdx, %rax
jne .L2
movl $0, %esi
jmp .L7
.L4:
movslq %edx, %rdx
addl $1, %ecx
movl %ecx, (%rsp,%rdx,4)
.L7:
addl $1, %esi
movl 72(%rsp), %ecx
leaq 68(%rsp), %rax
movl $18, %edx
cmpl $2, %ecx
jne .L4
.L5:
movl $0, 4(%rax)
subl $1, %edx
movl (%rax), %ecx
cmpl $2, %ecx
jne .L4
subq $4, %rax
testl %edx, %edx
jne .L5
leaq .LC0(%rip), %rdi
movl $0, %eax
call printf@PLT
movq 88(%rsp), %rax
subq %fs:40, %rax
jne .L14
movl $0, %eax
addq $104, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 8
ret
.L14:
.cfi_restore_state
call __stack_chk_fail@PLT
.cfi_endproc
.LFE11:
.size main, .-main
.ident "GCC: (GNU) 10.2.0"
.section .note.GNU-stack,"",@progbits
-O2
.file "a.c"
.text
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "%d\n"
.section .text.startup,"ax",@progbits
.p2align 4
.globl main
.type main, @function
main:
.LFB11:
.cfi_startproc
subq $104, %rsp
.cfi_def_cfa_offset 112
movl $9, %ecx
xorl %esi, %esi
movq %fs:40, %rax
movq %rax, 88(%rsp)
xorl %eax, %eax
movq %rsp, %rdx
movq %rdx, %rdi
rep stosq
movl $0, (%rdi)
leaq 68(%rsp), %rdi
.L6:
movl 72(%rsp), %ecx
addl $1, %esi
movq %rdi, %rax
movl $18, %edx
cmpl $2, %ecx
je .L4
jmp .L3
.p2align 4,,10
.p2align 3
.L5:
subq $4, %rax
testl %edx, %edx
je .L14
.L4:
movl (%rax), %ecx
movl $0, 4(%rax)
subl $1, %edx
cmpl $2, %ecx
je .L5
.L3:
movslq %edx, %rdx
addl $1, %ecx
movl %ecx, (%rsp,%rdx,4)
jmp .L6
.p2align 4,,10
.p2align 3
.L14:
xorl %eax, %eax
leaq .LC0(%rip), %rdi
call printf@PLT
movq 88(%rsp), %rax
subq %fs:40, %rax
jne .L15
xorl %eax, %eax
addq $104, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 8
ret
.L15:
.cfi_restore_state
call __stack_chk_fail@PLT
.cfi_endproc
.LFE11:
.size main, .-main
.ident "GCC: (GNU) 10.2.0"
.section .note.GNU-stack,"",@progbits
基准是:
$ gcc a.c -O1
$ time ./a.out
1162261467
real 0m0.895s
user 0m0.894s
sys 0m0.000s
$ time ./a.out
1162261467
real 0m0.912s
user 0m0.911s
sys 0m0.000s
$ time ./a.out
1162261467
real 0m0.925s
user 0m0.924s
sys 0m0.001s
$ gcc a.c -O2
$ time ./a.out
1162261467
real 0m1.570s
user 0m1.568s
sys 0m0.000s
$ time ./a.out
1162261467
real 0m1.567s
user 0m1.562s
sys 0m0.004s
$ time ./a.out
1162261467
real 0m1.576s
user 0m1.568s
sys 0m0.001s
$ gcc a.c -O3
$ time ./a.out
1162261467
real 0m1.613s
user 0m1.612s
sys 0m0.000s
$ time ./a.out
1162261467
real 0m1.608s
user 0m1.599s
sys 0m0.003s
$ time ./a.out
1162261467
real 0m1.628s
user 0m1.628s
sys 0m0.000s
$ gcc a.c -Ofast
$ time ./a.out
1162261467
real 0m1.571s
user 0m1.570s
sys 0m0.001s
$ time ./a.out
1162261467
real 0m1.604s
user 0m1.595s
sys 0m0.004s
$ time ./a.out
1162261467
real 0m1.616s
user 0m1.613s
sys 0m0.000s
$ gcc a.c -O0
$ time ./a.out
1162261467
real 0m2.457s
user 0m2.456s
sys 0m0.001s
$ time ./a.out
1162261467
real 0m2.526s
user 0m2.525s
sys 0m0.000s
$ time ./a.out
1162261467
real 0m2.565s
user 0m2.565s
sys 0m0.000s
编辑:
我这样编辑代码:
#include <stdio.h>
#define N 19
volatile int answer;
/*
 * Same base-3 odometer count as the printf version, but the final count is
 * stored into the volatile global `answer` instead of being printed.
 * Returns 0.
 */
int main(void){
int a[N];
int ans = 0;
/* Start from the all-zero sequence. */
for(int i = 0; i < N; ++i){
a[i] = 0;
}
for(;;){
int i;
/* Count the sequence currently held in a[]. */
++ans;
/* Carry: scan from the last digit toward a[0] while the digit is 2. */
for(i = N - 1; a[i] == 2; --i){
if(i == 0){
/* The carry reached a[0]: publish the total through the volatile global. */
answer = ans;
return 0;
}else{
/* Reset this digit and let the carry move one position left. */
a[i] = 0;
}
}
/* a[i] is the rightmost digit that was not 2; advance it. */
++a[i];
}
}
并再次测量:
$ gcc a.c -O1
$ time ./a.out
real 0m0.924s
user 0m0.924s
sys 0m0.000s
$ time ./a.out
real 0m0.950s
user 0m0.949s
sys 0m0.000s
$ time ./a.out
real 0m0.993s
user 0m0.989s
sys 0m0.004s
$ gcc a.c -O2
$ time ./a.out
real 0m1.637s
user 0m1.636s
sys 0m0.000s
$ time ./a.out
real 0m1.661s
user 0m1.656s
sys 0m0.004s
$ time ./a.out
real 0m1.656s
user 0m1.654s
sys 0m0.001s
编辑:
我在 for(;;) 之后添加了 [[likely]] 属性(这是 C++20 的属性,因此下面改用 g++ 编译):
#include <stdio.h>
#define N 19
/*
 * Counts every length-N sequence over the digits {0, 1, 2} by stepping a
 * base-3 odometer stored in a[], then prints the count and returns 0.
 * This variant tags the outer loop body with the C++20 [[likely]] attribute.
 */
int main(void){
int a[N];
int ans = 0;
/* Start from the all-zero sequence. */
for(int i = 0; i < N; ++i){
a[i] = 0;
}
/* [[likely]] is a hint to the compiler that this statement is on the likely path. */
for(;;) [[likely]] {
int i;
/* Count the sequence currently held in a[]. */
++ans;
/* Carry: scan from the last digit toward a[0] while the digit is 2. */
for(i = N - 1; a[i] == 2; --i){
if(i == 0){
/* The carry reached a[0]: the all-2 sequence was the last one. */
printf("%d\n", ans);
return 0;
}else{
/* Reset this digit and let the carry move one position left. */
a[i] = 0;
}
}
/* a[i] is the rightmost digit that was not 2; advance it. */
++a[i];
}
}
然后,基准测试的结果发生了变化:
$ g++ a.cpp -O1
$ for i in {1..5}; do time ./a.out; done
1162261467
./a.out 0.65s user 0.00s system 99% cpu 0.653 total
1162261467
./a.out 0.65s user 0.00s system 99% cpu 0.657 total
1162261467
./a.out 0.66s user 0.00s system 99% cpu 0.656 total
1162261467
./a.out 0.66s user 0.00s system 99% cpu 0.665 total
1162261467
./a.out 0.66s user 0.00s system 99% cpu 0.660 total
$ g++ a.cpp -O2
$ for i in {1..5}; do time ./a.out; done
1162261467
./a.out 0.66s user 0.00s system 99% cpu 0.661 total
1162261467
./a.out 0.65s user 0.00s system 99% cpu 0.648 total
1162261467
./a.out 0.66s user 0.00s system 99% cpu 0.659 total
1162261467
./a.out 0.65s user 0.00s system 99% cpu 0.654 total
1162261467
./a.out 0.66s user 0.00s system 99% cpu 0.657 total
-O2 和-O1 一样快!谢谢@Acorn。
-O2 在 -O1 的基础上还开启了许多选项,例如 -falign-functions、-falign-jumps、-falign-labels、-falign-loops。其中每一个选项似乎都会对 -O1 的性能产生负面影响。我的环境是 i7-8550U 和 GCC 9.3.0-17ubuntu1~20.04。
我认为是分支预测失败让处理器的执行变慢了。
Why -O2 optimization was worse than -O1?
在大多数情况下,更高的优化级别应该会给您带来更好的性能。不过,您可能会发现像这样的例外情况。特别是在像这样的微基准测试中。
您的程序使用的代码和数据内存非常小,缓存和内存访问不太可能成为问题。然而,它是分支繁重的,这意味着它将归结为静态和动态分支预测。
如果编译器判断失误(例如本例),您可以尝试通过 likely/unlikely 提示,或者对程序进行性能剖析(profiling),为编译器提供更多信息。
我写了一个这样的C代码:
#include <stdio.h>
#define N 19
/*
 * Counts every length-N sequence over the digits {0, 1, 2} by stepping a
 * base-3 odometer stored in a[], then prints the count and returns 0.
 */
int main(void){
int a[N];
int ans = 0;
/* Start from the all-zero sequence. */
for(int i = 0; i < N; ++i){
a[i] = 0;
}
for(;;){
int i;
/* Count the sequence currently held in a[]. */
++ans;
/* Carry: scan from the last digit toward a[0] while the digit is 2. */
for(i = N - 1; a[i] == 2; --i){
if(i == 0){
/* The carry reached a[0]: the all-2 sequence was the last one. */
printf("%d\n", ans);
return 0;
}else{
/* Reset this digit and let the carry move one position left. */
a[i] = 0;
}
}
/* a[i] is the rightmost digit that was not 2; advance it. */
++a[i];
}
}
这里计算从0到2中选择N(=19)个数的方式,并打印出方式数(=3^19=1,162,261,467)。
我用 gcc 编译了这段代码。 -O1 比 -O2 快。为什么 -O2 优化比 -O1 差?
- CPU: Intel(R) Core(TM) i7-8565U, x86_64
- OS: Arch Linux (5.9.1-arch1-1)
- 编译器:gcc (GCC) 10.2.0
编辑:
运行 带有 -S 选项的 gcc 生成了以下汇编代码: -O1
.file "a.c"
.text
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "%d\n"
.text
.globl main
.type main, @function
main:
.LFB11:
.cfi_startproc
subq $104, %rsp
.cfi_def_cfa_offset 112
movq %fs:40, %rax
movq %rax, 88(%rsp)
xorl %eax, %eax
movq %rsp, %rax
leaq 76(%rsp), %rdx
.L2:
movl $0, (%rax)
addq $4, %rax
cmpq %rdx, %rax
jne .L2
movl $0, %esi
jmp .L7
.L4:
movslq %edx, %rdx
addl $1, %ecx
movl %ecx, (%rsp,%rdx,4)
.L7:
addl $1, %esi
movl 72(%rsp), %ecx
leaq 68(%rsp), %rax
movl $18, %edx
cmpl $2, %ecx
jne .L4
.L5:
movl $0, 4(%rax)
subl $1, %edx
movl (%rax), %ecx
cmpl $2, %ecx
jne .L4
subq $4, %rax
testl %edx, %edx
jne .L5
leaq .LC0(%rip), %rdi
movl $0, %eax
call printf@PLT
movq 88(%rsp), %rax
subq %fs:40, %rax
jne .L14
movl $0, %eax
addq $104, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 8
ret
.L14:
.cfi_restore_state
call __stack_chk_fail@PLT
.cfi_endproc
.LFE11:
.size main, .-main
.ident "GCC: (GNU) 10.2.0"
.section .note.GNU-stack,"",@progbits
-O2
.file "a.c"
.text
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "%d\n"
.section .text.startup,"ax",@progbits
.p2align 4
.globl main
.type main, @function
main:
.LFB11:
.cfi_startproc
subq $104, %rsp
.cfi_def_cfa_offset 112
movl $9, %ecx
xorl %esi, %esi
movq %fs:40, %rax
movq %rax, 88(%rsp)
xorl %eax, %eax
movq %rsp, %rdx
movq %rdx, %rdi
rep stosq
movl $0, (%rdi)
leaq 68(%rsp), %rdi
.L6:
movl 72(%rsp), %ecx
addl $1, %esi
movq %rdi, %rax
movl $18, %edx
cmpl $2, %ecx
je .L4
jmp .L3
.p2align 4,,10
.p2align 3
.L5:
subq $4, %rax
testl %edx, %edx
je .L14
.L4:
movl (%rax), %ecx
movl $0, 4(%rax)
subl $1, %edx
cmpl $2, %ecx
je .L5
.L3:
movslq %edx, %rdx
addl $1, %ecx
movl %ecx, (%rsp,%rdx,4)
jmp .L6
.p2align 4,,10
.p2align 3
.L14:
xorl %eax, %eax
leaq .LC0(%rip), %rdi
call printf@PLT
movq 88(%rsp), %rax
subq %fs:40, %rax
jne .L15
xorl %eax, %eax
addq $104, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 8
ret
.L15:
.cfi_restore_state
call __stack_chk_fail@PLT
.cfi_endproc
.LFE11:
.size main, .-main
.ident "GCC: (GNU) 10.2.0"
.section .note.GNU-stack,"",@progbits
基准是:
$ gcc a.c -O1
$ time ./a.out
1162261467
real 0m0.895s
user 0m0.894s
sys 0m0.000s
$ time ./a.out
1162261467
real 0m0.912s
user 0m0.911s
sys 0m0.000s
$ time ./a.out
1162261467
real 0m0.925s
user 0m0.924s
sys 0m0.001s
$ gcc a.c -O2
$ time ./a.out
1162261467
real 0m1.570s
user 0m1.568s
sys 0m0.000s
$ time ./a.out
1162261467
real 0m1.567s
user 0m1.562s
sys 0m0.004s
$ time ./a.out
1162261467
real 0m1.576s
user 0m1.568s
sys 0m0.001s
$ gcc a.c -O3
$ time ./a.out
1162261467
real 0m1.613s
user 0m1.612s
sys 0m0.000s
$ time ./a.out
1162261467
real 0m1.608s
user 0m1.599s
sys 0m0.003s
$ time ./a.out
1162261467
real 0m1.628s
user 0m1.628s
sys 0m0.000s
$ gcc a.c -Ofast
$ time ./a.out
1162261467
real 0m1.571s
user 0m1.570s
sys 0m0.001s
$ time ./a.out
1162261467
real 0m1.604s
user 0m1.595s
sys 0m0.004s
$ time ./a.out
1162261467
real 0m1.616s
user 0m1.613s
sys 0m0.000s
$ gcc a.c -O0
$ time ./a.out
1162261467
real 0m2.457s
user 0m2.456s
sys 0m0.001s
$ time ./a.out
1162261467
real 0m2.526s
user 0m2.525s
sys 0m0.000s
$ time ./a.out
1162261467
real 0m2.565s
user 0m2.565s
sys 0m0.000s
编辑:
我这样编辑代码:
#include <stdio.h>
#define N 19
volatile int answer;
/*
 * Same base-3 odometer count as the printf version, but the final count is
 * stored into the volatile global `answer` instead of being printed.
 * Returns 0.
 */
int main(void){
int a[N];
int ans = 0;
/* Start from the all-zero sequence. */
for(int i = 0; i < N; ++i){
a[i] = 0;
}
for(;;){
int i;
/* Count the sequence currently held in a[]. */
++ans;
/* Carry: scan from the last digit toward a[0] while the digit is 2. */
for(i = N - 1; a[i] == 2; --i){
if(i == 0){
/* The carry reached a[0]: publish the total through the volatile global. */
answer = ans;
return 0;
}else{
/* Reset this digit and let the carry move one position left. */
a[i] = 0;
}
}
/* a[i] is the rightmost digit that was not 2; advance it. */
++a[i];
}
}
并再次测量:
$ gcc a.c -O1
$ time ./a.out
real 0m0.924s
user 0m0.924s
sys 0m0.000s
$ time ./a.out
real 0m0.950s
user 0m0.949s
sys 0m0.000s
$ time ./a.out
real 0m0.993s
user 0m0.989s
sys 0m0.004s
$ gcc a.c -O2
$ time ./a.out
real 0m1.637s
user 0m1.636s
sys 0m0.000s
$ time ./a.out
real 0m1.661s
user 0m1.656s
sys 0m0.004s
$ time ./a.out
real 0m1.656s
user 0m1.654s
sys 0m0.001s
编辑:
我在 for(;;) 之后添加了 [[likely]] 属性(这是 C++20 的属性,因此下面改用 g++ 编译):
#include <stdio.h>
#define N 19
/*
 * Counts every length-N sequence over the digits {0, 1, 2} by stepping a
 * base-3 odometer stored in a[], then prints the count and returns 0.
 * This variant tags the outer loop body with the C++20 [[likely]] attribute.
 */
int main(void){
int a[N];
int ans = 0;
/* Start from the all-zero sequence. */
for(int i = 0; i < N; ++i){
a[i] = 0;
}
/* [[likely]] is a hint to the compiler that this statement is on the likely path. */
for(;;) [[likely]] {
int i;
/* Count the sequence currently held in a[]. */
++ans;
/* Carry: scan from the last digit toward a[0] while the digit is 2. */
for(i = N - 1; a[i] == 2; --i){
if(i == 0){
/* The carry reached a[0]: the all-2 sequence was the last one. */
printf("%d\n", ans);
return 0;
}else{
/* Reset this digit and let the carry move one position left. */
a[i] = 0;
}
}
/* a[i] is the rightmost digit that was not 2; advance it. */
++a[i];
}
}
然后,基准测试的结果发生了变化:
$ g++ a.cpp -O1
$ for i in {1..5}; do time ./a.out; done
1162261467
./a.out 0.65s user 0.00s system 99% cpu 0.653 total
1162261467
./a.out 0.65s user 0.00s system 99% cpu 0.657 total
1162261467
./a.out 0.66s user 0.00s system 99% cpu 0.656 total
1162261467
./a.out 0.66s user 0.00s system 99% cpu 0.665 total
1162261467
./a.out 0.66s user 0.00s system 99% cpu 0.660 total
$ g++ a.cpp -O2
$ for i in {1..5}; do time ./a.out; done
1162261467
./a.out 0.66s user 0.00s system 99% cpu 0.661 total
1162261467
./a.out 0.65s user 0.00s system 99% cpu 0.648 total
1162261467
./a.out 0.66s user 0.00s system 99% cpu 0.659 total
1162261467
./a.out 0.65s user 0.00s system 99% cpu 0.654 total
1162261467
./a.out 0.66s user 0.00s system 99% cpu 0.657 total
-O2 和-O1 一样快!谢谢@Acorn。
-O2 在 -O1 的基础上还开启了许多选项,例如 -falign-functions、-falign-jumps、-falign-labels、-falign-loops。其中每一个选项似乎都会对 -O1 的性能产生负面影响。我的环境是 i7-8550U 和 GCC 9.3.0-17ubuntu1~20.04。
我认为是分支预测失败让处理器的执行变慢了。
Why -O2 optimization was worse than -O1?
在大多数情况下,更高的优化级别应该会给您带来更好的性能。不过,您可能会发现像这样的例外情况。特别是在像这样的微基准测试中。
您的程序使用的代码和数据内存非常小,缓存和内存访问不太可能成为问题。然而,它是分支繁重的,这意味着它将归结为静态和动态分支预测。
如果编译器判断失误(例如本例),您可以尝试通过 likely/unlikely 提示,或者对程序进行性能剖析(profiling),为编译器提供更多信息。