"if" 和 "if else" 在 Java 中的不同表现
Different performance of "if" and "if else" in Java
我注意到 if else
/ 三元 (condition ? a : b
) 赋值比只有 if
(没有 else)的语句中的条件赋值更快。我在不同的 JDK 上执行了 JMH 基准测试,但这里只关注 JDK 12。
(ops / sec, 越高越好)
源代码:
/**
 * JMH benchmark that compares four source-level ways of finding the maximum
 * of an int array: a bare {@code if}, an {@code if/else} with a no-op else
 * branch, the ternary operator, and {@code Math.max}.
 *
 * <p>The exact shape of each loop body is the thing under test, so the
 * methods must not be "cleaned up" or unified.
 */
@State(Scope.Benchmark)
public class FindMaxBenchmark {
// Number of elements in the array that every benchmark method scans.
public static int SIZE = 1_000_000;
/**
 * Variant 1: conditional assignment with a bare {@code if} (no else branch).
 *
 * @param bh   JMH blackhole that receives the computed maximum
 * @param mock per-thread state holding the input array
 */
@Benchmark
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
public static void findMax_if(Blackhole bh, Mock mock) {
int result = Integer.MIN_VALUE;
int[] data = mock.tab;
for (int i = 0; i < data.length; i++) {
if (data[i] > result) {
result = data[i];
}
}
// Hand the result to the blackhole so the loop's value is actually used.
bh.consume(result);
}
/**
 * Variant 2: same as {@link #findMax_if} but with an explicit else branch
 * that performs a self-assignment.
 *
 * @param bh   JMH blackhole that receives the computed maximum
 * @param mock per-thread state holding the input array
 */
@Benchmark
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
public static void findMax_if_else(Blackhole bh, Mock mock) {
int result = Integer.MIN_VALUE;
int[] data = mock.tab;
for (int i = 0; i < data.length; i++) {
if (data[i] > result) {
result = data[i];
} else {
// Deliberate no-op: this redundant branch is what the benchmark measures.
result = result;
}
}
bh.consume(result);
}
/**
 * Variant 3: the same selection written with the ternary operator.
 *
 * @param bh   JMH blackhole that receives the computed maximum
 * @param mock per-thread state holding the input array
 */
@Benchmark
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
public static void findMax_ternary(Blackhole bh, Mock mock) {
int result = Integer.MIN_VALUE;
int[] data = mock.tab;
for (int i = 0; i < data.length; i++) {
result = data[i] > result ? data[i] : result;
}
bh.consume(result);
}
/**
 * Variant 4: delegates the comparison to {@code Math.max}.
 *
 * @param bh   JMH blackhole that receives the computed maximum
 * @param mock per-thread state holding the input array
 */
@Benchmark
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
public static void findMax_intrinsicMax(Blackhole bh, Mock mock) {
int result = Integer.MIN_VALUE;
int[] data = mock.tab;
for (int i = 0; i < data.length; i++) {
result = Math.max(data[i], result);
}
bh.consume(result);
}
/**
 * Thread-scoped benchmark state that owns the input array.
 */
@State(Scope.Thread)
public static class Mock {
// Input data; the benchmark methods read this field directly as mock.tab.
private int[] tab = new int[SIZE];
/** Returns the current input array (the same instance, not a copy). */
public int[] getTab() {
return tab;
}
/**
 * Replaces the input with SIZE freshly generated random ints.
 * Declared with Level.Iteration, i.e. meant to run once per benchmark iteration.
 */
@Setup(Level.Iteration)
public void setup() {
// Unseeded Random: the data differs from run to run.
Random r = new Random();
this.tab = r.ints(SIZE).toArray();
}
}
}
findMax_if_else
perfasm 输出(三元运算符版本的输出大致相同):
c2, level 4, codes.dbg.FindMaxBenchmark::findMax_if_else, version 493 (165 bytes)
0x00007fc7a8671a6b: cmp r8d,ebp
╭ 0x00007fc7a8671a6e: jae 0x00007fc7a8671b3d
│ 0x00007fc7a8671a74: mov edx,DWORD PTR [r9+0x10] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
│ 0x00007fc7a8671a78: cmp edx,0x80000000
│╭ 0x00007fc7a8671a7e: jg 0x00007fc7a8671a85 ;*if_icmple {reexecute=0 rethrow=0 return_oop=0}
││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@23 (line 34)
││ 0x00007fc7a8671a80: mov edx,0x80000000 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
│↘ 0x00007fc7a8671a85: mov ebx,ebp
0.02% │ 0x00007fc7a8671a87: add ebx,0xfffffffd
│ 0x00007fc7a8671a8a: cmp r8d,ebx
│ 0x00007fc7a8671a8d: cmovl ebx,r11d
│ 0x00007fc7a8671a91: mov r8d,0x1
0.00% │ 0x00007fc7a8671a97: cmp ebx,0x1
│ ╭ 0x00007fc7a8671a9a: jle 0x00007fc7a8671b00
│ │ 0x00007fc7a8671a9c: mov rdi,r9 ;*goto {reexecute=0 rethrow=0 return_oop=0}
│ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@39 (line 33)
│ │╭ 0x00007fc7a8671a9f: jmp 0x00007fc7a8671ab9
0.01% │ ││ ↗ 0x00007fc7a8671aa1: mov edx,ecx
│ ││ │ 0x00007fc7a8671aa3: nop DWORD PTR [rax+0x0]
│ ││ │ 0x00007fc7a8671aaa: nop WORD PTR [rax+rax*1+0x0]
8.06% │ ││ ↗│ 0x00007fc7a8671ab0: add r8d,0x4 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ││ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
11.38% │ ││ ││ 0x00007fc7a8671ab4: cmp r8d,ebx
13.63% │ ││╭ ││ 0x00007fc7a8671ab7: jge 0x00007fc7a8671af1 ;*aload_3 {reexecute=0 rethrow=0 return_oop=0}
│ │││ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@18 (line 34)
3.02% │ │↘│ ││ ↗ 0x00007fc7a8671ab9: mov r11d,DWORD PTR [r9+r8*4+0x10] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
8.53% │ │ │ ││ │ 0x00007fc7a8671abe: cmp r11d,edx
4.54% │ │ │╭ ││ │ 0x00007fc7a8671ac1: jg 0x00007fc7a8671ae2 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ │ ││ ││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
4.96% │ │ ││ ││↗ │ 0x00007fc7a8671ac3: mov r11d,DWORD PTR [r9+r8*4+0x14] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ ││ │││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
3.73% │ │ ││ │││ │ 0x00007fc7a8671ac8: cmp r11d,edx
9.19% │ │ ││╭ │││ │ 0x00007fc7a8671acb: jg 0x00007fc7a8671ae7 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ │ │││ │││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
3.70% │ │ │││ │││↗ │ 0x00007fc7a8671acd: mov r11d,DWORD PTR [r9+r8*4+0x18] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ │││ ││││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
4.96% │ │ │││ ││││ │ 0x00007fc7a8671ad2: cmp r11d,edx
4.45% │ │ │││╭││││ │ 0x00007fc7a8671ad5: jg 0x00007fc7a8671aec ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ │ ││││││││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
8.55% │ │ ││││││││↗│ 0x00007fc7a8671ad7: mov ecx,DWORD PTR [r9+r8*4+0x1c] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ ││││││││││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
6.11% │ │ ││││││││││ 0x00007fc7a8671adc: cmp ecx,edx
2.48% │ │ ││││╰│││││ 0x00007fc7a8671ade: jle 0x00007fc7a8671ab0 ;*if_icmple {reexecute=0 rethrow=0 return_oop=0}
│ │ ││││ │││││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@23 (line 34)
│ │ ││││ ╰││││ 0x00007fc7a8671ae0: jmp 0x00007fc7a8671aa1
│ │ │↘││ ││││ 0x00007fc7a8671ae2: mov edx,r11d
0.00% │ │ │ ││ ╰│││ 0x00007fc7a8671ae5: jmp 0x00007fc7a8671ac3
0.00% │ │ │ ↘│ │││ 0x00007fc7a8671ae7: mov edx,r11d
0.00% │ │ │ │ ╰││ 0x00007fc7a8671aea: jmp 0x00007fc7a8671acd
0.00% │ │ │ ↘ ││ 0x00007fc7a8671aec: mov edx,r11d
0.00% │ │ │ ╰│ 0x00007fc7a8671aef: jmp 0x00007fc7a8671ad7
│ │ ↘ │ 0x00007fc7a8671af1: mov r11,QWORD PTR [r15+0x108] ; ImmutableOopMap{r10=Oop r9=NarrowOop rdi=Oop }
│ │ │ ;*goto {reexecute=1 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@39 (line 33)
│ │ │ 0x00007fc7a8671af8: test DWORD PTR [r11],eax ;*goto {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@39 (line 33)
│ │ │ ; {poll}
│ │ │ 0x00007fc7a8671afb: cmp r8d,ebx
0.00% │ │ ╰ 0x00007fc7a8671afe: jl 0x00007fc7a8671ab9
│ ↘ 0x00007fc7a8671b00: cmp r8d,ebp
0.00% │ ╭ 0x00007fc7a8671b03: jge 0x00007fc7a8671b1a
│ │ 0x00007fc7a8671b05: data16 xchg ax,ax ;*aload_3 {reexecute=0 rethrow=0 return_oop=0}
│ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@18 (line 34)
│ │ ↗ 0x00007fc7a8671b08: mov r11d,DWORD PTR [r9+r8*4+0x10] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
0.01% │ │ │ 0x00007fc7a8671b0d: cmp r11d,edx
│ │╭│ 0x00007fc7a8671b10: jg 0x00007fc7a8671b38
│ │││↗ 0x00007fc7a8671b12: inc r8d ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ││││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
│ ││││ 0x00007fc7a8671b15: cmp r8d,ebp
│ ││╰│ 0x00007fc7a8671b18: jl 0x00007fc7a8671b08 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
│ ││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@15 (line 33)
│ ↘│ │ 0x00007fc7a8671b1a: test r10,r10
0.00% │ │ │ 0x00007fc7a8671b1d: je 0x00007fc7a8671b52
│ │ │ 0x00007fc7a8671b1f: mov rsi,r10
│ │ │ 0x00007fc7a8671b22: nop
│ │ │ 0x00007fc7a8671b23: call 0x00007fc7a8671ba0 ; ImmutableOopMap{}
│ │ │ ;*invokevirtual consume {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@44 (line 41)
│ │ │ ; {optimized virtual_call}
│ │ │ 0x00007fc7a8671b28: add rsp,0x20
0.01% │ │ │ 0x00007fc7a8671b2c: pop rbp
│ │ │ 0x00007fc7a8671b2d: mov r10,QWORD PTR [r15+0x108]
│ │ │ 0x00007fc7a8671b34: test DWORD PTR [r10],eax ; {poll_return}
│ │ │ 0x00007fc7a8671b37: ret
│ ↘ │ 0x00007fc7a8671b38: mov edx,r11d
│ ╰ 0x00007fc7a8671b3b: jmp 0x00007fc7a8671b12
↘ 0x00007fc7a8671b3d: mov esi,0xffffff7e
0x00007fc7a8671b42: mov QWORD PTR [rsp],r10
0x00007fc7a8671b46: mov DWORD PTR [rsp+0x8],r9d
0x00007fc7a8671b4b: call 0x00007fc7a0ba3d00 ; ImmutableOopMap{[0]=Oop [8]=NarrowOop }
;*if_icmpge {reexecute=1 rethrow=0 return_oop=0}
findMax_if
perfasm 输出:
c2, level 4, codes.dbg.FindMaxBenchmark::findMax_if, version 480 (165 bytes)
0x00007f34cc66e7eb: cmp r8d,ebp
╭ 0x00007f34cc66e7ee: jae 0x00007f34cc66e8c4
│ 0x00007f34cc66e7f4: mov edx,DWORD PTR [r9+0x10] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
│ 0x00007f34cc66e7f8: cmp edx,0x80000000
│╭ 0x00007f34cc66e7fe: jg 0x00007f34cc66e805 ;*if_icmple {reexecute=0 rethrow=0 return_oop=0}
││ ; - codes.dbg.FindMaxBenchmark::findMax_if@23 (line 19)
││ 0x00007f34cc66e800: mov edx,0x80000000 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
││ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
│↘ 0x00007f34cc66e805: mov ebx,ebp
0.01% │ 0x00007f34cc66e807: add ebx,0xfffffffd
│ 0x00007f34cc66e80a: cmp r8d,ebx
│ 0x00007f34cc66e80d: cmovl ebx,r11d
│ 0x00007f34cc66e811: mov r8d,0x1
│ 0x00007f34cc66e817: cmp ebx,0x1
│ ╭ 0x00007f34cc66e81a: jle 0x00007f34cc66e880
│ │ 0x00007f34cc66e81c: mov rdi,r9 ;*goto {reexecute=0 rethrow=0 return_oop=0}
│ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@34 (line 18)
│ │╭ 0x00007f34cc66e81f: jmp 0x00007f34cc66e839
│ ││ ↗ 0x00007f34cc66e821: mov edx,ecx
0.00% │ ││ │ 0x00007f34cc66e823: nop DWORD PTR [rax+0x0]
│ ││ │ 0x00007f34cc66e82a: nop WORD PTR [rax+rax*1+0x0]
0.89% │ ││ │↗ 0x00007f34cc66e830: add r8d,0x4 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ││ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
12.36% │ ││ ││ 0x00007f34cc66e834: cmp r8d,ebx
0.11% │ ││╭ ││ 0x00007f34cc66e837: jge 0x00007f34cc66e871 ;*aload_3 {reexecute=0 rethrow=0 return_oop=0}
│ │││ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if@18 (line 19)
9.94% │ │↘│ ││ ↗ 0x00007f34cc66e839: mov r11d,DWORD PTR [r9+r8*4+0x10] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
0.11% │ │ │ ││ │ 0x00007f34cc66e83e: cmp r11d,edx
10.05% │ │ │╭ ││ │ 0x00007f34cc66e841: jg 0x00007f34cc66e862 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ │ ││ ││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
0.13% │ │ ││ ││↗ │ 0x00007f34cc66e843: mov r11d,DWORD PTR [r9+r8*4+0x14] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ ││ │││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
9.84% │ │ ││ │││ │ 0x00007f34cc66e848: cmp r11d,edx
0.11% │ │ ││╭ │││ │ 0x00007f34cc66e84b: jg 0x00007f34cc66e867 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ │ │││ │││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
10.02% │ │ │││ │││↗ │ 0x00007f34cc66e84d: mov r11d,DWORD PTR [r9+r8*4+0x18] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ │││ ││││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
0.33% │ │ │││ ││││ │ 0x00007f34cc66e852: cmp r11d,edx
23.63% │ │ │││╭││││ │ 0x00007f34cc66e855: jg 0x00007f34cc66e86c ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ │ ││││││││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
0.13% │ │ ││││││││↗│ 0x00007f34cc66e857: mov ecx,DWORD PTR [r9+r8*4+0x1c] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ ││││││││││ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
9.89% │ │ ││││││││││ 0x00007f34cc66e85c: cmp ecx,edx
0.11% │ │ ││││╰│││││ 0x00007f34cc66e85e: jg 0x00007f34cc66e821 ;*if_icmple {reexecute=0 rethrow=0 return_oop=0}
│ │ ││││ │││││ ; - codes.dbg.FindMaxBenchmark::findMax_if@23 (line 19)
9.71% │ │ ││││ ╰││││ 0x00007f34cc66e860: jmp 0x00007f34cc66e830
│ │ │↘││ ││││ 0x00007f34cc66e862: mov edx,r11d
0.00% │ │ │ ││ ╰│││ 0x00007f34cc66e865: jmp 0x00007f34cc66e843
│ │ │ ↘│ │││ 0x00007f34cc66e867: mov edx,r11d
0.00% │ │ │ │ ╰││ 0x00007f34cc66e86a: jmp 0x00007f34cc66e84d
│ │ │ ↘ ││ 0x00007f34cc66e86c: mov edx,r11d
0.00% │ │ │ ╰│ 0x00007f34cc66e86f: jmp 0x00007f34cc66e857
│ │ ↘ │ 0x00007f34cc66e871: mov r11,QWORD PTR [r15+0x108] ; ImmutableOopMap{r10=Oop r9=NarrowOop rdi=Oop }
│ │ │ ;*goto {reexecute=1 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@34 (line 18)
0.00% │ │ │ 0x00007f34cc66e878: test DWORD PTR [r11],eax ;*goto {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@34 (line 18)
│ │ │ ; {poll}
│ │ │ 0x00007f34cc66e87b: cmp r8d,ebx
│ │ ╰ 0x00007f34cc66e87e: jl 0x00007f34cc66e839
│ ↘ 0x00007f34cc66e880: cmp r8d,ebp
0.00% │ ╭ 0x00007f34cc66e883: jge 0x00007f34cc66e89a
│ │ 0x00007f34cc66e885: data16 xchg ax,ax ;*aload_3 {reexecute=0 rethrow=0 return_oop=0}
│ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@18 (line 19)
0.00% │ │ ↗ 0x00007f34cc66e888: mov r11d,DWORD PTR [r9+r8*4+0x10] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
0.01% │ │ │ 0x00007f34cc66e88d: cmp r11d,edx
│ │╭│ 0x00007f34cc66e890: jg 0x00007f34cc66e8b8
│ │││↗ 0x00007f34cc66e892: inc r8d ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ││││ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
│ ││││ 0x00007f34cc66e895: cmp r8d,ebp
│ ││╰│ 0x00007f34cc66e898: jl 0x00007f34cc66e888 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
│ ││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@15 (line 18)
│ ↘│ │↗ 0x00007f34cc66e89a: test r10,r10
0.00% │ │ ││ 0x00007f34cc66e89d: je 0x00007f34cc66e8da
│ │ ││ 0x00007f34cc66e89f: mov rsi,r10
│ │ ││ 0x00007f34cc66e8a2: nop
│ │ ││ 0x00007f34cc66e8a3: call 0x00007f34cc66e920 ; ImmutableOopMap{}
│ │ ││ ;*invokevirtual consume {reexecute=0 rethrow=0 return_oop=0}
│ │ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if@39 (line 24)
│ │ ││ ; {optimized virtual_call}
0.00% │ │ ││ 0x00007f34cc66e8a8: add rsp,0x20
0.01% │ │ ││ 0x00007f34cc66e8ac: pop rbp
│ │ ││ 0x00007f34cc66e8ad: mov r10,QWORD PTR [r15+0x108]
│ │ ││ 0x00007f34cc66e8b4: test DWORD PTR [r10],eax ; {poll_return}
│ │ ││ 0x00007f34cc66e8b7: ret
│ ↘ ││ 0x00007f34cc66e8b8: mov edx,r11d
│ ╰│ 0x00007f34cc66e8bb: jmp 0x00007f34cc66e892
│ │ 0x00007f34cc66e8bd: mov edx,0x80000000
│ ╰ 0x00007f34cc66e8c2: jmp 0x00007f34cc66e89a
↘ 0x00007f34cc66e8c4: mov esi,0xffffff7e
0x00007f34cc66e8c9: mov QWORD PTR [rsp],r10
0x00007f34cc66e8cd: mov DWORD PTR [rsp+0x8],r9d
....................................................................................................
观察:
findMax_if
和 findMax_if_else
之间只有一个显着差异:
0x00007f34cc66e85e: jg 0x00007f34cc66e821
对比 0x00007fc7a8671ade: jle 0x00007fc7a8671ab0
findMax_intrinsicMax
利用了内在函数(intrinsic)Math.max
,性能却最差,这对我来说是违反直觉的。
问题:
- 添加一个
else
分支,其中只包含不改变任何东西的语句(如 x = x;
),这样做正常吗?特别是在单线程执行的代码中。
- 吞吐量差异的真正来源在哪里?我看到的是
jg
(大于则跳转)而不是 jle
(小于或等于则跳转)。实际上,前一个条件正好是后一个条件取反。
- 如果简单的
if else
语句具有更高的吞吐量,使用 Math.max
有什么意义?
run_tests.sh
运行基准测试并生成绘图。
首先,为了尽量减少不相关的ASM代码量和简化分析,让我们添加以下JVM选项:
-XX:LoopUnrollLimit=0
- 关闭循环展开;
-XX:-UseCountedLoopSafepoints
- 从循环中消除安全点轮询。
现在 if 与 if_else
之间的性能差异会更大,而生成的汇编代码会更简单。下面是两个基准测试的循环体。
findMax_if
╭ 0x0000029707af78f5: jmp 29707af7908h
│ ↗ 0x0000029707af78f7: mov r8d,ecx
│ │ 0x0000029707af78fa: nop word ptr [rax+rax+0h]
0,66% │ │↗ 0x0000029707af7900: inc r9d ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
1,02% │ ││ 0x0000029707af7903: cmp r9d,r10d
│╭││ 0x0000029707af7906: jnl 29707af7914h ;*aload_3 {reexecute=0 rethrow=0 return_oop=0}
││││ ; - codes.dbg.FindMaxBenchmark::findMax_if@18 (line 19)
2,06% ↘│││ 0x0000029707af7908: mov ecx,dword ptr [r11+r9*4+10h]
│││ ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│││ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
50,86% │││ 0x0000029707af790d: cmp ecx,r8d
0,02% │╰│ 0x0000029707af7910: jnle 29707af78f7h ;*if_icmple {reexecute=0 rethrow=0 return_oop=0}
│ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@23 (line 19)
41,01% │ ╰ 0x0000029707af7912: jmp 29707af7900h ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
│ ; - codes.dbg.FindMaxBenchmark::findMax_if@15 (line 18)
↘ 0x0000029707af7914: test rbx,rbx
findMax_if_else
╭ 0x00000137d24d4b75: jmp 137d24d4b88h
│ ↗ 0x00000137d24d4b77: mov r8d,ecx
│ │ 0x00000137d24d4b7a: nop word ptr [rax+rax+0h]
72,63% │ ↗│ 0x00000137d24d4b80: inc r9d ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
0,05% │ ││ 0x00000137d24d4b83: cmp r9d,r10d
0,01% │╭││ 0x00000137d24d4b86: jnl 137d24d4b94h ;*aload_3 {reexecute=0 rethrow=0 return_oop=0}
││││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@18 (line 34)
6,47% ↘│││ 0x00000137d24d4b88: mov ecx,dword ptr [r11+r9*4+10h]
│││ ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
15,93% │││ 0x00000137d24d4b8d: cmp ecx,r8d
0,18% │╰│ 0x00000137d24d4b90: jle 137d24d4b80h ;*if_icmple {reexecute=0 rethrow=0 return_oop=0}
│ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@23 (line 34)
0,01% │ ╰ 0x00000137d24d4b92: jmp 137d24d4b77h ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
│ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@15 (line 33)
↘ 0x00000137d24d4b94: test rbx,rbx
这与您的发现一致:两个编译之间的唯一区别是反向跳转条件:jnle
与 jle
。为什么 jnle
变体那么慢?
如果我们仔细查看基准代码,就会发现当前最大值很少发生变化。平均而言,data[i] > result
在整个循环中只有 14 次为真。这意味着,jnle
分支只在约 0.001% 的情况下发生跳转,其余 99.999% 的情况都会继续执行下一条 jmp
指令。
相反,第二个变体中的 jle
指令在 99.999% 的情况下都会发生跳转,执行几乎不会到达其后的 jmp
。因此,第一个循环每次迭代要执行完成(retire)7 条指令,而第二个循环只需 6 条指令。
JMH 内置了 perfnorm
分析器(可在 Linux 上使用),它用 CPU 性能计数器的统计数据来补充基准测试结果。让我们加上 -prof perfnorm
选项来运行基准测试。
Benchmark Mode Cnt Score Error Units
FindMaxBenchmark.findMax_if thrpt 10 1447.576 ± 8.854 ops/s
FindMaxBenchmark.findMax_if:CPI thrpt 0.335 #/op
FindMaxBenchmark.findMax_if:L1-dcache-load-misses thrpt 63971.361 #/op
FindMaxBenchmark.findMax_if:L1-dcache-loads thrpt 1014974.522 #/op
FindMaxBenchmark.findMax_if:L1-dcache-stores thrpt 6105.121 #/op
FindMaxBenchmark.findMax_if:L1-icache-load-misses thrpt 1641.074 #/op
FindMaxBenchmark.findMax_if:branch-misses thrpt 146.305 #/op
FindMaxBenchmark.findMax_if:branches thrpt 3006620.048 #/op
FindMaxBenchmark.findMax_if:cycles thrpt 2358093.526 #/op
FindMaxBenchmark.findMax_if:dTLB-load-misses thrpt 1085.740 #/op
FindMaxBenchmark.findMax_if:dTLB-loads thrpt 1012739.362 #/op
FindMaxBenchmark.findMax_if:dTLB-store-misses thrpt 21.985 #/op
FindMaxBenchmark.findMax_if:dTLB-stores thrpt 6146.243 #/op
FindMaxBenchmark.findMax_if:iTLB-load-misses thrpt 139.741 #/op
FindMaxBenchmark.findMax_if:iTLB-loads thrpt 42.031 #/op
FindMaxBenchmark.findMax_if:instructions thrpt 7039394.622 #/op
FindMaxBenchmark.findMax_if_else thrpt 10 2472.400 ± 36.958 ops/s
FindMaxBenchmark.findMax_if_else:CPI thrpt 0.229 #/op
FindMaxBenchmark.findMax_if_else:L1-dcache-load-misses thrpt 63353.481 #/op
FindMaxBenchmark.findMax_if_else:L1-dcache-loads thrpt 1007856.753 #/op
FindMaxBenchmark.findMax_if_else:L1-dcache-stores thrpt 3696.805 #/op
FindMaxBenchmark.findMax_if_else:L1-icache-load-misses thrpt 1182.253 #/op
FindMaxBenchmark.findMax_if_else:branch-misses thrpt 72.334 #/op
FindMaxBenchmark.findMax_if_else:branches thrpt 2000460.845 #/op
FindMaxBenchmark.findMax_if_else:cycles thrpt 1380927.546 #/op
FindMaxBenchmark.findMax_if_else:dTLB-load-misses thrpt 845.629 #/op
FindMaxBenchmark.findMax_if_else:dTLB-loads thrpt 1006135.685 #/op
FindMaxBenchmark.findMax_if_else:dTLB-store-misses thrpt 13.336 #/op
FindMaxBenchmark.findMax_if_else:dTLB-stores thrpt 3545.950 #/op
FindMaxBenchmark.findMax_if_else:iTLB-load-misses thrpt 80.233 #/op
FindMaxBenchmark.findMax_if_else:iTLB-loads thrpt 19.009 #/op
FindMaxBenchmark.findMax_if_else:instructions thrpt 6018937.376 #/op
性能计数器确认 findMax_if
执行了 7M 条指令和 3M 分支,而 findMax_if_else
执行了 6M 条指令和 2M 分支。我想现在很清楚差异来自哪里,那么其他问题呢?
Is normal to add else statement containing code which doesn't change
anything
我不这么认为。至少因为这看起来违反直觉,并且使代码更难阅读和理解。冗余代码很好地反转了分支条件,这只是运气问题。将您的随机数组替换为已排序的数组,这样 data[i] > result
将大部分为真,然后 findMax_if
将成为最快的选择。
What is the point of using Math.max if simple if else statement has
higher throughput?
同样,这并不总是成立。这在很大程度上取决于数据的性质。当分支很容易预测时,if
语句表现更好。但是一旦分支预测器开始频繁预测失败,性能就会急剧下降。 Math.max
作为 JVM 内在函数(intrinsic),会被编译成无分支的 cmov
指令,其优点是性能稳定,不受数据分布影响。
这是一个示例数据集,其中 Math.max
大大优于所有其他选项:
/**
 * Builds an input array of SIZE random ints sorted in ascending order and
 * then overwrites a random selection of positions with 0.
 */
public void setup() {
    // Publish the sorted array first; the loop below edits it in place.
    this.tab = new Random().ints(SIZE).sorted().toArray();

    final ThreadLocalRandom stepSource = ThreadLocalRandom.current();
    int pos = 0;
    while (pos < tab.length) {
        tab[pos] = 0;
        // nextInt(3) yields 0, 1 or 2, so the same index may be rewritten
        // before the cursor advances.
        pos += stepSource.nextInt(3);
    }
}
我注意到 if else
/ 三元 (condition ? a : b
) 赋值比只有 if
(没有 else)的语句中的条件赋值更快。我在不同的 JDK 上执行了 JMH 基准测试,但这里只关注 JDK 12。
(ops / sec, 越高越好)
源代码:
/**
 * JMH benchmark that compares four source-level ways of finding the maximum
 * of an int array: a bare {@code if}, an {@code if/else} with a no-op else
 * branch, the ternary operator, and {@code Math.max}.
 *
 * <p>The exact shape of each loop body is the thing under test, so the
 * methods must not be "cleaned up" or unified.
 */
@State(Scope.Benchmark)
public class FindMaxBenchmark {
// Number of elements in the array that every benchmark method scans.
public static int SIZE = 1_000_000;
/**
 * Variant 1: conditional assignment with a bare {@code if} (no else branch).
 *
 * @param bh   JMH blackhole that receives the computed maximum
 * @param mock per-thread state holding the input array
 */
@Benchmark
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
public static void findMax_if(Blackhole bh, Mock mock) {
int result = Integer.MIN_VALUE;
int[] data = mock.tab;
for (int i = 0; i < data.length; i++) {
if (data[i] > result) {
result = data[i];
}
}
// Hand the result to the blackhole so the loop's value is actually used.
bh.consume(result);
}
/**
 * Variant 2: same as {@link #findMax_if} but with an explicit else branch
 * that performs a self-assignment.
 *
 * @param bh   JMH blackhole that receives the computed maximum
 * @param mock per-thread state holding the input array
 */
@Benchmark
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
public static void findMax_if_else(Blackhole bh, Mock mock) {
int result = Integer.MIN_VALUE;
int[] data = mock.tab;
for (int i = 0; i < data.length; i++) {
if (data[i] > result) {
result = data[i];
} else {
// Deliberate no-op: this redundant branch is what the benchmark measures.
result = result;
}
}
bh.consume(result);
}
/**
 * Variant 3: the same selection written with the ternary operator.
 *
 * @param bh   JMH blackhole that receives the computed maximum
 * @param mock per-thread state holding the input array
 */
@Benchmark
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
public static void findMax_ternary(Blackhole bh, Mock mock) {
int result = Integer.MIN_VALUE;
int[] data = mock.tab;
for (int i = 0; i < data.length; i++) {
result = data[i] > result ? data[i] : result;
}
bh.consume(result);
}
/**
 * Variant 4: delegates the comparison to {@code Math.max}.
 *
 * @param bh   JMH blackhole that receives the computed maximum
 * @param mock per-thread state holding the input array
 */
@Benchmark
@CompilerControl(CompilerControl.Mode.DONT_INLINE)
public static void findMax_intrinsicMax(Blackhole bh, Mock mock) {
int result = Integer.MIN_VALUE;
int[] data = mock.tab;
for (int i = 0; i < data.length; i++) {
result = Math.max(data[i], result);
}
bh.consume(result);
}
/**
 * Thread-scoped benchmark state that owns the input array.
 */
@State(Scope.Thread)
public static class Mock {
// Input data; the benchmark methods read this field directly as mock.tab.
private int[] tab = new int[SIZE];
/** Returns the current input array (the same instance, not a copy). */
public int[] getTab() {
return tab;
}
/**
 * Replaces the input with SIZE freshly generated random ints.
 * Declared with Level.Iteration, i.e. meant to run once per benchmark iteration.
 */
@Setup(Level.Iteration)
public void setup() {
// Unseeded Random: the data differs from run to run.
Random r = new Random();
this.tab = r.ints(SIZE).toArray();
}
}
}
findMax_if_else
perfasm 输出(三元运算符版本的输出大致相同):
c2, level 4, codes.dbg.FindMaxBenchmark::findMax_if_else, version 493 (165 bytes)
0x00007fc7a8671a6b: cmp r8d,ebp
╭ 0x00007fc7a8671a6e: jae 0x00007fc7a8671b3d
│ 0x00007fc7a8671a74: mov edx,DWORD PTR [r9+0x10] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
│ 0x00007fc7a8671a78: cmp edx,0x80000000
│╭ 0x00007fc7a8671a7e: jg 0x00007fc7a8671a85 ;*if_icmple {reexecute=0 rethrow=0 return_oop=0}
││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@23 (line 34)
││ 0x00007fc7a8671a80: mov edx,0x80000000 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
│↘ 0x00007fc7a8671a85: mov ebx,ebp
0.02% │ 0x00007fc7a8671a87: add ebx,0xfffffffd
│ 0x00007fc7a8671a8a: cmp r8d,ebx
│ 0x00007fc7a8671a8d: cmovl ebx,r11d
│ 0x00007fc7a8671a91: mov r8d,0x1
0.00% │ 0x00007fc7a8671a97: cmp ebx,0x1
│ ╭ 0x00007fc7a8671a9a: jle 0x00007fc7a8671b00
│ │ 0x00007fc7a8671a9c: mov rdi,r9 ;*goto {reexecute=0 rethrow=0 return_oop=0}
│ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@39 (line 33)
│ │╭ 0x00007fc7a8671a9f: jmp 0x00007fc7a8671ab9
0.01% │ ││ ↗ 0x00007fc7a8671aa1: mov edx,ecx
│ ││ │ 0x00007fc7a8671aa3: nop DWORD PTR [rax+0x0]
│ ││ │ 0x00007fc7a8671aaa: nop WORD PTR [rax+rax*1+0x0]
8.06% │ ││ ↗│ 0x00007fc7a8671ab0: add r8d,0x4 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ││ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
11.38% │ ││ ││ 0x00007fc7a8671ab4: cmp r8d,ebx
13.63% │ ││╭ ││ 0x00007fc7a8671ab7: jge 0x00007fc7a8671af1 ;*aload_3 {reexecute=0 rethrow=0 return_oop=0}
│ │││ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@18 (line 34)
3.02% │ │↘│ ││ ↗ 0x00007fc7a8671ab9: mov r11d,DWORD PTR [r9+r8*4+0x10] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
8.53% │ │ │ ││ │ 0x00007fc7a8671abe: cmp r11d,edx
4.54% │ │ │╭ ││ │ 0x00007fc7a8671ac1: jg 0x00007fc7a8671ae2 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ │ ││ ││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
4.96% │ │ ││ ││↗ │ 0x00007fc7a8671ac3: mov r11d,DWORD PTR [r9+r8*4+0x14] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ ││ │││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
3.73% │ │ ││ │││ │ 0x00007fc7a8671ac8: cmp r11d,edx
9.19% │ │ ││╭ │││ │ 0x00007fc7a8671acb: jg 0x00007fc7a8671ae7 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ │ │││ │││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
3.70% │ │ │││ │││↗ │ 0x00007fc7a8671acd: mov r11d,DWORD PTR [r9+r8*4+0x18] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ │││ ││││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
4.96% │ │ │││ ││││ │ 0x00007fc7a8671ad2: cmp r11d,edx
4.45% │ │ │││╭││││ │ 0x00007fc7a8671ad5: jg 0x00007fc7a8671aec ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ │ ││││││││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
8.55% │ │ ││││││││↗│ 0x00007fc7a8671ad7: mov ecx,DWORD PTR [r9+r8*4+0x1c] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ ││││││││││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
6.11% │ │ ││││││││││ 0x00007fc7a8671adc: cmp ecx,edx
2.48% │ │ ││││╰│││││ 0x00007fc7a8671ade: jle 0x00007fc7a8671ab0 ;*if_icmple {reexecute=0 rethrow=0 return_oop=0}
│ │ ││││ │││││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@23 (line 34)
│ │ ││││ ╰││││ 0x00007fc7a8671ae0: jmp 0x00007fc7a8671aa1
│ │ │↘││ ││││ 0x00007fc7a8671ae2: mov edx,r11d
0.00% │ │ │ ││ ╰│││ 0x00007fc7a8671ae5: jmp 0x00007fc7a8671ac3
0.00% │ │ │ ↘│ │││ 0x00007fc7a8671ae7: mov edx,r11d
0.00% │ │ │ │ ╰││ 0x00007fc7a8671aea: jmp 0x00007fc7a8671acd
0.00% │ │ │ ↘ ││ 0x00007fc7a8671aec: mov edx,r11d
0.00% │ │ │ ╰│ 0x00007fc7a8671aef: jmp 0x00007fc7a8671ad7
│ │ ↘ │ 0x00007fc7a8671af1: mov r11,QWORD PTR [r15+0x108] ; ImmutableOopMap{r10=Oop r9=NarrowOop rdi=Oop }
│ │ │ ;*goto {reexecute=1 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@39 (line 33)
│ │ │ 0x00007fc7a8671af8: test DWORD PTR [r11],eax ;*goto {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@39 (line 33)
│ │ │ ; {poll}
│ │ │ 0x00007fc7a8671afb: cmp r8d,ebx
0.00% │ │ ╰ 0x00007fc7a8671afe: jl 0x00007fc7a8671ab9
│ ↘ 0x00007fc7a8671b00: cmp r8d,ebp
0.00% │ ╭ 0x00007fc7a8671b03: jge 0x00007fc7a8671b1a
│ │ 0x00007fc7a8671b05: data16 xchg ax,ax ;*aload_3 {reexecute=0 rethrow=0 return_oop=0}
│ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@18 (line 34)
│ │ ↗ 0x00007fc7a8671b08: mov r11d,DWORD PTR [r9+r8*4+0x10] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
0.01% │ │ │ 0x00007fc7a8671b0d: cmp r11d,edx
│ │╭│ 0x00007fc7a8671b10: jg 0x00007fc7a8671b38
│ │││↗ 0x00007fc7a8671b12: inc r8d ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ││││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
│ ││││ 0x00007fc7a8671b15: cmp r8d,ebp
│ ││╰│ 0x00007fc7a8671b18: jl 0x00007fc7a8671b08 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
│ ││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@15 (line 33)
│ ↘│ │ 0x00007fc7a8671b1a: test r10,r10
0.00% │ │ │ 0x00007fc7a8671b1d: je 0x00007fc7a8671b52
│ │ │ 0x00007fc7a8671b1f: mov rsi,r10
│ │ │ 0x00007fc7a8671b22: nop
│ │ │ 0x00007fc7a8671b23: call 0x00007fc7a8671ba0 ; ImmutableOopMap{}
│ │ │ ;*invokevirtual consume {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@44 (line 41)
│ │ │ ; {optimized virtual_call}
│ │ │ 0x00007fc7a8671b28: add rsp,0x20
0.01% │ │ │ 0x00007fc7a8671b2c: pop rbp
│ │ │ 0x00007fc7a8671b2d: mov r10,QWORD PTR [r15+0x108]
│ │ │ 0x00007fc7a8671b34: test DWORD PTR [r10],eax ; {poll_return}
│ │ │ 0x00007fc7a8671b37: ret
│ ↘ │ 0x00007fc7a8671b38: mov edx,r11d
│ ╰ 0x00007fc7a8671b3b: jmp 0x00007fc7a8671b12
↘ 0x00007fc7a8671b3d: mov esi,0xffffff7e
0x00007fc7a8671b42: mov QWORD PTR [rsp],r10
0x00007fc7a8671b46: mov DWORD PTR [rsp+0x8],r9d
0x00007fc7a8671b4b: call 0x00007fc7a0ba3d00 ; ImmutableOopMap{[0]=Oop [8]=NarrowOop }
;*if_icmpge {reexecute=1 rethrow=0 return_oop=0}
findMax_if
perfasm 输出:
c2, level 4, codes.dbg.FindMaxBenchmark::findMax_if, version 480 (165 bytes)
0x00007f34cc66e7eb: cmp r8d,ebp
╭ 0x00007f34cc66e7ee: jae 0x00007f34cc66e8c4
│ 0x00007f34cc66e7f4: mov edx,DWORD PTR [r9+0x10] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
│ 0x00007f34cc66e7f8: cmp edx,0x80000000
│╭ 0x00007f34cc66e7fe: jg 0x00007f34cc66e805 ;*if_icmple {reexecute=0 rethrow=0 return_oop=0}
││ ; - codes.dbg.FindMaxBenchmark::findMax_if@23 (line 19)
││ 0x00007f34cc66e800: mov edx,0x80000000 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
││ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
│↘ 0x00007f34cc66e805: mov ebx,ebp
0.01% │ 0x00007f34cc66e807: add ebx,0xfffffffd
│ 0x00007f34cc66e80a: cmp r8d,ebx
│ 0x00007f34cc66e80d: cmovl ebx,r11d
│ 0x00007f34cc66e811: mov r8d,0x1
│ 0x00007f34cc66e817: cmp ebx,0x1
│ ╭ 0x00007f34cc66e81a: jle 0x00007f34cc66e880
│ │ 0x00007f34cc66e81c: mov rdi,r9 ;*goto {reexecute=0 rethrow=0 return_oop=0}
│ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@34 (line 18)
│ │╭ 0x00007f34cc66e81f: jmp 0x00007f34cc66e839
│ ││ ↗ 0x00007f34cc66e821: mov edx,ecx
0.00% │ ││ │ 0x00007f34cc66e823: nop DWORD PTR [rax+0x0]
│ ││ │ 0x00007f34cc66e82a: nop WORD PTR [rax+rax*1+0x0]
0.89% │ ││ │↗ 0x00007f34cc66e830: add r8d,0x4 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ││ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
12.36% │ ││ ││ 0x00007f34cc66e834: cmp r8d,ebx
0.11% │ ││╭ ││ 0x00007f34cc66e837: jge 0x00007f34cc66e871 ;*aload_3 {reexecute=0 rethrow=0 return_oop=0}
│ │││ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if@18 (line 19)
9.94% │ │↘│ ││ ↗ 0x00007f34cc66e839: mov r11d,DWORD PTR [r9+r8*4+0x10] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
0.11% │ │ │ ││ │ 0x00007f34cc66e83e: cmp r11d,edx
10.05% │ │ │╭ ││ │ 0x00007f34cc66e841: jg 0x00007f34cc66e862 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ │ ││ ││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
0.13% │ │ ││ ││↗ │ 0x00007f34cc66e843: mov r11d,DWORD PTR [r9+r8*4+0x14] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ ││ │││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
9.84% │ │ ││ │││ │ 0x00007f34cc66e848: cmp r11d,edx
0.11% │ │ ││╭ │││ │ 0x00007f34cc66e84b: jg 0x00007f34cc66e867 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ │ │││ │││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
10.02% │ │ │││ │││↗ │ 0x00007f34cc66e84d: mov r11d,DWORD PTR [r9+r8*4+0x18] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ │││ ││││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
0.33% │ │ │││ ││││ │ 0x00007f34cc66e852: cmp r11d,edx
23.63% │ │ │││╭││││ │ 0x00007f34cc66e855: jg 0x00007f34cc66e86c ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ │ ││││││││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
0.13% │ │ ││││││││↗│ 0x00007f34cc66e857: mov ecx,DWORD PTR [r9+r8*4+0x1c] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ ││││││││││ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
9.89% │ │ ││││││││││ 0x00007f34cc66e85c: cmp ecx,edx
0.11% │ │ ││││╰│││││ 0x00007f34cc66e85e: jg 0x00007f34cc66e821 ;*if_icmple {reexecute=0 rethrow=0 return_oop=0}
│ │ ││││ │││││ ; - codes.dbg.FindMaxBenchmark::findMax_if@23 (line 19)
9.71% │ │ ││││ ╰││││ 0x00007f34cc66e860: jmp 0x00007f34cc66e830
│ │ │↘││ ││││ 0x00007f34cc66e862: mov edx,r11d
0.00% │ │ │ ││ ╰│││ 0x00007f34cc66e865: jmp 0x00007f34cc66e843
│ │ │ ↘│ │││ 0x00007f34cc66e867: mov edx,r11d
0.00% │ │ │ │ ╰││ 0x00007f34cc66e86a: jmp 0x00007f34cc66e84d
│ │ │ ↘ ││ 0x00007f34cc66e86c: mov edx,r11d
0.00% │ │ │ ╰│ 0x00007f34cc66e86f: jmp 0x00007f34cc66e857
│ │ ↘ │ 0x00007f34cc66e871: mov r11,QWORD PTR [r15+0x108] ; ImmutableOopMap{r10=Oop r9=NarrowOop rdi=Oop }
│ │ │ ;*goto {reexecute=1 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@34 (line 18)
0.00% │ │ │ 0x00007f34cc66e878: test DWORD PTR [r11],eax ;*goto {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@34 (line 18)
│ │ │ ; {poll}
│ │ │ 0x00007f34cc66e87b: cmp r8d,ebx
│ │ ╰ 0x00007f34cc66e87e: jl 0x00007f34cc66e839
│ ↘ 0x00007f34cc66e880: cmp r8d,ebp
0.00% │ ╭ 0x00007f34cc66e883: jge 0x00007f34cc66e89a
│ │ 0x00007f34cc66e885: data16 xchg ax,ax ;*aload_3 {reexecute=0 rethrow=0 return_oop=0}
│ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@18 (line 19)
0.00% │ │ ↗ 0x00007f34cc66e888: mov r11d,DWORD PTR [r9+r8*4+0x10] ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│ │ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
0.01% │ │ │ 0x00007f34cc66e88d: cmp r11d,edx
│ │╭│ 0x00007f34cc66e890: jg 0x00007f34cc66e8b8
│ │││↗ 0x00007f34cc66e892: inc r8d ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ││││ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
│ ││││ 0x00007f34cc66e895: cmp r8d,ebp
│ ││╰│ 0x00007f34cc66e898: jl 0x00007f34cc66e888 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
│ ││ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@15 (line 18)
│ ↘│ │↗ 0x00007f34cc66e89a: test r10,r10
0.00% │ │ ││ 0x00007f34cc66e89d: je 0x00007f34cc66e8da
│ │ ││ 0x00007f34cc66e89f: mov rsi,r10
│ │ ││ 0x00007f34cc66e8a2: nop
│ │ ││ 0x00007f34cc66e8a3: call 0x00007f34cc66e920 ; ImmutableOopMap{}
│ │ ││ ;*invokevirtual consume {reexecute=0 rethrow=0 return_oop=0}
│ │ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if@39 (line 24)
│ │ ││ ; {optimized virtual_call}
0.00% │ │ ││ 0x00007f34cc66e8a8: add rsp,0x20
0.01% │ │ ││ 0x00007f34cc66e8ac: pop rbp
│ │ ││ 0x00007f34cc66e8ad: mov r10,QWORD PTR [r15+0x108]
│ │ ││ 0x00007f34cc66e8b4: test DWORD PTR [r10],eax ; {poll_return}
│ │ ││ 0x00007f34cc66e8b7: ret
│ ↘ ││ 0x00007f34cc66e8b8: mov edx,r11d
│ ╰│ 0x00007f34cc66e8bb: jmp 0x00007f34cc66e892
│ │ 0x00007f34cc66e8bd: mov edx,0x80000000
│ ╰ 0x00007f34cc66e8c2: jmp 0x00007f34cc66e89a
↘ 0x00007f34cc66e8c4: mov esi,0xffffff7e
0x00007f34cc66e8c9: mov QWORD PTR [rsp],r10
0x00007f34cc66e8cd: mov DWORD PTR [rsp+0x8],r9d
....................................................................................................
观察:
findMax_if
和findMax_if_else
之间只有一个显著差异:
0x00007f34cc66e85e: jg 0x00007f34cc66e821
对比 0x00007fc7a8671ade: jle 0x00007fc7a8671ab0
findMax_intrinsicMax
(它利用了 JVM 内在函数 Math.max)
性能最差,这对我来说是违反直觉的。
问题:
- 添加
else
语句,其中包含不改变任何东西的代码(如 x = x;
)是否正常?特别是在一个线程上执行的代码中。 - 吞吐量差异的真正来源在哪里?我看到
jg
(如果大于则跳转)不是jle
(如果小于或等于则跳转)。实际上,第一个条件正是第二个条件取反的结果。 - 如果简单的
if else
语句具有更高的吞吐量,使用Math.max
有什么意义?
run_tests.sh
运行基准测试并生成绘图。
首先,为了尽量减少不相关的ASM代码量和简化分析,让我们添加以下JVM选项:
-XX:LoopUnrollLimit=0
- 关闭循环展开;-XX:-UseCountedLoopSafepoints
- 从循环中消除安全点轮询。
现在 if 与 if_else
之间的性能差异会更大,而生成的汇编代码会更简单。这是两个基准测试的循环体。
findMax_if
╭ 0x0000029707af78f5: jmp 29707af7908h
│ ↗ 0x0000029707af78f7: mov r8d,ecx
│ │ 0x0000029707af78fa: nop word ptr [rax+rax+0h]
0,66% │ │↗ 0x0000029707af7900: inc r9d ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if@31 (line 18)
1,02% │ ││ 0x0000029707af7903: cmp r9d,r10d
│╭││ 0x0000029707af7906: jnl 29707af7914h ;*aload_3 {reexecute=0 rethrow=0 return_oop=0}
││││ ; - codes.dbg.FindMaxBenchmark::findMax_if@18 (line 19)
2,06% ↘│││ 0x0000029707af7908: mov ecx,dword ptr [r11+r9*4+10h]
│││ ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│││ ; - codes.dbg.FindMaxBenchmark::findMax_if@21 (line 19)
50,86% │││ 0x0000029707af790d: cmp ecx,r8d
0,02% │╰│ 0x0000029707af7910: jnle 29707af78f7h ;*if_icmple {reexecute=0 rethrow=0 return_oop=0}
│ │ ; - codes.dbg.FindMaxBenchmark::findMax_if@23 (line 19)
41,01% │ ╰ 0x0000029707af7912: jmp 29707af7900h ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
│ ; - codes.dbg.FindMaxBenchmark::findMax_if@15 (line 18)
↘ 0x0000029707af7914: test rbx,rbx
findMax_if_else
╭ 0x00000137d24d4b75: jmp 137d24d4b88h
│ ↗ 0x00000137d24d4b77: mov r8d,ecx
│ │ 0x00000137d24d4b7a: nop word ptr [rax+rax+0h]
72,63% │ ↗│ 0x00000137d24d4b80: inc r9d ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@36 (line 33)
0,05% │ ││ 0x00000137d24d4b83: cmp r9d,r10d
0,01% │╭││ 0x00000137d24d4b86: jnl 137d24d4b94h ;*aload_3 {reexecute=0 rethrow=0 return_oop=0}
││││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@18 (line 34)
6,47% ↘│││ 0x00000137d24d4b88: mov ecx,dword ptr [r11+r9*4+10h]
│││ ;*iaload {reexecute=0 rethrow=0 return_oop=0}
│││ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@21 (line 34)
15,93% │││ 0x00000137d24d4b8d: cmp ecx,r8d
0,18% │╰│ 0x00000137d24d4b90: jle 137d24d4b80h ;*if_icmple {reexecute=0 rethrow=0 return_oop=0}
│ │ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@23 (line 34)
0,01% │ ╰ 0x00000137d24d4b92: jmp 137d24d4b77h ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0}
│ ; - codes.dbg.FindMaxBenchmark::findMax_if_else@15 (line 33)
↘ 0x00000137d24d4b94: test rbx,rbx
这与您的发现一致:两个编译之间的唯一区别是反向跳转条件:jnle
与 jle
。为什么 jnle
变体那么慢?
如果我们仔细查看基准代码,就会发现当前最大值发生变化的情况非常少。平均而言,data[i] > result
在整个循环中只有 14 次为真。这意味着,jnle
分支只在约 0.001% 的迭代中发生跳转,其余 99.999% 的迭代都会继续执行下一条 jmp
指令。
相反,第二个变体中的 jle
指令在 99.999% 的迭代中都会发生跳转,执行流几乎从不到达下面的 jmp
。因此,第一个循环每次迭代要执行完成(retire)7 条指令,而第二个循环只需 6 条指令。
JMH 有内置的 perfnorm
分析器(可在 Linux 上使用),它用 CPU 性能计数器的统计数据来补充基准测试结果。让我们加上 -prof perfnorm
选项来运行它。
Benchmark Mode Cnt Score Error Units
FindMaxBenchmark.findMax_if thrpt 10 1447.576 ± 8.854 ops/s
FindMaxBenchmark.findMax_if:CPI thrpt 0.335 #/op
FindMaxBenchmark.findMax_if:L1-dcache-load-misses thrpt 63971.361 #/op
FindMaxBenchmark.findMax_if:L1-dcache-loads thrpt 1014974.522 #/op
FindMaxBenchmark.findMax_if:L1-dcache-stores thrpt 6105.121 #/op
FindMaxBenchmark.findMax_if:L1-icache-load-misses thrpt 1641.074 #/op
FindMaxBenchmark.findMax_if:branch-misses thrpt 146.305 #/op
FindMaxBenchmark.findMax_if:branches thrpt 3006620.048 #/op
FindMaxBenchmark.findMax_if:cycles thrpt 2358093.526 #/op
FindMaxBenchmark.findMax_if:dTLB-load-misses thrpt 1085.740 #/op
FindMaxBenchmark.findMax_if:dTLB-loads thrpt 1012739.362 #/op
FindMaxBenchmark.findMax_if:dTLB-store-misses thrpt 21.985 #/op
FindMaxBenchmark.findMax_if:dTLB-stores thrpt 6146.243 #/op
FindMaxBenchmark.findMax_if:iTLB-load-misses thrpt 139.741 #/op
FindMaxBenchmark.findMax_if:iTLB-loads thrpt 42.031 #/op
FindMaxBenchmark.findMax_if:instructions thrpt 7039394.622 #/op
FindMaxBenchmark.findMax_if_else thrpt 10 2472.400 ± 36.958 ops/s
FindMaxBenchmark.findMax_if_else:CPI thrpt 0.229 #/op
FindMaxBenchmark.findMax_if_else:L1-dcache-load-misses thrpt 63353.481 #/op
FindMaxBenchmark.findMax_if_else:L1-dcache-loads thrpt 1007856.753 #/op
FindMaxBenchmark.findMax_if_else:L1-dcache-stores thrpt 3696.805 #/op
FindMaxBenchmark.findMax_if_else:L1-icache-load-misses thrpt 1182.253 #/op
FindMaxBenchmark.findMax_if_else:branch-misses thrpt 72.334 #/op
FindMaxBenchmark.findMax_if_else:branches thrpt 2000460.845 #/op
FindMaxBenchmark.findMax_if_else:cycles thrpt 1380927.546 #/op
FindMaxBenchmark.findMax_if_else:dTLB-load-misses thrpt 845.629 #/op
FindMaxBenchmark.findMax_if_else:dTLB-loads thrpt 1006135.685 #/op
FindMaxBenchmark.findMax_if_else:dTLB-store-misses thrpt 13.336 #/op
FindMaxBenchmark.findMax_if_else:dTLB-stores thrpt 3545.950 #/op
FindMaxBenchmark.findMax_if_else:iTLB-load-misses thrpt 80.233 #/op
FindMaxBenchmark.findMax_if_else:iTLB-loads thrpt 19.009 #/op
FindMaxBenchmark.findMax_if_else:instructions thrpt 6018937.376 #/op
性能计数器确认 findMax_if
执行了 7M 条指令和 3M 分支,而 findMax_if_else
执行了 6M 条指令和 2M 分支。我想现在很清楚差异来自哪里,那么其他问题呢?
Is normal to add else statement containing code which doesn't change anything
我不这么认为。至少因为这看起来违反直觉,并且使代码更难阅读和理解。冗余代码很好地反转了分支条件,这只是运气问题。将您的随机数组替换为已排序的数组,这样 data[i] > result
将大部分为真,然后 findMax_if
将成为最快的选择。
What is the point of using Math.max if simple if else statement has higher throughput?
同样,这并不总是正确的。这在很大程度上取决于数据的性质。当分支很容易预测时,if
语句表现更好。但是一旦分支预测器开始经常失败,性能就会急剧下降。 Math.max
作为 JVM 内在函数(intrinsic),会被编译成无分支的 cmov
指令,其优点是性能稳定,不受数据分布的影响。
这是一个示例数据集,其中 Math.max
大大优于所有其他选项:
/**
 * Rebuilds the benchmark input: a sorted array of {@code SIZE} random ints
 * in which a random subset of positions is overwritten with zero.
 */
public void setup() {
    // Sorted random values form the baseline data set.
    Random generator = new Random();
    this.tab = generator.ints(SIZE).sorted().toArray();

    // Walk forward from index 0, zeroing the current slot and then advancing
    // by a random step of 0, 1 or 2. A step of 0 simply re-zeroes the same slot.
    int index = 0;
    while (index < tab.length) {
        tab[index] = 0;
        index += ThreadLocalRandom.current().nextInt(3);
    }
}