如果在 Intel Skylake CPU 上作为函数调用，为什么我的空循环运行快两倍？

Question

我在做一些测试，将 C 与 Java 进行比较，结果发现了一件有趣的事。使用优化级别 1（-O1）编译完全相同的基准代码时，把它放在由 main 调用的函数中运行，而不是直接放在 main 里，性能大约提高了一倍。我打印了 size_t 的大小，以确认代码确实被编译为 x64。

我把可执行文件发给了一位使用 i7-7700HQ 的朋友，他得到了类似的结果。我自己使用的是 i7-6700。

这是较慢的代码：

#include <stdio.h>
#include <time.h>
#include <stdint.h>

/*
 * Slow variant: the timed loop lives directly in main().
 * Prints sizeof(size_t), then the number of clock() ticks that elapsed
 * across an empty 10,000,000,000-iteration loop.
 * Kept token-for-token: the assembly listing that follows was generated
 * from exactly this source.
 */
int main() {
    /* NOTE(review): %I64u is a Microsoft-runtime conversion, not standard C;
       the standard spelling for size_t is %zu -- confirm the target C
       runtime supports it before changing. */
    printf("Size = %I64u\n", sizeof(size_t));
    /* NOTE(review): clock() returns clock_t but is stored in an int here. */
    int start = clock();
    /* The loop body is empty; only the loop overhead itself is timed. */
    for(int64_t i = 0; i < 10000000000L; i++) {
        
    }
    /* Elapsed time in raw clock() ticks (not divided by CLOCKS_PER_SEC). */
    printf("%ld\n", clock() - start);
    return 0;
}

以及较快的代码：

#include <stdio.h>
#include <time.h>
#include <stdint.h>

/*
 * Fast variant: the same benchmark body as the slow version, but placed in
 * a separate function that main() calls.
 * Prints sizeof(size_t), then the number of clock() ticks that elapsed
 * across an empty 10,000,000,000-iteration loop.
 * Kept token-for-token: the assembly listing that follows was generated
 * from exactly this source.
 */
void test() {
    /* NOTE(review): %I64u is a Microsoft-runtime conversion, not standard C;
       the standard spelling for size_t is %zu -- confirm the target C
       runtime supports it before changing. */
    printf("Size = %I64u\n", sizeof(size_t));
    /* NOTE(review): clock() returns clock_t but is stored in an int here. */
    int start = clock();
    /* The loop body is empty; only the loop overhead itself is timed. */
    for(int64_t i = 0; i < 10000000000L; i++) {
        
    }
    /* Elapsed time in raw clock() ticks (not divided by CLOCKS_PER_SEC). */
    printf("%ld\n", clock() - start);
}

/* Entry point of the fast variant: runs the benchmark via test() and exits with status 0. */
int main() {
    test();
    return 0;
}

我也附上汇编代码供各位深入研究——我本人不懂汇编。较慢的版本：

    # ------------------------------------------------------------------
    # Compiler output for the SLOW variant (timed loop inlined in main).
    # Syntax: AT&T / GAS.  Target: x86-64 Windows (MinGW-w64, SEH unwind
    # directives).  ABI: Microsoft x64 (args in rcx, rdx; 32-byte shadow
    # space reserved by the caller).
    # The immediates ($32, $8, $10000000000, $1, $0) and the "\12\0"
    # string terminators were lost in extraction and are restored here
    # from the .seh_stackalloc values and the C source.
    # Instruction selection and order are kept exactly as emitted: the
    # accompanying discussion depends on the exact code layout.
    # ------------------------------------------------------------------
    .file   "dummy.c"
    .text
    .def    __main; .scl    2;  .type   32; .endef
    .section .rdata,"dr"
    # printf format strings; \12 is '\n', \0 is the terminating NUL.
.LC0:
    .ascii "Size = %I64u\12\0"
.LC1:
    .ascii "%ld\12\0"
    .text
    .globl  main
    .def    main;   .scl    2;  .type   32; .endef
    .seh_proc   main
    # int main(void)
    # Out:   eax = 0
    # Regs:  rbx = start tick count (callee-saved, so it survives the calls)
    #        rax = remaining loop iterations inside .L2
main:
    pushq   %rbx                        # preserve callee-saved rbx
    .seh_pushreg    %rbx
    subq    $32, %rsp                   # shadow space for the calls below
    .seh_stackalloc 32
    .seh_endprologue
    call    __main
    movl    $8, %edx                    # arg 2: sizeof(size_t)
    leaq    .LC0(%rip), %rcx            # arg 1: "Size = %I64u\n"
    call    printf
    call    clock
    movl    %eax, %ebx                  # start = clock()
    movabsq $10000000000, %rax          # iteration count, counted down to 0
.L2:
    subq    $1, %rax                    # the op/branch pair the answer discusses
    jne     .L2
    call    clock
    subl    %ebx, %eax                  # elapsed = clock() - start
    movl    %eax, %edx                  # arg 2: elapsed ticks
    leaq    .LC1(%rip), %rcx            # arg 1: "%ld\n"
    call    printf
    movl    $0, %eax                    # return 0
    addq    $32, %rsp
    popq    %rbx
    ret
    .seh_endproc
    .ident  "GCC: (x86_64-posix-seh-rev0, Built by MinGW-W64 project) 8.1.0"
    .def    printf; .scl    2;  .type   32; .endef
    .def    clock;  .scl    2;  .type   32; .endef

更快：

    # ------------------------------------------------------------------
    # Compiler output for the FAST variant (timed loop in test(), which
    # main calls).
    # Syntax: AT&T / GAS.  Target: x86-64 Windows (MinGW-w64, SEH unwind
    # directives).  ABI: Microsoft x64 (args in rcx, rdx; 32-byte shadow
    # space reserved by the caller).
    # The immediates ($32, $8, $10000000000, $1) and the "\12\0" string
    # terminators were lost in extraction and are restored here from the
    # .seh_stackalloc value and the C source.
    # Instruction selection and order are kept exactly as emitted: the
    # accompanying discussion depends on the exact code layout.
    # ------------------------------------------------------------------
    .file   "dummy.c"
    .text
    .section .rdata,"dr"
    # printf format strings; \12 is '\n', \0 is the terminating NUL.
.LC0:
    .ascii "Size = %I64u\12\0"
.LC1:
    .ascii "%ld\12\0"
    .text
    .globl  test
    .def    test;   .scl    2;  .type   32; .endef
    .seh_proc   test
    # void test(void)
    # Regs:  rbx = start tick count (callee-saved, so it survives the calls)
    #        rax = remaining loop iterations inside .L2
test:
    pushq   %rbx                        # preserve callee-saved rbx
    .seh_pushreg    %rbx
    subq    $32, %rsp                   # shadow space for the calls below
    .seh_stackalloc 32
    .seh_endprologue
    movl    $8, %edx                    # arg 2: sizeof(size_t)
    leaq    .LC0(%rip), %rcx            # arg 1: "Size = %I64u\n"
    call    printf
    call    clock
    movl    %eax, %ebx                  # start = clock()
    movabsq $10000000000, %rax          # iteration count, counted down to 0
.L2:
    subq    $1, %rax                    # same op/branch pair as the slow variant
    jne     .L2
    call    clock
    subl    %ebx, %eax                  # elapsed = clock() - start
    movl    %eax, %edx                  # arg 2: elapsed ticks
    leaq    .LC1(%rip), %rcx            # arg 1: "%ld\n"
    call    printf
    nop
    addq    $32, %rsp
    popq    %rbx
    ret
    .seh_endproc
    # int main(void) -- entry point of the fast variant: calls test(), returns 0.
    # Syntax: AT&T / GAS.  ABI: Microsoft x64.
    # The $40 and $0 immediates were lost in extraction; they are restored
    # from the .seh_stackalloc value and the C source's `return 0`.
    .def    __main; .scl    2;  .type   32; .endef
    .globl  main
    .def    main;   .scl    2;  .type   32; .endef
    .seh_proc   main
main:
    subq    $40, %rsp                   # 32-byte shadow space + 8 to keep rsp 16-byte aligned at the calls
    .seh_stackalloc 40
    .seh_endprologue
    call    __main
    call    test
    movl    $0, %eax                    # return 0
    addq    $40, %rsp
    ret
    .seh_endproc
    .ident  "GCC: (x86_64-posix-seh-rev0, Built by MinGW-W64 project) 8.1.0"
    .def    printf; .scl    2;  .type   32; .endef
    .def    clock;  .scl    2;  .type   32; .endef

这是我的编译批处理脚本：

@echo off
rem Prompt for a C source file name (without the .c extension), build it
rem with GCC at -O1 into compiled.exe, then run the result.
set /p "file= File to compile: "
rem Remove any previous build so a failed compile cannot run a stale binary;
rem the guard avoids del's error message when there is nothing to delete.
if exist compiled.exe del compiled.exe
rem Quote the path so file names containing spaces work.
gcc -Wall -Wextra -std=c17 -O1 -o compiled.exe "%file%.c"
rem Only run the program if the compile actually produced it.
if exist compiled.exe (compiled.exe) else (echo Build failed: compiled.exe was not produced.)
PAUSE

并编译为汇编：

@echo off
rem Prompt for a C source file name (without the .c extension) and have GCC
rem emit its assembly listing (-S) at -O1 instead of building an executable.
set /p "file= File to compile: "
rem Remove a previous listing if present; the guard avoids del's error
rem message when there is nothing to delete. Paths are quoted so file
rem names containing spaces work.
if exist "%file%.s" del "%file%.s"
gcc -S -Wall -Wextra -std=c17 -O1 "%file%.c"
PAUSE

Answer 1

慢版：

请注意，sub rax, 1 / jne 这对指令正好跨越了 ..80 处的边界（这是一个 32 字节边界）。这正是 Intel 关于该问题的文档中提到的情形之一，如图所示：

因此，这对 op/branch 指令会受到 JCC 勘误修复的影响（这会导致它无法被缓存在 µop 缓存中）。我不确定这是否就是原因，也可能还有其他因素在起作用，但这至少是一个因素。

在快速版本中，分支没有“触及”32 字节边界，因此不受影响。

可能还有其他影响因素。同样由于跨越了 32 字节边界，在较慢的版本中，循环被分散到 µop 缓存的 2 个块中；即使没有针对 JCC 勘误的修复，如果循环无法从 Loop Stream Detector（LSD）执行（在某些处理器上，LSD 已被针对另一个勘误 SKL150 的修复禁用），这也可能导致循环每次迭代需要 2 个周期。另可参见关于此类循环性能的相关回答（原文链接在此处丢失）。

针对多条评论提到的无法复现该现象的情况：是的，这可能有多种原因：

无论是哪种效应导致了减速，它很可能都源于这对 op/branch 指令恰好被放置在跨越 32 字节边界的位置，而这纯属偶然。从源代码重新编译不太可能重现同样的情形，除非您使用与原提问者相同的编译器和相同的设置。
即使使用同一个二进制文件，无论是哪种效应所致，这种奇怪的现象也只会出现在特定的处理器上。

如果在 Intel Skylake CPU 上作为函数调用，为什么我的空循环运行快两倍？

Why does my empty loop run twice as fast if called as a function, on Intel Skylake CPUs?

c

performance

assembly

x86-64

cpu-architecture

如果在 Intel Skylake CPU 上作为函数调用，为什么我的空循环运行快两倍？

Why does my empty loop run twice as fast if called as a function, on Intel Skylake CPUs?

c

performance

assembly

x86-64

cpu-architecture

如果在 Intel Skylake CPU 上作为函数调用，为什么我的空循环运行快两倍？