我的 CPU 可以在每个 CPU 周期执行多个 NOP 吗?
Can my CPU execute multiple NOPs per CPU cycle?
我写了一个简单的程序,在循环中执行一堆 NOP 指令,令我惊讶的是它每秒执行大约 10600000000 条指令,或大约 10Ghz,而我的 CPU 只有 2.2GHz。
这怎么可能? CPU 是将它们视为单个 mega-NOP,还是我只是发现了 "instruction level parallelism" 的含义?
什么是每秒指令数的更好衡量标准?执行添加指令仅达到 414900000/s,我报告的 bogomips 的十分之一 CPU:4390.03
C代码:
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#define ten(a) a a a a a a a a a a
#define hundred(a) ten(a) ten(a) ten(a) ten(a) ten(a) ten(a) ten(a) \
ten(a) ten(a) ten(a)
#define ITER 10000000
int main(void) {
uint64_t i=0;
uint64_t t=time(NULL);
while(1) {
for(int j=0; j<ITER;j++) {
hundred(asm volatile ("nop");)
}
i+=ITER*100;
printf("%lu/%lu\n", i, time(NULL)-t);
}
return 0;
}
编译的程序集:
.file "gbloopinc.c"
.section .rodata
.LC0:
.string "%lu/%lu\n"
.text
.globl main
.type main, @function
main:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq , %rsp
movq [=13=], -16(%rbp)
movl [=13=], %edi
call time
movq %rax, -8(%rbp)
.L4:
movl [=13=], -20(%rbp)
jmp .L2
.L3:
#APP
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
#NO_APP
addl , -20(%rbp)
.L2:
cmpl 99999, -20(%rbp)
jle .L3
addq 00000000, -16(%rbp)
movl [=13=], %edi
call time
subq -8(%rbp), %rax
movq %rax, %rdx
movq -16(%rbp), %rax
movq %rax, %rsi
movl $.LC0, %edi
movl [=13=], %eax
call printf
jmp .L4
.cfi_endproc
.LFE0:
.size main, .-main
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.2) 5.4.0 20160609"
.section .note.GNU-stack,"",@progbits
我将详细介绍 HansPassant 的评论。
现代处理器都是超标量处理器和多核处理器。很容易理解多核处理器是什么——它有多个内核。另一方面,超标量需要更多的硬件知识。 This is a stackexchange question 解释了处理器是超标量的含义。超标量处理器在同一个内核中有许多功能单元,并且流水线化程度很高。这就是为什么可以在单个内核中同时分派和执行多条指令的原因。以下是处理器中的一些功能单元:整数 addition/subtraction、浮点乘法、浮点除法、整数乘法、整数除法。
我鼓励您 Google 更多关于超标量处理器的信息,并特别查找有关您的处理器的更多信息。
这与多核无关。内核不是 "ports".
每个时钟 4 个 NOP 是超标量/无序 CPU 的 issue/retirement 流水线宽度。 NOP 甚至不需要执行单元/执行端口(ALU 或加载或存储),因此您甚至不受整数执行单元数量的限制。甚至 Core2(英特尔的第一个 4 宽 x86 CPU)也可以 运行 每个时钟 4 个 NOP。
如您所料,这是一个指令级并行的例子。 NOP 当然没有输入依赖性。
在您的 Sandybridge CPU(每个内核有 3 个 ALU 执行单元)上,您可以 运行 每个时钟 3 个 ADD 和一个加载或存储指令,因为它的流水线宽度是 4 微指令。参见 Agner Fog's microarch pdf and other links in the x86 tag wiki。在独立的 ADD 指令流上,例如
add eax, eax
add ebx, ebx
add ecx, ecx
add edx, edx
...
您会在 SnB 上看到每个时钟吞吐量大约 3 个,在整数 ALU 执行端口上出现瓶颈。 Haswell 可以 运行 每个时钟 4 个 ADD,因为它有第 4 个 ALU 执行端口可以处理非向量整数操作(和分支)。
乱序 CPUs 通常具有比执行单元数量更宽的前端和 issue/retire 宽度。一旦有空闲的执行单元,就会有更多的指令被解码并准备好执行,这会增加它们的利用率。否则,如果执行由于串行依赖性而停滞或减慢,则乱序机器只能提前看到当前正在执行的内容。 (例如 add eax,eax
/ add eax,eax
需要第一个加法的输出作为第二个加法的输入,因此每个时钟只能 运行 在一个 insn。)
我写了一个简单的程序,在循环中执行一堆 NOP 指令,令我惊讶的是它每秒执行大约 10600000000 条指令,或大约 10Ghz,而我的 CPU 只有 2.2GHz。
这怎么可能? CPU 是将它们视为单个 mega-NOP,还是我只是发现了 "instruction level parallelism" 的含义?
什么是每秒指令数的更好衡量标准?执行添加指令仅达到 414900000/s,我报告的 bogomips 的十分之一 CPU:4390.03
C代码:
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#define ten(a) a a a a a a a a a a
#define hundred(a) ten(a) ten(a) ten(a) ten(a) ten(a) ten(a) ten(a) \
ten(a) ten(a) ten(a)
#define ITER 10000000
int main(void) {
uint64_t i=0;
uint64_t t=time(NULL);
while(1) {
for(int j=0; j<ITER;j++) {
hundred(asm volatile ("nop");)
}
i+=ITER*100;
printf("%lu/%lu\n", i, time(NULL)-t);
}
return 0;
}
编译的程序集:
.file "gbloopinc.c"
.section .rodata
.LC0:
.string "%lu/%lu\n"
.text
.globl main
.type main, @function
main:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq , %rsp
movq [=13=], -16(%rbp)
movl [=13=], %edi
call time
movq %rax, -8(%rbp)
.L4:
movl [=13=], -20(%rbp)
jmp .L2
.L3:
#APP
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
# 15 "gbloopinc.c" 1
nop
# 0 "" 2
#NO_APP
addl , -20(%rbp)
.L2:
cmpl 99999, -20(%rbp)
jle .L3
addq 00000000, -16(%rbp)
movl [=13=], %edi
call time
subq -8(%rbp), %rax
movq %rax, %rdx
movq -16(%rbp), %rax
movq %rax, %rsi
movl $.LC0, %edi
movl [=13=], %eax
call printf
jmp .L4
.cfi_endproc
.LFE0:
.size main, .-main
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.2) 5.4.0 20160609"
.section .note.GNU-stack,"",@progbits
我将详细介绍 HansPassant 的评论。
现代处理器都是超标量处理器和多核处理器。很容易理解多核处理器是什么——它有多个内核。另一方面,超标量需要更多的硬件知识。 This is a stackexchange question 解释了处理器是超标量的含义。超标量处理器在同一个内核中有许多功能单元,并且流水线化程度很高。这就是为什么可以在单个内核中同时分派和执行多条指令的原因。以下是处理器中的一些功能单元:整数 addition/subtraction、浮点乘法、浮点除法、整数乘法、整数除法。
我鼓励您 Google 更多关于超标量处理器的信息,并特别查找有关您的处理器的更多信息。
这与多核无关。内核不是 "ports".
每个时钟 4 个 NOP 是超标量/无序 CPU 的 issue/retirement 流水线宽度。 NOP 甚至不需要执行单元/执行端口(ALU 或加载或存储),因此您甚至不受整数执行单元数量的限制。甚至 Core2(英特尔的第一个 4 宽 x86 CPU)也可以 运行 每个时钟 4 个 NOP。
如您所料,这是一个指令级并行的例子。 NOP 当然没有输入依赖性。
在您的 Sandybridge CPU(每个内核有 3 个 ALU 执行单元)上,您可以 运行 每个时钟 3 个 ADD 和一个加载或存储指令,因为它的流水线宽度是 4 微指令。参见 Agner Fog's microarch pdf and other links in the x86 tag wiki。在独立的 ADD 指令流上,例如
add eax, eax
add ebx, ebx
add ecx, ecx
add edx, edx
...
您会在 SnB 上看到每个时钟吞吐量大约 3 个,在整数 ALU 执行端口上出现瓶颈。 Haswell 可以 运行 每个时钟 4 个 ADD,因为它有第 4 个 ALU 执行端口可以处理非向量整数操作(和分支)。
乱序 CPUs 通常具有比执行单元数量更宽的前端和 issue/retire 宽度。一旦有空闲的执行单元,就会有更多的指令被解码并准备好执行,这会增加它们的利用率。否则,如果执行由于串行依赖性而停滞或减慢,则乱序机器只能提前看到当前正在执行的内容。 (例如 add eax,eax
/ add eax,eax
需要第一个加法的输出作为第二个加法的输入,因此每个时钟只能 运行 在一个 insn。)