Is the L2 HW prefetcher really helpful?
I was profiling performance counters and the time to copy 512 KiB of data (twice the L2 cache size) on a Whiskey Lake i7-8565U, and ran into some misunderstanding about how the L2 hardware prefetcher works.
According to the Intel Manual Vol. 4 (MSR listing), bit 0 of MSR 0x1A4 controls the L2 hardware prefetcher (1 to disable).
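For reference, here is a minimal C sketch of toggling that bit through the /dev/cpu/N/msr interface (the same register the rdmsr commands below read; assumes the msr kernel module is loaded and root privileges). A read-modify-write preserves the other prefetcher-control bits in the register, which writing a bare constant would clobber:

/* toggle_l2_pf.c - hypothetical helper, not part of the benchmark below. */
#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <unistd.h>

int main(void){
    int fd = open("/dev/cpu/0/msr", O_RDWR);   /* per-CPU MSR device */
    if(fd < 0){ perror("open /dev/cpu/0/msr"); return 1; }
    uint64_t val;
    /* the file offset selects the MSR number */
    if(pread(fd, &val, sizeof val, 0x1A4) != sizeof val){ perror("pread"); return 1; }
    printf("MSR 0x1A4 was 0x%" PRIx64 "\n", val);
    val |= 1;   /* set bit 0: disable the L2 HW prefetcher (val &= ~1ULL re-enables) */
    if(pwrite(fd, &val, sizeof val, 0x1A4) != sizeof val){ perror("pwrite"); return 1; }
    close(fd);
    return 0;
}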
Consider the following benchmark:
memcopy.h:
void *avx_memcpy_forward_lsls(void *restrict, const void *restrict, size_t);
memcopy.S:
avx_memcpy_forward_lsls:
    shr     rdx, 0x3                        ; rdx = byte count / 8 = qword count
    xor     rcx, rcx                        ; rcx = qword index
avx_memcpy_forward_loop_lsls:
    vmovdqa ymm0, [rsi + 8*rcx]             ; load first 32-byte chunk
    vmovdqa [rdi + rcx*8], ymm0             ; store it
    vmovdqa ymm1, [rsi + 8*rcx + 0x20]      ; load second 32-byte chunk
    vmovdqa [rdi + rcx*8 + 0x20], ymm1      ; store it
    add     rcx, 0x08                       ; advance 8 qwords = 64 bytes per iteration
    cmp     rdx, rcx
    ja      avx_memcpy_forward_loop_lsls
    ret
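For readers who prefer intrinsics, here is a rough C equivalent of the loop above (a sketch for illustration only; this hypothetical avx_memcpy_forward_intrin is not what was benchmarked):

#include <immintrin.h>
#include <stddef.h>

void *avx_memcpy_forward_intrin(void *restrict dst, const void *restrict src, size_t sz){
    char *d = dst;
    const char *s = src;
    /* copy two aligned 32-byte vectors (64 bytes) per iteration, like the asm */
    for(size_t i = 0; i < sz; i += 64){
        __m256i lo = _mm256_load_si256((const __m256i *)(s + i));
        _mm256_store_si256((__m256i *)(d + i), lo);
        __m256i hi = _mm256_load_si256((const __m256i *)(s + i + 32));
        _mm256_store_si256((__m256i *)(d + i + 32), hi);
    }
    return dst;
}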
main.c:
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
#include <x86intrin.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include "memcopy.h"

#define ITERATIONS 1000
#define BUF_SIZE (512 * 1024)  /* parenthesized so the macro expands safely */

_Alignas(64) char src[BUF_SIZE];
_Alignas(64) char dest[BUF_SIZE];

static void __run_benchmark(unsigned runs, unsigned run_iterations,
        void *(*fn)(void *, const void*, size_t), void *dest, const void* src, size_t sz);

#define run_benchmark(runs, run_iterations, fn, dest, src, sz) \
    do{\
        printf("Benchmarking " #fn "\n");\
        __run_benchmark(runs, run_iterations, fn, dest, src, sz);\
    }while(0)

int main(void){
    int fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0 || read(fd, src, sizeof src) != sizeof src)
        return 1;  /* fill src with random bytes so the copy isn't all zero pages */
    run_benchmark(20, ITERATIONS, avx_memcpy_forward_lsls, dest, src, BUF_SIZE);
}
static inline void benchmark_copy_function(unsigned iterations, void *(*fn)(void *, const void *, size_t),
        void *restrict dest, const void *restrict src, size_t sz){
    while(iterations --> 0){
        /* 32 back-to-back calls per iteration to amortize the outer-loop overhead */
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
        fn(dest, src, sz);
    }
}

static void __run_benchmark(unsigned runs, unsigned run_iterations,
        void *(*fn)(void *, const void*, size_t), void *dest, const void* src, size_t sz){
    unsigned current_run = 1;
    while(current_run <= runs){
        benchmark_copy_function(run_iterations, fn, dest, src, sz);
        printf("Run %d finished\n", current_run);
        current_run++;
    }
}
Consider two runs of the compiled main.c.
I.
MSR:
$ sudo rdmsr -p 0 0x1A4
0
Run:
$ taskset -c 0 sudo ../profile.sh ./bin
Performance counter stats for './bin':
10 486 164 071 L1-dcache-loads (12,13%)
10 461 354 384 L1-dcache-load-misses # 99,76% of all L1-dcache hits (12,05%)
10 481 930 413 L1-dcache-stores (12,05%)
10 461 136 686 l1d.replacement (12,12%)
31 466 394 422 l1d_pend_miss.fb_full (12,11%)
211 853 643 294 l1d_pend_miss.pending (12,09%)
1 759 204 317 LLC-loads (12,16%)
31 007 LLC-load-misses # 0,00% of all LL-cache hits (12,16%)
3 154 901 630 LLC-stores (6,19%)
15 867 315 545 l2_rqsts.all_pf (9,22%)
0 sw_prefetch_access.t1_t2 (12,22%)
1 393 306 l2_lines_out.useless_hwpf (12,16%)
3 549 170 919 l2_rqsts.pf_hit (12,09%)
12 356 247 643 l2_rqsts.pf_miss (12,06%)
0 load_hit_pre.sw_pf (12,09%)
3 159 712 695 l2_rqsts.rfo_hit (12,06%)
1 207 642 335 l2_rqsts.rfo_miss (12,02%)
4 366 526 618 l2_rqsts.all_rfo (12,06%)
5 240 013 774 offcore_requests.all_data_rd (12,06%)
19 936 657 118 offcore_requests.all_requests (12,09%)
1 761 660 763 offcore_response.demand_data_rd.any_response (12,12%)
287 044 397 bus-cycles (12,15%)
36 816 767 779 resource_stalls.any (12,15%)
36 553 997 653 resource_stalls.sb (12,15%)
38 035 066 210 uops_retired.stall_cycles (12,12%)
24 766 225 119 uops_executed.stall_cycles (12,09%)
40 478 455 041 uops_issued.stall_cycles (12,05%)
24 497 256 548 cycle_activity.stalls_l1d_miss (12,02%)
12 611 038 018 cycle_activity.stalls_l2_miss (12,09%)
10 228 869 cycle_activity.stalls_l3_miss (12,12%)
24 707 614 483 cycle_activity.stalls_mem_any (12,22%)
24 776 110 104 cycle_activity.stalls_total (12,22%)
48 914 478 241 cycles (12,19%)
12,155774555 seconds time elapsed
11,984577000 seconds user
0,015984000 seconds sys
II.
MSR:
$ sudo rdmsr -p 0 0x1A4
1
Run:
$ taskset -c 0 sudo ../profile.sh ./bin
Performance counter stats for './bin':
10 508 027 832 L1-dcache-loads (12,05%)
10 463 643 206 L1-dcache-load-misses # 99,58% of all L1-dcache hits (12,09%)
10 481 296 605 L1-dcache-stores (12,12%)
10 444 854 468 l1d.replacement (12,15%)
29 287 445 744 l1d_pend_miss.fb_full (12,17%)
205 569 630 707 l1d_pend_miss.pending (12,17%)
5 103 444 329 LLC-loads (12,17%)
33 406 LLC-load-misses # 0,00% of all LL-cache hits (12,17%)
9 567 917 742 LLC-stores (6,08%)
1 157 237 980 l2_rqsts.all_pf (9,12%)
0 sw_prefetch_access.t1_t2 (12,17%)
301 471 l2_lines_out.useless_hwpf (12,17%)
218 528 985 l2_rqsts.pf_hit (12,17%)
938 735 722 l2_rqsts.pf_miss (12,17%)
0 load_hit_pre.sw_pf (12,17%)
4 096 281 l2_rqsts.rfo_hit (12,17%)
4 972 640 931 l2_rqsts.rfo_miss (12,17%)
4 976 006 805 l2_rqsts.all_rfo (12,17%)
5 175 544 191 offcore_requests.all_data_rd (12,17%)
15 772 124 082 offcore_requests.all_requests (12,17%)
5 120 635 892 offcore_response.demand_data_rd.any_response (12,17%)
292 980 395 bus-cycles (12,17%)
37 592 020 151 resource_stalls.any (12,14%)
37 317 091 982 resource_stalls.sb (12,11%)
38 121 826 730 uops_retired.stall_cycles (12,08%)
25 430 699 605 uops_executed.stall_cycles (12,04%)
41 416 190 037 uops_issued.stall_cycles (12,04%)
25 326 579 070 cycle_activity.stalls_l1d_miss (12,04%)
25 019 148 253 cycle_activity.stalls_l2_miss (12,03%)
7 384 770 cycle_activity.stalls_l3_miss (12,03%)
25 442 709 033 cycle_activity.stalls_mem_any (12,03%)
25 406 897 956 cycle_activity.stalls_total (12,03%)
49 877 044 086 cycles (12,03%)
12,231406658 seconds time elapsed
12,226386000 seconds user
0,004000000 seconds sys
I noticed the counters:
12 611 038 018 cycle_activity.stalls_l2_miss
v/s
25 019 148 253 cycle_activity.stalls_l2_miss
which suggests that the MSR disabling the L2 hardware prefetcher is indeed taking effect. Other L2/LLC-related counters also differ a lot, and the differences are reproducible across runs. The problem is that total time and cycles are almost identical:
48 914 478 241 cycles
v/s
49 877 044 086 cycles
12,155774555 seconds time elapsed
v/s
12,231406658 seconds time elapsed
Questions:
Are the L2 misses hidden by some other performance limiter?
If so, can you suggest which counters to look at to understand that?
Yes, the L2 streamer is really helpful a lot of the time.
memcpy doesn't have any computational latency to hide, so I guess it can afford to let OoO exec resources (ROB size) handle the extra load latency you get from more L2 misses, at least in this case where you get all L3 hits using a medium-size working set (1 MiB: src plus dest) that fits in L3, with no prefetching needed for the L3 hits to happen.
And the only instructions are loads/stores (plus loop overhead), so the OoO window includes demand loads pretty far ahead.
IDK if the L2 spatial prefetcher and the L1d prefetcher are helping any here.
A prediction to test this hypothesis: make your arrays bigger so you get L3 misses; you will probably see a difference in total time once OoO exec isn't enough to hide the load latency of going all the way to DRAM. HW prefetch triggering farther ahead can help some.
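A minimal sketch of that change to the benchmark above (the 32 MiB size is an assumption chosen to comfortably exceed a typical 6-8 MiB client L3; the hypothetical names big_src/big_dest and aligned_alloc replace the static arrays, since multi-MiB static buffers needlessly bloat the binary):

#include <stdlib.h>
#include <string.h>

#define BIG_BUF_SIZE (32 * 1024 * 1024)  /* well past L3, so the copies miss to DRAM */

int main(void){
    /* aligned_alloc keeps the 64-byte alignment that vmovdqa requires */
    char *big_src  = aligned_alloc(64, BIG_BUF_SIZE);
    char *big_dest = aligned_alloc(64, BIG_BUF_SIZE);
    if(!big_src || !big_dest) return 1;
    memset(big_src, 1, BIG_BUF_SIZE);  /* touch the pages so the timed copies don't page-fault */
    /* ...then call the same run_benchmark(20, ITERATIONS, avx_memcpy_forward_lsls,
       big_dest, big_src, BIG_BUF_SIZE) as in main.c above... */
    free(big_src);
    free(big_dest);
    return 0;
}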
The other big benefit of HW prefetching comes when it can keep up with your computation, so you get L2 hits. (In a loop whose computation has a medium-length but not loop-carried dependency chain.)
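As a hypothetical illustration of that kind of loop (per-element work whose dependency chain restarts each iteration rather than being carried across iterations, so the prefetcher has time to stay ahead of the demand accesses):

#include <stddef.h>

/* Scale each element and accumulate: the mul+add chain is per-element,
   so the only loop-carried dependency is the cheap final addition. */
double scale_and_sum(const double *a, double *out, size_t n){
    double total = 0.0;
    for(size_t i = 0; i < n; i++){
        double t = a[i] * 3.0 + 1.0;  /* medium-length, not loop-carried */
        out[i] = t;
        total += t;
    }
    return total;
}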
Demand loads plus OoO exec can do a lot as far as using the available (single-threaded) memory bandwidth goes, when there's no other pressure on ROB capacity.
Also note that on Intel CPUs, every cache miss can cost a back-end replay (from the RS/scheduler) of dependent uops, one each for L1d and L2 misses when the data is expected to arrive. After that, apparently the core optimistically spams dispatches of dependent uops while waiting for the data to arrive from L3.
(See https://chat.whosebug.com/rooms/206639/discussion-on-question-by-beeonrope-are-load-ops-deallocated-from-the-rs-when-th and "Are load ops deallocated from the RS when they dispatch, complete or some other time?")
It's not the cache-miss load itself that replays; in this case it would be the store instructions, more specifically the store-data uop for port 4. That doesn't matter here: with 32-byte stores and a bottleneck on L3 bandwidth, we're nowhere near one port-4 uop per clock.
Yes, the L2 HW prefetcher is very helpful!
For example, below are results from my machine (i7-6700HQ) running tinymembench. The first column of results has all prefetchers on; the second column has the L2 streamer off (but all other prefetchers still on).
This test uses 32 MiB source and destination buffers, which are much larger than the L3 on my machine, so it will be testing mostly misses to DRAM.
==========================================================================
== Memory bandwidth tests ==
== ==
== Note 1: 1MB = 1000000 bytes ==
== Note 2: Results for 'copy' tests show how many bytes can be ==
== copied per second (adding together read and writen ==
== bytes would have provided twice higher numbers) ==
== Note 3: 2-pass copy means that we are using a small temporary buffer ==
== to first fetch data into it, and only then write it to the ==
== destination (source -> L1 cache, L1 cache -> destination) ==
== Note 4: If sample standard deviation exceeds 0.1%, it is shown in ==
== brackets ==
==========================================================================
L2 streamer ON OFF
C copy backwards : 7962.4 MB/s 4430.5 MB/s
C copy backwards (32 byte blocks) : 7993.5 MB/s 4467.0 MB/s
C copy backwards (64 byte blocks) : 7989.9 MB/s 4438.0 MB/s
C copy : 8503.1 MB/s 4466.6 MB/s
C copy prefetched (32 bytes step) : 8729.2 MB/s 4958.4 MB/s
C copy prefetched (64 bytes step) : 8730.7 MB/s 4958.4 MB/s
C 2-pass copy : 6171.2 MB/s 3368.7 MB/s
C 2-pass copy prefetched (32 bytes step) : 6193.1 MB/s 4104.2 MB/s
C 2-pass copy prefetched (64 bytes step) : 6198.8 MB/s 4101.6 MB/s
C fill : 13372.4 MB/s 10610.5 MB/s
C fill (shuffle within 16 byte blocks) : 13379.4 MB/s 10547.5 MB/s
C fill (shuffle within 32 byte blocks) : 13365.8 MB/s 10636.9 MB/s
C fill (shuffle within 64 byte blocks) : 13588.7 MB/s 10588.3 MB/s
-
standard memcpy : 11550.7 MB/s 8216.3 MB/s
standard memset : 23188.7 MB/s 22686.8 MB/s
-
MOVSB copy : 9458.4 MB/s 6523.7 MB/s
MOVSD copy : 9474.5 MB/s 6510.7 MB/s
STOSB fill : 23329.0 MB/s 22901.5 MB/s
SSE2 copy : 9073.1 MB/s 4970.3 MB/s
SSE2 nontemporal copy : 12647.1 MB/s 7492.5 MB/s
SSE2 copy prefetched (32 bytes step) : 9106.0 MB/s 5069.8 MB/s
SSE2 copy prefetched (64 bytes step) : 9113.5 MB/s 5063.1 MB/s
SSE2 nontemporal copy prefetched (32 bytes step) : 11770.8 MB/s 7453.4 MB/s
SSE2 nontemporal copy prefetched (64 bytes step) : 11937.1 MB/s 7712.1 MB/s
SSE2 2-pass copy : 7092.8 MB/s 4355.2 MB/s
SSE2 2-pass copy prefetched (32 bytes step) : 7001.4 MB/s 4585.1 MB/s
SSE2 2-pass copy prefetched (64 bytes step) : 7055.1 MB/s 4557.9 MB/s
SSE2 2-pass nontemporal copy : 5043.2 MB/s 3263.3 MB/s
SSE2 fill : 14087.3 MB/s 10947.1 MB/s
SSE2 nontemporal fill : 33134.5 MB/s 32774.3 MB/s
In these tests having the L2 streamer is never slower and is often nearly twice as fast.
In general, you might notice the following patterns in the results:
- Copies generally seem to be more affected than fills.
- The standard memset and STOSB fill (these boil down to the same thing on this platform) are the least affected, with the prefetched result being only a few % faster than without.
- Standard memcpy is probably the only copy here that uses 32-byte AVX instructions, and it is among the least affected of the copies - but prefetching on is still ~40% faster than off.
I also tried turning on and off the other three prefetchers, but they generally had almost no measurable effect for this benchmark.