为什么只有在访问大小为 1 KiB 或 2 KiB 时,L2 硬件预取器才会让性能变差?

Why does L2 hardware prefetcher perform worse with only 1 KiB or 2 KiB access size?

我有一个简单的多线程程序:各线程对给定文件(已在内存中)执行随机读取,文件在各线程之间平均分配。线程把文件内容读入缓冲区并设置一个值。它实际上是一个用来测试内存带宽的程序。程序如下:

#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <stdbool.h>
#include <ctype.h>
#include <inttypes.h>
#include <pthread.h>
#include <assert.h>
#include <time.h>

#define NS_IN_SECOND 1000000000
/**
 * Return the current CLOCK_REALTIME reading expressed in nanoseconds.
 *
 * @return nanoseconds since the clock's epoch, or 0 if clock_gettime()
 *         fails (previously the function fell off its end in that case,
 *         leaving the result undefined).
 */
uint64_t nano_time(void) {
    struct timespec ts;

    if (clock_gettime(CLOCK_REALTIME, &ts) != 0)
        return 0;

    /* Widen before multiplying so the product is computed in 64 bits
     * regardless of the width of time_t. */
    return (uint64_t)ts.tv_sec * NS_IN_SECOND + (uint64_t)ts.tv_nsec;
}

// Prototype for an AVX-512 memmove variant; its only uses in this file are
// the commented-out calls inside mmap_test(), which normally calls memcpy().
#include <stdint.h>
void *__memmove_chk_avx512_no_vzeroupper(void *dest, void *src, size_t s);

/**
* Preparing a test file.
*
* To create a 4 GB file (this allocates space on disk):
* $ dd < /dev/zero bs=1048576 count=4096 > testfile
*
* To create a 100 GiB file:
* dd if=/dev/zero of=bigmmaptest bs=1M count=102400
*
* To clear the page cache:
* $ sync; echo 1 > /proc/sys/vm/drop_caches
*/
// Uncomment to compile in the per-block latency sampling used by mmap_test().
//#define SAMPLE_LATENCY 1
// Divisor used to convert byte counts to GB in the throughput printouts;
// also the multiplier applied to the -s argument.
#define BYTES_IN_GB (1024*1024*1024)
// Default block size: used as the read/write size and also as the stride
// when iterating over the mmap'ed file.
#define DEFAULT_BLOCK_SIZE 4096 //8192
// Multiplier used to convert a nanosecond interval to seconds in the
// throughput printouts.
#define NANOSECONDS_IN_SECOND 1000000000

// File used when -f/--file is not given.
const char DEFAULT_NAME[] = "/mnt/tmp/mmaptest";

/*
 * Print a printf-style message to stdout and terminate the process at once
 * via _exit(-1). Wrapped in do { } while (0) so it behaves as a single
 * statement after an unbraced if.
 */
#define EXIT_MSG(...)           \
    do {                        \
        printf(__VA_ARGS__);   \
        _exit(-1);              \
    } while (0)


/*
 * mmap-based tests. read_mmap_test() and write_mmap_test() forward to
 * mmap_test() with optype READ or WRITE. `begin` and `end` receive the
 * nano_time() readings taken around the copy loop.
 */
uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
                    off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize, char* buf,
                    off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *buf,
    char optype, off_t *offsets, uint64_t *begin, uint64_t *end);

/*
 * pread()/pwrite()-based tests. read_syscall_test() and write_syscall_test()
 * forward to syscall_test() with optype READ or WRITE.
 */
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize, 
                    off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
                     off_t *offsets, uint64_t *begin, uint64_t *end);
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize, 
        char optype, off_t *offsets, uint64_t *begin, uint64_t *end);
        
/* Helpers: file size lookup, usage text, file mapping, thread entry point. */
size_t get_filesize(const char* filename);
void print_help_message(const char *progname);
char*    map_buffer(int fd, size_t size);
void    *run_tests(void *);

/* Set to 1 by --silent; gates the informational printf calls in main() and the tests. */
static int silent = 0;

/*
 * Per-thread argument block. main() fills one entry per worker before
 * pthread_create(); run_tests() reads it and the selected test writes
 * start_time/end_time back through pointers into the same entry.
 */
typedef struct {
    int tid;                /* thread index assigned by main() */
    int fd;                 /* descriptor of the file under test */
    char *mapped_buffer;    /* mapping from map_buffer(), or NULL if no mmap test was requested */
    int read_mmap;          /* test selectors; run_tests() checks them in this */
    int read_syscall;       /* order: read_mmap, read_syscall, write_mmap,     */
    int write_mmap;         /* write_syscall, and runs the first non-zero one  */
    int write_syscall;
    off_t *offsets;         /* start of this thread's slice of the offset table */
    size_t block_size;      /* bytes transferred per operation */
    size_t chunk_size;      /* bytes this thread covers (filesize / numthreads) */
    int retval;             /* NOTE(review): never written or read in the visible code */
    uint64_t start_time;    /* nano_time() reading taken before the test loop */
    uint64_t end_time;      /* nano_time() reading taken after the test loop */
} threadargs_t;


size_t filesize;
int main(int argc, char **argv) {
    char *fname = (char*) DEFAULT_NAME;
    char *mapped_buffer = NULL;
    int c, fd, i, flags = O_RDWR, numthreads = 1, ret, option_index;
    static int randomaccess = 0,
        read_mmap = 0, read_syscall = 0,
        write_mmap = 0, write_syscall = 0,
        mixed_mmap = 0, write_tr = 0;
    off_t *offsets = 0;
    size_t block_size = DEFAULT_BLOCK_SIZE, numblocks,
     new_file_size = 0;
     uint64_t min_start_time, max_end_time = 0, retval;
     // permissions
    uint64_t mode = S_IRWXU | S_IRWXG;

    pthread_t *threads;
    threadargs_t *threadargs;

    static struct option long_options[] = 
    {
        // Options set a flag
        {"randomaccess", no_argument, &randomaccess, 1},
        {"readmmap", no_argument, &read_mmap, 1},
        {"readsyscall", no_argument, &read_syscall, 1},
        {"silent", no_argument, &silent, 1},
        {"writemmap", no_argument, &write_mmap, 1},
        {"writesyscall", no_argument, &write_syscall, 1},
        {"mixedmmap", no_argument, &mixed_mmap, 1},
        // Options take an argument
        {"block", required_argument, 0, 'b'},
        {"file", required_argument, 0, 'f'},
        {"help", no_argument, 0, 'h'},
        {"size", no_argument, 0, 's'},
        {"threads", required_argument, 0, 't'},
        {"writethreads", no_argument, 0, 'w'},
        {0, 0, 0, 0}
    };

    //read operations
    while(1) {
        c = getopt_long(argc, argv, "b:f:h:s:t:w:",
                long_options, &option_index);

        // is end of the option
        if (c == -1)
            break;
        
        switch(c)
        {
            case 0:
                break;
            case 'b':
                block_size = atoi(optarg);
                break;
            case 'f':
                fname = optarg;
                break;
            case 'h':
                print_help_message(argv[0]);
                _exit(0);
            case 's':
                new_file_size = (size_t)(atoi(optarg)) * BYTES_IN_GB;
                break;
            case 't':
                numthreads = (int) (atoi(optarg));
                break;
            case 'w':
                write_tr = atoi(optarg);
                break;
            default:
                break;
        }
    }

    if(!silent){
        printf("PID: %d\n", getpid());
        printf("Using file %s \n", fname);
    }
    if ((filesize = get_filesize(fname)) == -1) {
        if (read_mmap || read_syscall) {
            printf("Cannot obtain file size for %s: %s"
                   "File must exist prior to running read tests.\n",
                   fname, strerror(errno));
            _exit(-1);
        }
        else
            filesize = new_file_size;
    }

    fd = open((const char*)fname, flags, mode);
    if(fd <0) {
        printf("Clould not open/create file %s: %s\n",
            fname, strerror(errno));
            _exit(-1);
    }

    if(block_size < 0 || block_size > filesize){
        printf("Invalid block size: %zu for file of size "
        "%zu. Block size must be greater than 0 and no"
        "greater than the file size.\n",
        block_size, filesize);
        _exit(-1);
    }

    /* 
    * Generate random block number for random file access.
    * Sequential for sequential access
    */
   numblocks = filesize/block_size;
   if(filesize % block_size > 0)
        numblocks++;

    offsets = (off_t *) malloc(numblocks * sizeof(off_t));
    if(offsets == 0){
        printf("Failed to allocate memory: %s\n", strerror(errno));
        _exit(-1);
    }
    for (uint64_t i = 0; i < numblocks; i++)
        if(randomaccess)
            offsets[i] = ((int)random() % numblocks) * block_size;
        else
            offsets[i] = i*block_size;
    if (numblocks % numthreads != 0)
        EXIT_MSG("We have %" PRIu64 " blocks and %d threads. "
                "Threads must evenly divide blocks. "
                "Please fix the args.\n",
                (uint_least64_t)numblocks, numthreads);

    if( read_mmap || write_mmap || mixed_mmap)
        assert((mapped_buffer = map_buffer(fd, filesize)) != NULL);

    threads = (pthread_t*)malloc(numthreads * sizeof(pthread_t));
    threadargs = 
            (threadargs_t*)malloc(numthreads * sizeof(threadargs_t));
    
    if (threads == NULL || threadargs == NULL)
        EXIT_MSG("Could not allocate thread array for %d threads.\n", numthreads);
    
    for (i = 0; i < numthreads; i++) {
        if(mixed_mmap){
            if (i < write_tr) {
                write_mmap = 1;
            } else {
                read_mmap = 1;
            }
        }
        threadargs[i].fd = fd;
        threadargs[i].tid = i;
        threadargs[i].block_size = block_size;
        threadargs[i].chunk_size = filesize/numthreads;
        threadargs[i].mapped_buffer = mapped_buffer;
        threadargs[i].offsets = &offsets[numblocks/numthreads * i];
        threadargs[i].read_mmap = read_mmap;
        threadargs[i].read_syscall = read_syscall;
        threadargs[i].write_mmap = write_mmap;
        threadargs[i].write_syscall = write_syscall;
        int ret = pthread_create(&threads[i], NULL, run_tests, &threadargs[i]);
        if (ret!=0)
            EXIT_MSG("pthread_create for %dth thread failed: %s\n",
                i, strerror(errno));
    }

    for (i = 0; i< numthreads; i++){
        ret = pthread_join(threads[i], NULL);
        if (ret !=0)
            EXIT_MSG("Thread %d failed in join: %s\n", 
            i, strerror(errno));
    }


    // for mixed mode determine read and write aggregate b/w.
    if(mixed_mmap) {
        // Write b/w
        min_start_time = threadargs[0].start_time;
        max_end_time = 0;
        // Since tid 0 to write_tr-1 did writes, find it's min and max.
        for(i=0; i < write_tr; i++){
            min_start_time = (threadargs[i].start_time < min_start_time)?
                threadargs[i].start_time:min_start_time;
            max_end_time = (threadargs[i].end_time > max_end_time)?
                threadargs[i].end_time:max_end_time;
        }
        printf("Write: %.2f\n", 
            (double)write_tr*(filesize/numthreads)/(double)(max_end_time-min_start_time)
            * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
        
        // Read b/w
        min_start_time = threadargs[write_tr].start_time;
        max_end_time = 0;
        for(i=write_tr; i < numthreads; i++){
            min_start_time = (threadargs[i].start_time < min_start_time)?
                threadargs[i].start_time:min_start_time;
            max_end_time = (threadargs[i].end_time > max_end_time)?
                threadargs[i].end_time:max_end_time;
       } 
        printf("Read: %.2f\n", 
            (double)(numthreads-write_tr)*(filesize/numthreads)/(double)(max_end_time-min_start_time)
            * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
    }

    /**
     * For total run time. Find the smallest start time
     * and largest end time across all threads.
     */
    min_start_time = threadargs[0].start_time;
    max_end_time = 0;
    for (i=0; i< numthreads; i++){
        min_start_time = (threadargs[i].start_time < min_start_time)?
            threadargs[i].start_time:min_start_time;
        max_end_time = (threadargs[i].end_time > max_end_time)?
            threadargs[i].end_time:max_end_time;
    }

    printf("%.2f\n", 
            (double)filesize/(double)(max_end_time-min_start_time)
            * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
    

    munmap(mapped_buffer, filesize);
    close(fd);

}

void * run_tests(void *args) {
    uint64_t retval;
    threadargs_t t = *(threadargs_t*)args;

    if(t.read_mmap) {
        if(!silent)
            printf("Running read mmap test:\n");
        retval = read_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size, 
                    t.mapped_buffer, t.offsets,
                    &((threadargs_t*)args)->start_time,
                    &((threadargs_t*)args)->end_time);
    }
    else if(t.read_syscall) {
        if(!silent)
            printf("Running read syscall test:\n");
        retval = read_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size, 
                    t.offsets,
                    &((threadargs_t*)args)->start_time,
                    &((threadargs_t*)args)->end_time);
    }
    else if(t.write_mmap) {
        if(!silent)
            printf("Running write mmap test:\n");
        retval = write_mmap_test(t.fd, t.tid, t.block_size, t.chunk_size, 
                    t.mapped_buffer, t.offsets,
                    &((threadargs_t*)args)->start_time,
                    &((threadargs_t*)args)->end_time);
    }
    else if(t.write_syscall) {
        if(!silent)
            printf("Running write syscall test:\n");
        retval = write_syscall_test(t.fd, t.tid, t.block_size, t.chunk_size, 
                    t.offsets,
                    &((threadargs_t*)args)->start_time,
                    &((threadargs_t*)args)->end_time);
    }
    return (void*) 0;
}

/* Operation codes passed as `optype` to syscall_test() and mmap_test(). */
#define READ 1
#define WRITE 2

/**
 * SYSCALL section.
 *
 * read_syscall_test() and write_syscall_test() are thin wrappers that call
 * syscall_test() with optype READ or WRITE; every other argument and the
 * return value are passed through unchanged.
 */
uint64_t read_syscall_test(int fd, int tid, size_t block_size, size_t filesize, 
                off_t *offsets, uint64_t *begin, uint64_t *end) {
            return syscall_test(fd, tid, block_size, filesize, READ, offsets,
                        begin, end);
}

/* Write counterpart of read_syscall_test(): forwards with optype WRITE. */
uint64_t write_syscall_test(int fd, int tid, size_t block_size, size_t filesize,
                off_t *offsets, uint64_t *begin, uint64_t *end) {
            return syscall_test(fd, tid, block_size, filesize, WRITE, offsets,
                        begin, end);
}

/**
 * Time a sequence of pread() or pwrite() calls of `block_size` bytes each.
 *
 * One call is issued per entry of `offsets`, in order, until
 * `filesize` bytes' worth of blocks have been issued, a call transfers
 * zero bytes, or (for writes) exactly `filesize` bytes have been written.
 *
 * @param fd         descriptor to read from / write to
 * @param tid        thread id, used only in the printed summary
 * @param block_size bytes requested per call
 * @param filesize   number of bytes this caller is responsible for
 * @param optype     READ or WRITE; any other value performs no I/O
 * @param offsets    file offsets, one per block
 * @param begin      receives the nano_time() reading taken before the loop
 * @param end        receives the nano_time() reading taken after the loop
 * @return a checksum-like token (sum of the first buffer byte after each
 *         transfer), or (uint64_t)-1 on allocation or I/O failure, in
 *         which case *begin and *end are left untouched.
 */
uint64_t syscall_test(int fd, int tid, size_t block_size, size_t filesize, 
        char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {

    bool done = false;
    char *buffer = NULL;
    size_t next_block = 0;
    size_t total_bytes_transferred = 0;
    uint64_t begin_time, end_time, ret_token = 0;

    buffer = malloc(block_size);
    if(buffer == NULL) {
        printf("Failed to allocate memory: %s\n", strerror(errno));
        return -1;
    }

    memset(buffer, 0, block_size);

    begin_time= nano_time();

    while(!done) {
        /* pread/pwrite return ssize_t; keep it signed so that the error
         * value -1 is tested directly rather than through a wrapped
         * unsigned comparison. */
        ssize_t bytes_transferred = 0;

        if(optype == READ)
            bytes_transferred = pread(fd, buffer, block_size, offsets[next_block++]);
        else if (optype == WRITE)
            bytes_transferred = pwrite(fd, buffer, block_size, offsets[next_block++]);

        if (bytes_transferred == 0)
            done = true;
        else if(bytes_transferred < 0){
            printf("Failed to IO: %s\n", strerror(errno));
            free(buffer);
            return -1;
        }
        else {
            total_bytes_transferred += (size_t)bytes_transferred;

            if (optype == WRITE && total_bytes_transferred == filesize)
                done = true;
            
            // Touch the data so the transfer cannot be treated as unused.
            ret_token += buffer[0];
        }
        if (next_block*block_size >= filesize)
            done = true;
    }

    end_time = nano_time();

    if(!silent){
        printf("%s: %" PRIu64 " bytes transferred in %" PRIu64 ""
        " ns.\n", (optype == READ)?"read-syscall":"write-syscall",
        (uint_least64_t)total_bytes_transferred, (end_time-begin_time));
        // Throughput in GB/s
        printf("(tid %d) %.2f\n", tid,
            (double)filesize/(double)(end_time-begin_time)
            * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
    }
    
    free(buffer);

    *begin = begin_time;
    *end = end_time;
    return ret_token;
}

/**
 * MMAP tests.
 *
 * read_mmap_test() and write_mmap_test() are thin wrappers that call
 * mmap_test() with optype READ or WRITE; every other argument and the
 * return value are passed through unchanged.
 */

uint64_t read_mmap_test(int fd, int tid, size_t block_size, size_t filesize, 
            char *buf, off_t *offsets, uint64_t *begin, uint64_t *end) {
    return mmap_test(fd, tid, block_size, filesize, buf, READ, offsets, begin, end);
}

/* Write counterpart of read_mmap_test(): forwards with optype WRITE. */
uint64_t write_mmap_test(int fd, int tid, size_t block_size, size_t filesize, 
            char *buf, off_t *offsets, uint64_t *begin, uint64_t *end){
    return mmap_test(fd, tid, block_size, filesize, buf, WRITE, offsets, begin, end);
}
// Optional latency sampling for mmap_test().
//
// When SAMPLE_LATENCY is defined non-zero, BEGIN_LAT_SAMPLE / END_LAT_SAMPLE
// bracket one copy and record its duration. The macros rely on names that
// must exist at the expansion site: i, block_size, num_samples,
// lat_begin_time, lat_end_time and latency_samples.
// Otherwise both macros expand to (almost) nothing.
#if SAMPLE_LATENCY
// Take the start timestamp while fewer than MAX_LAT_SAMPLES samples have
// been recorded and i is a multiple of LAT_SAMPL_INTERVAL.
#define BEGIN_LAT_SAMPLE                                 \
    if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0)   \
        lat_begin_time = nano_time();

// Under the same condition, take the end timestamp, store the elapsed time
// in slot (i / LAT_SAMPL_INTERVAL) % MAX_LAT_SAMPLES and count the sample.
#define END_LAT_SAMPLE                                                  \
    if (num_samples < MAX_LAT_SAMPLES && i%LAT_SAMPL_INTERVAL == 0) {                \
    lat_end_time = nano_time();                                         \
    latency_samples[i/LAT_SAMPL_INTERVAL % MAX_LAT_SAMPLES] =           \
        lat_end_time - lat_begin_time;                                  \
    num_samples++;                                                      \
    }

// Capacity of the latency_samples array.
#define MAX_LAT_SAMPLES 50
//#define LAT_SAMPL_INTERVAL (1000*1048576)
// Sampling interval, expressed in the same units as the loop variable i.
#define LAT_SAMPL_INTERVAL block_size

#else

// Sampling disabled: BEGIN_LAT_SAMPLE is an empty statement and
// END_LAT_SAMPLE expands to nothing.
#define BEGIN_LAT_SAMPLE ;
#define END_LAT_SAMPLE

#endif

/**
 * Time block-sized memcpy() transfers between a private buffer and the
 * mapped file.
 *
 * For each `block_size` step through `filesize` bytes, the next entry of
 * `offsets` selects where in `mapped_buffer` the block is read from (READ)
 * or written to (WRITE).
 *
 * @param fd            unused; kept for signature parity with syscall_test()
 * @param tid           thread id, used only in the printed summary
 * @param block_size    bytes copied per iteration; must be non-zero
 * @param filesize      number of bytes this caller is responsible for
 * @param mapped_buffer base address of the mapping
 * @param optype        READ or WRITE; any other value copies nothing
 * @param offsets       byte offsets into mapped_buffer, one per block
 * @param begin         receives the nano_time() reading taken before the loop
 * @param end           receives the nano_time() reading taken after the loop
 * @return a checksum-like token built from one byte per copied block, or
 *         (uint64_t)-1 on invalid block size or allocation failure, in
 *         which case *begin and *end are left untouched.
 */
uint64_t mmap_test(int fd, int tid, size_t block_size, size_t filesize, char *mapped_buffer, 
                char optype, off_t *offsets, uint64_t *begin, uint64_t *end) {
    
    char *buffer = NULL;
    uint64_t i;
    uint64_t begin_time, end_time, ret_token = 0;

#if SAMPLE_LATENCY
    uint64_t lat_begin_time, lat_end_time;
    size_t latency_samples[MAX_LAT_SAMPLES];
    int num_samples = 0;

    memset((void*)latency_samples, 0, sizeof(latency_samples));
#endif

    (void)fd;

    /* A zero stride would never advance the loop below. */
    if(block_size == 0) {
        printf("Invalid block size: 0\n");
        return -1;
    }

    buffer = malloc(block_size);
    if(buffer == NULL) {
        printf("Failed to allocate memory: %s\n", strerror(errno));
        return -1;
    }
    memset(buffer, 1, block_size);

    begin_time = nano_time();
    for(i=0; i<filesize; i+=block_size){
        off_t offset = offsets[i/block_size];
        BEGIN_LAT_SAMPLE;
        if(optype == READ) {
            //__memmove_chk_avx512_no_vzeroupper(buffer, &mapped_buffer[offset], block_size);
            memcpy(buffer, &mapped_buffer[offset], block_size);
            ret_token += buffer[0];
        }
        else if (optype == WRITE) {
            //__memmove_chk_avx512_no_vzeroupper(&mapped_buffer[offset], buffer, block_size);
            memcpy(&mapped_buffer[offset], buffer, block_size);
            /* Read back from the block just written. Indexing with the
             * loop position i instead touched an unrelated cache line
             * inside the timed loop. */
            ret_token += mapped_buffer[offset];
        }
        END_LAT_SAMPLE;
    }

    end_time = nano_time();

    if(!silent) {
        printf("%s: %" PRIu64 " bytes read in %" PRIu64 " ns.\n",
        (optype==READ)?"readmap":"writemap",
        (uint_least64_t)filesize, (end_time-begin_time));
    
        // print GB/s
        printf("(tid %d) %.2f\n", tid,
            (double)filesize/(double)(end_time-begin_time)
            * NANOSECONDS_IN_SECOND / BYTES_IN_GB);
    }

    free(buffer);

    *begin = begin_time;
    *end = end_time;

#if SAMPLE_LATENCY
    /* size_t and uint64_t need %zu / PRIu64, not %ld. */
    printf("\nSample latency for %zu byte block:\n", block_size);
    for (i = 0; i < MAX_LAT_SAMPLES; i++)
        printf("\t%" PRIu64 ": %zu\n", i, latency_samples[i]);

#endif
    return ret_token;
}

/**
 * Map the first `size` bytes of `fd` read/write into the address space.
 *
 * The mapping is created with MAP_PRIVATE | MAP_POPULATE. Alternatives the
 * author experimented with were a MAP_SHARED file mapping, an anonymous
 * private mapping (MAP_PRIVATE | MAP_ANONYMOUS, fd -1), and following the
 * mmap with madvise(MADV_HUGEPAGE).
 *
 * @param fd   open descriptor of the file to map
 * @param size number of bytes to map, starting at file offset 0
 * @return the base address of the mapping; on failure the process exits
 *         through EXIT_MSG instead of returning
 */
char* map_buffer(int fd, size_t size) {
    void *region = mmap(NULL, size, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_POPULATE, fd, 0);

    if (region == MAP_FAILED) {
        EXIT_MSG("Failed to mmap file of size %zu: %s\n",
            size, strerror(errno));
    }

    return (char *)region;
}

/**
 * Look up the size of a file by name.
 *
 * @param filename path handed to stat()
 * @return the file's st_size, or (size_t)-1 if stat() fails
 */
size_t get_filesize(const char* filename){
    struct stat info;

    if (stat(filename, &info) != 0)
        return -1;

    return info.st_size;
}

/**
 * Print the usage text to stdout.
 *
 * The text lists every option accepted by main()'s option table,
 * including -s/--size, -t and --silent, and spells --writethreads with
 * its two leading dashes.
 *
 * @param progname argv[0]; only the part after the last '/' is shown
 */
void print_help_message(const char *progname) {

    /* take only the last portion of the path */
    const char *basename = strrchr(progname, '/');
    basename = basename ? basename + 1 : progname;

    printf("usage: %s [OPTION]\n", basename);
    printf("  -h, --help\n"
           "     Print this help and exit.\n");
    printf("  -b, --block=BLOCKSIZE\n"
           "     Block size used for read system calls.\n"
           "     For mmap tests, the size of the stride when iterating\n"
           "     over the file.\n"
           "     Defaults to %d.\n", DEFAULT_BLOCK_SIZE);
    printf("  -f, --file=FILENAME\n"
           "     Perform all tests on this file (defaults to %s).\n",
           DEFAULT_NAME);
    printf("  -s, --size=GIGABYTES\n"
           "     File size in GB to assume when the size of the file\n"
           "     cannot be determined (write tests only).\n");
    printf("  --readsyscall\n"
           "     Perform a read test using system calls.\n");
    printf("  --readmmap\n"
           "     Perform a read test using mmap.\n");
    printf("  --writesyscall\n"
           "     Perform a write test using system calls.\n");
    printf("  --writemmap\n"
           "     Perform a write test using mmap.\n");
    printf(" --randomaccess\n"
           "    Perform random access.\n");
    printf(" --silent\n"
           "    Print only the aggregate results.\n");
    printf(" -t, --threads=N\n"
           "    Number of threads to use. Defaults to one.\n");
    printf(" --mixedmmap\n"
           "    Perfom read and write concurrently at different offsets\n");
    printf(" -w, --writethreads=N\n"
           "    Number of threads that should perform write\n"
           "    (used with --mixedmmap). Defaults to 0.\n");
}

编译:

$ gcc testm.c -o testm -lpthread -static -O2 -fno-builtin-memcpy

运行程序的命令:

$ dd if=/dev/zero of=bigmmaptest bs=1M count=25600 # 25 GiB file
$ ./testm -b 1024 -f bigmmaptest --threads 16 --randomaccess --readmmap

我的机器是 32 核的第二代 Xeon 5218,L1d / L2 / L3 缓存大小分别为 512 KiB / 16 MiB / 22 MiB。

当 memcpy 大小为 1 KiB 时,我得到 21.7 GB/s;而大小为 256 B 时是 26.68 GB/s,大小为 4 KiB 时是 34.8 GB/s。为什么中间会出现性能下降?我还观察到,与 256 B 和 4 KiB 相比,2 KiB 的性能同样很差。

更有趣的是,当我禁用 L2 硬件预取器而不做任何其他改动时,1 KiB 和 2 KiB 访问大小下的带宽就会自动提高。禁用预取后,2 KiB 的 memcpy 可以达到 34.8 GB/s。以上数字都是聚合带宽。

我用 perf 测量了 L2 的加载/存储未命中次数,但结果并没有明显变化。在 8 个线程及以下时也观察不到这种现象。

我使用的是 Linux 5.0.4 和 glibc 的 memcpy(gcc 7.5.0),即使加上 -O2 也能观察到上述怪现象:1 KiB 访问大小在启用 L2 预取时为 18.76 GiB/s,禁用后为 30.32 GiB/s。作为比较,256 B 访问大小在启用预取时为 24.7 GiB/s,禁用时为 24.8 GiB/s。显然,性能下降是由预取器造成的 L2 缓存污染引起的,因为线程数较少时观察不到这一现象。我曾考虑 SMT 是否是污染加剧的原因,但在 16 个物理内核上运行 16 个线程时,这种影响依然十分明显。

浏览 glibc memcpy 代码,我可以看到任何小于 4 KiB 大小的访问都使用 AVX 256 指令,因此那里没有任何变化。

较小的 256 B 访问没有因 L2 流式预取器(streamer)而出现性能下降,可能是因为连续的缓存未命中序列太短,不足以激活流式预取器,因而不会把带宽(以及 LFB 和 L2 <-> L3 超级队列中的条目)浪费在无用的请求上。

对于对齐的 4 KiB 访问,同一页内不存在你不会读取的字节,因此 L2 预取器完全有益,或者至少无害。(运行 memcpy 时,对后续缓存行的需求加载(demand load)很快就会到来,所以我猜启用或禁用硬件预取时速度大致相同,除非硬件预取能在仍在等待上一个 4k 块末尾数据时,帮助提前开始下一个 4k 块。)

L2 只看到物理地址,据我所知,它不会尝试跨 4k 边界预取。 (即使它在同一个 2M 大页面中,因为它也不知道。)英特尔提到的“下一页预取器”在 Ivy Bridge 中是新的,AFAIK 只是一个 TLB 预取,而不是数据。

因此,使用 aligned 4k memcpy,硬件预取会在您实际要读取的数据末尾自动停止,不会浪费任何带宽。 由于 mmap 为您提供页面对齐的内存,因此这些 4k memcopies 来自单个源页面。

(目标缓冲区无关紧要,因为它很可能一直留在 L1d 缓存中,也许偶尔会被驱逐到 L2;而 memcpy 之后对它的重新加载可以由存储转发(store forwarding)满足,甚至不必等待 memcpy 的存储提交到 L1d。)

预测:如果较小的 memcpy 的源地址从某个 4k 页的中途开始,但仍然在该 4k 页的末尾结束,你可能会看到与禁用预取时类似的行为。例如,生成一个随机页号,从该页内 3072 字节处开始,复制 1 KiB。这样所有 1 KiB 的复制都来自页的末尾,而不是中间。

(每复制一个字节所对应的 dTLB 未命中仍然会更多,因为每个 TLB 条目只覆盖你实际读取的 1 KiB 数据。你确实使用了 MAP_POPULATE,所以只要 RAM 足够,在计时区域内就不应该出现缺页。)

L1d KiB /L2 MiB /L3 MiB -- 512 / 16 / 22

这些是总计,但 L1d 和 L2 是每个内核专用的!每个核心有 32kiB L1d 和 1MiB L2 ,因为这是 Cascade Lake,布局与 Skylake-X 相同。


顺便说一句,我会考虑在计时循环内使用 xorshift+ 或 xorshift* 这类快速 PRNG;它的随机性足以让预取失效;即使是简单的 LFSR,甚至是以 2 的幂为模的 LCG 也能做到(而且非常廉价,只需一次 imul 和一次 add)。如果你真的只想隔离 memcpy 的内存访问,这样就不必再从另一个数组读取偏移量了。不过可能没什么区别。周期恰好等于待覆盖空间大小的极简 PRNG(如 LCG)有一个优点:不会两次生成相同的地址,从而得到块的一个随机排列。但只要内存块足够大,即使没有这个难以实现的性质,随机访问也不太可能命中缓存,哪怕是 L3。

您当前的偏移数组没问题。 (我没有仔细查看代码,所以我只是假设没有错误。)