为什么 memcpy 到栈上内存比堆上内存快得多?

Why is memcpy to on-stack memory much faster than to on-heap memory?

我先创建一个文件如下:

// Fixed-size record that is written to and read back from the test file:
// eight consecutive 32-bit unsigned fields, declared in order data_1..data_8.
struct event_data
{
    uint32_t data_1, data_2, data_3, data_4;
    uint32_t data_5, data_6, data_7, data_8;
};

FILE* fp = fopen("C:\test.bin", "w+b");
for(int i=0; i<100000; i++)
{
    event_data data;
    fwrite(&data, sizeof(event_data), 1, fp);
}
fclose(fp);

之后我尝试使用boost的内存映射文件从文件中读取数据:

clock_t start = clock();

event_data stack_buffer;
event_data* heap_buffer = new event_data();

for(int j=0; j<10000; j++)
{
    boost::iostreams::mapped_file mmap("C:\test.bin", boost::iostreams::mapped_file::readonly);
    const char* data = mmap.const_data();
    for(int i=0; i<100000; i++)
    {
        const event_data* evt = reinterpret_cast<const event_data*>(data) + i;
        // Use memcpy to copy data to buffer, either (1) or (2)
        // memcpy(&stack_buffer, evt, sizeof(event_data)); <== (1)
        // memcpy(heap_buffer, evt, sizeof(event_data)); <== (2)
    }
}

clock_t end = clock();

printf("%f sec\n", (double) (end - start) / CLOCKS_PER_SEC);

如果我使用 (1),它会打印出约 '0.56 sec'。如果我使用 (2),它会打印出约 '26.6 sec'。为什么结果如此不同?

我写了两个函数:

// Two 100-element int arrays with external linkage. They are only declared
// here; their definitions are not part of the visible code.
extern int moo1 [100];
extern int moo2 [100];


// Copies the whole of moo1 into the externally visible array moo2, then
// stores moo2[77] + moo2[14] into moo2[2].
void foo1 ()
{
    memcpy(moo2, moo1, sizeof moo1);

    const int sum = moo2[77] + moo2[14];
    moo2[2] = sum;
}

void foo2 ()
{
    int moo3[100];

    memcpy(moo3, moo1, sizeof(moo1));
    moo3[2] = moo3[77] + moo3[14];
}

这是 gcc 将它们编译成的内容:

foo1

# foo1(): the 400-byte memcpy from moo1 to moo2 is expanded inline (two
# unaligned 8-byte head/tail moves plus an aligned `rep movsq`), followed by
# moo2[2] = moo2[77] + moo2[14].
_Z4foo1v:
.LFB5:
    pushq   %rdi
    .seh_pushreg    %rdi
    pushq   %rsi
    .seh_pushreg    %rsi
    .seh_endprologue
    movq    .refptr.moo1(%rip), %rsi    # rsi = address of moo1 (copy source)
    movq    .refptr.moo2(%rip), %rax    # rax = address of moo2 (copy destination)
    movq    (%rsi), %rdx                # load the first 8 source bytes
    leaq    8(%rax), %rdi
    movq    %rax, %rcx
    andq    $-8, %rdi                   # rdi = first 8-byte-aligned address above the destination start
    movq    %rdx, (%rax)                # store the first 8 bytes (head)
    movq    392(%rsi), %rdx             # load the last 8 source bytes (offsets 392..399)
    subq    %rdi, %rcx                  # rcx = destination start - aligned destination (negative)
    subq    %rcx, %rsi                  # advance the source by the same alignment offset
    addl    $400, %ecx                  # bytes remaining from the aligned destination to the end
    shrl    $3, %ecx                    # convert the byte count to a count of 8-byte words
    movq    %rdx, 392(%rax)             # store the last 8 bytes (tail)
    rep movsq                           # copy ecx quadwords from (rsi) to (rdi)
    movl    56(%rax), %edx              # edx = moo2[14]
    addl    308(%rax), %edx             # edx += moo2[77]
    movl    %edx, 8(%rax)               # moo2[2] = edx
    popq    %rsi
    popq    %rdi
    ret

foo2

# foo2(): no copy and no arithmetic instructions were emitted -- after the
# (empty) prologue the function returns immediately.
_Z4foo2v:
.LFB6:
    .seh_endprologue
    ret

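得出你自己的结论:foo2 中对局部数组的 memcpy 以及随后的运算被编译器完全优化掉了,因为之后没有任何代码读取该局部数组;而 foo1 写入的是外部可见的数组,拷贝必须保留。因此,用 (1) 测得的时间很可能根本没有包含任何 memcpy 的开销。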