为什么 memcpy 到栈上内存比堆上内存快得多?
Why is memcpy to on-stack memory much faster than to on-heap memory?
我先创建一个文件如下:
// Fixed-layout record made of eight consecutive 32-bit unsigned fields.
// It is written to and read from the test file as raw bytes, so the
// member order and types define the on-disk format.
struct event_data
{
    uint32_t data_1;
    uint32_t data_2;
    uint32_t data_3;
    uint32_t data_4;
    uint32_t data_5;
    uint32_t data_6;
    uint32_t data_7;
    uint32_t data_8;
};
FILE* fp = fopen("C:\test.bin", "w+b");
for(int i=0; i<100000; i++)
{
event_data data;
fwrite(&data, sizeof(event_data), 1, fp);
}
fclose(fp);
之后我尝试使用boost的内存映射文件从文件中读取数据:
clock_t start = clock();
event_data stack_buffer;
event_data* heap_buffer = new event_data();
for(int j=0; j<10000; j++)
{
boost::iostreams::mapped_file mmap("C:\test.bin", boost::iostreams::mapped_file::readonly);
const char* data = mmap.const_data();
for(int i=0; i<100000; i++)
{
const event_data* evt = reinterpret_cast<const event_data*>(data) + i;
// Use memcpy to copy data to buffer, either (1) or (2)
// memcpy(&stack_buffer, evt, sizeof(event_data)); <== (1)
// memcpy(heap_buffer, evt, sizeof(event_data)); <== (2)
}
}
clock_t end = clock();
printf("%f sec\n", (double) (end - start) / CLOCKS_PER_SEC);
如果我使用 (1),它会打印出约 '0.56 sec'。如果我使用 (2),它会打印出约 '26.6 sec'。为什么结果如此不同?
我写了两个函数:
// Two 100-element int arrays; only declared here, their definitions are not
// part of this snippet.
extern int moo1[100];
extern int moo2[100];

// Copies all of moo1 into the global array moo2, then stores
// moo2[77] + moo2[14] into moo2[2].
void foo1()
{
    memcpy(moo2, moo1, sizeof moo1);
    const int sum = moo2[77] + moo2[14];
    moo2[2] = sum;
}
// Copies all of moo1 into a function-local array and stores
// moo3[77] + moo3[14] into moo3[2] of that copy. Nothing is returned and
// the local array is not accessible once the function returns.
void foo2()
{
    int moo3[100];
    memcpy(moo3, moo1, sizeof moo1);
    const int sum = moo3[77] + moo3[14];
    moo3[2] = sum;
}
这是 gcc 将它们编译成的内容:
foo1
# foo1(): inlined 400-byte copy from moo1 to moo2, followed by
# moo2[2] = moo2[77] + moo2[14].
_Z4foo1v:
.LFB5:
    pushq   %rdi                        # rdi and rsi are used by rep movsq; saved here, restored before ret
    .seh_pushreg %rdi
    pushq   %rsi
    .seh_pushreg %rsi
    .seh_endprologue
    movq    .refptr.moo1(%rip), %rsi    # rsi = &moo1 (source)
    movq    .refptr.moo2(%rip), %rax    # rax = &moo2 (destination)
    movq    (%rsi), %rdx                # load the first 8 source bytes
    leaq    8(%rax), %rdi
    movq    %rax, %rcx
    andq    $-8, %rdi                   # rdi = first 8-byte-aligned address after the start of moo2
    movq    %rdx, (%rax)                # store the first 8 bytes
    movq    392(%rsi), %rdx             # load the last 8 source bytes (400 - 8 = 392)
    subq    %rdi, %rcx                  # rcx = moo2 - aligned destination (zero or negative)
    subq    %rcx, %rsi                  # advance the source by the same number of bytes
    addl    $400, %ecx                  # bytes from the aligned destination to the end of the copy
    shrl    $3, %ecx                    # convert the byte count to a count of 8-byte words
    movq    %rdx, 392(%rax)             # store the last 8 bytes
    rep movsq                           # copy ecx quadwords from (%rsi) to (%rdi)
    movl    56(%rax), %edx              # edx = moo2[14]  (14 * 4 = 56)
    addl    308(%rax), %edx             # edx += moo2[77] (77 * 4 = 308)
    movl    %edx, 8(%rax)               # moo2[2] = edx   (2 * 4 = 8)
    popq    %rsi
    popq    %rdi
    ret
foo2
# foo2(): after the prologue marker the only instruction is the return;
# no copy or arithmetic instructions appear in this listing.
_Z4foo2v:
.LFB6:
.seh_endprologue
ret
得出你自己的结论。
我先创建一个文件如下:
// Fixed-layout record made of eight consecutive 32-bit unsigned fields.
// It is written to and read from the test file as raw bytes, so the
// member order and types define the on-disk format.
struct event_data
{
    uint32_t data_1;
    uint32_t data_2;
    uint32_t data_3;
    uint32_t data_4;
    uint32_t data_5;
    uint32_t data_6;
    uint32_t data_7;
    uint32_t data_8;
};
FILE* fp = fopen("C:\test.bin", "w+b");
for(int i=0; i<100000; i++)
{
event_data data;
fwrite(&data, sizeof(event_data), 1, fp);
}
fclose(fp);
之后我尝试使用boost的内存映射文件从文件中读取数据:
clock_t start = clock();
event_data stack_buffer;
event_data* heap_buffer = new event_data();
for(int j=0; j<10000; j++)
{
boost::iostreams::mapped_file mmap("C:\test.bin", boost::iostreams::mapped_file::readonly);
const char* data = mmap.const_data();
for(int i=0; i<100000; i++)
{
const event_data* evt = reinterpret_cast<const event_data*>(data) + i;
// Use memcpy to copy data to buffer, either (1) or (2)
// memcpy(&stack_buffer, evt, sizeof(event_data)); <== (1)
// memcpy(heap_buffer, evt, sizeof(event_data)); <== (2)
}
}
clock_t end = clock();
printf("%f sec\n", (double) (end - start) / CLOCKS_PER_SEC);
如果我使用 (1),它会打印出约 '0.56 sec'。如果我使用 (2),它会打印出约 '26.6 sec'。为什么结果如此不同?
我写了两个函数:
// Two 100-element int arrays; only declared here, their definitions are not
// part of this snippet.
extern int moo1[100];
extern int moo2[100];

// Copies all of moo1 into the global array moo2, then stores
// moo2[77] + moo2[14] into moo2[2].
void foo1()
{
    memcpy(moo2, moo1, sizeof moo1);
    const int sum = moo2[77] + moo2[14];
    moo2[2] = sum;
}
// Copies all of moo1 into a function-local array and stores
// moo3[77] + moo3[14] into moo3[2] of that copy. Nothing is returned and
// the local array is not accessible once the function returns.
void foo2()
{
    int moo3[100];
    memcpy(moo3, moo1, sizeof moo1);
    const int sum = moo3[77] + moo3[14];
    moo3[2] = sum;
}
这是 gcc 将它们编译成的内容:
foo1
# foo1(): inlined 400-byte copy from moo1 to moo2, followed by
# moo2[2] = moo2[77] + moo2[14].
_Z4foo1v:
.LFB5:
    pushq   %rdi                        # rdi and rsi are used by rep movsq; saved here, restored before ret
    .seh_pushreg %rdi
    pushq   %rsi
    .seh_pushreg %rsi
    .seh_endprologue
    movq    .refptr.moo1(%rip), %rsi    # rsi = &moo1 (source)
    movq    .refptr.moo2(%rip), %rax    # rax = &moo2 (destination)
    movq    (%rsi), %rdx                # load the first 8 source bytes
    leaq    8(%rax), %rdi
    movq    %rax, %rcx
    andq    $-8, %rdi                   # rdi = first 8-byte-aligned address after the start of moo2
    movq    %rdx, (%rax)                # store the first 8 bytes
    movq    392(%rsi), %rdx             # load the last 8 source bytes (400 - 8 = 392)
    subq    %rdi, %rcx                  # rcx = moo2 - aligned destination (zero or negative)
    subq    %rcx, %rsi                  # advance the source by the same number of bytes
    addl    $400, %ecx                  # bytes from the aligned destination to the end of the copy
    shrl    $3, %ecx                    # convert the byte count to a count of 8-byte words
    movq    %rdx, 392(%rax)             # store the last 8 bytes
    rep movsq                           # copy ecx quadwords from (%rsi) to (%rdi)
    movl    56(%rax), %edx              # edx = moo2[14]  (14 * 4 = 56)
    addl    308(%rax), %edx             # edx += moo2[77] (77 * 4 = 308)
    movl    %edx, 8(%rax)               # moo2[2] = edx   (2 * 4 = 8)
    popq    %rsi
    popq    %rdi
    ret
foo2
# foo2(): after the prologue marker the only instruction is the return;
# no copy or arithmetic instructions appear in this listing.
_Z4foo2v:
.LFB6:
.seh_endprologue
ret
得出你自己的结论。