Odd discrepancy in CPU vs Wall time measurements with malloc
A: User and kernel CPU times are measured separately, so watch out!
EDIT: pTimes is now reset to zero between each benchmark, but the results got even stranger!
To play around with my own custom memory-management scheme, I wrote a simple benchmark of the existing malloc() in Visual Studio Community 2019 on Windows 10. Out of interest, I measured both CPU time and wall time. I test malloc by allocating one large block of memory spread across many chunks, then freeing each chunk individually without touching it. See here:
void malloc_distr256(int nMemsize) {
    void* pFreeList[256];                         // holds the 256 allocated chunks
    for (int i = 0; i < 256; ++i) pFreeList[i] = malloc(nMemsize >> 8);
    for (int i = 0; i < 256; ++i) free(pFreeList[i]);
}
void malloc_distr64(int nMemsize) {
    void* pFreeList[64];                          // holds the 64 allocated chunks
    for (int i = 0; i < 64; ++i) pFreeList[i] = malloc(nMemsize >> 6);
    for (int i = 0; i < 64; ++i) free(pFreeList[i]);
}
void malloc_distr0(int nMemsize) {
    void* pMem = malloc(nMemsize);                // one single allocation
    free(pMem);
}
I benchmarked these functions with the following code. "BenchTimes" is just a struct holding a double for CPU time and a double for wall time:
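(The BenchTimes struct itself is not shown in the post; a minimal definition consistent with the description and with the member names used in bench() below would be:)
typedef struct {
    double dCpuTime;    // accumulated CPU time, in microseconds
    double dWallTime;   // accumulated wall time, in microseconds
} BenchTimes;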
inline double cputime() {
    FILETIME lpCreationTime;
    FILETIME lpExitTime;
    FILETIME lpKernelTime;
    FILETIME lpUserTime;
    if (GetProcessTimes(GetCurrentProcess(), &lpCreationTime, &lpExitTime, &lpKernelTime, &lpUserTime)) {
        // FILETIME is a count of 100-ns ticks; * 0.1 converts to microseconds
        double dUnits = (double)(lpUserTime.dwLowDateTime | (long long)lpUserTime.dwHighDateTime << 32);
        return dUnits * 0.1;
    }
    else return 0xFFF0000000000000; // error sentinel
}
inline double walltime() {
    LARGE_INTEGER lnFreq, lnTime;
    if (QueryPerformanceFrequency(&lnFreq) && QueryPerformanceCounter(&lnTime))
        return 1000000.0 * (double)lnTime.QuadPart / (double)lnFreq.QuadPart;
        // multiply by 1,000,000 to convert seconds to microseconds,
        // because the CPU time measurement above is in microseconds as well
    return 0.0;
}
void bench(void (pfnFunc)(int), int nMemsize, int nIters, int nReps, BenchTimes* pTimes) {
    pTimes->dCpuTime = 0.0;
    pTimes->dWallTime = 0.0;
    for (volatile int r = 0; r < nReps; ++r) {
        double dCpuStart = cputime();
        double dWallStart = walltime();
        for (volatile int i = 0; i < nIters; ++i) pfnFunc(nMemsize);
        double dCpuEnd = cputime();
        double dWallEnd = walltime();
        double dCpuDiff = dCpuEnd - dCpuStart;
        double dWallDiff = dWallEnd - dWallStart;
        pTimes->dCpuTime += dCpuDiff;
        pTimes->dWallTime += dWallDiff;
    }
}
These are the times measured on my PC (i5-9400f), in seconds.
I'm curious about the big differences in performance, and especially about how the wall time compares to the CPU time!
The code that runs the benchmarks is here:
BenchTimes sTimes;
bench(malloc_distr256, 1 << 20, 100, 1000, &sTimes);
fprintf(stdout, "Malloc alloc/free bench allocated %lf megabytes, distributed over 256 chunks\n", (double)(1 << 20) / 1000000);
fprintf(stdout, "Malloc alloc/free bench returned:\nWalltime - total: %lf\nCPU Time - total: %lf\n", sTimes.dWallTime / 1000000, sTimes.dCpuTime / 1000000);
bench(malloc_distr64, 1 << 20, 100, 1000, &sTimes);
fprintf(stdout, "\nMalloc alloc/free bench allocated %lf megabytes, distributed over 64 chunks\n", (double)(1 << 20) / 1000000);
fprintf(stdout, "\nMalloc alloc/free bench returned:\nWalltime - total: %lf\nCPU Time - total: %lf\n", sTimes.dWallTime / 1000000, sTimes.dCpuTime / 1000000);
bench(malloc_distr0, 1 << 20, 100, 1000, &sTimes);
fprintf(stdout, "\nMalloc alloc/free bench allocated %lf megabytes, distributed over no chunks\n", (double)(1 << 20) / 1000000);
fprintf(stdout, "\nMalloc alloc/free bench returned:\nWalltime - total: %lf\nCPU Time - total: %lf\n", sTimes.dWallTime / 1000000, sTimes.dCpuTime / 1000000);
system("pause");
A: malloc is implemented via HeapAlloc, which in turn is implemented in a system function called RtlAllocateHeap.
That function manages the heap. It allocates pages of system memory via VirtualAlloc[Ex] or its equivalent, and serves smaller allocations from within those pages.
For larger allocations, the VirtualAlloc[Ex] equivalent is called on every allocation; for smaller allocations it is only called occasionally.
VirtualAlloc[Ex] is implemented with the kernel call NtAllocateVirtualMemory. Most of the time spent inside it is not counted in lpUserTime.
QueryPerformanceCounter, on the other hand, measures the honest total elapsed time.
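One way to confirm this (not part of the original answer, just a sketch under that assumption) is to also read lpKernelTime from GetProcessTimes and report user + kernel CPU time; on this benchmark the "missing" time should then show up almost entirely as kernel time. The names cputime_total and filetime_to_us below are invented for the example:
#include <windows.h>

// Convert a FILETIME (a count of 100-ns ticks) to microseconds.
static double filetime_to_us(FILETIME t) {
    return 0.1 * (double)(t.dwLowDateTime | ((long long)t.dwHighDateTime << 32));
}

// Total CPU time (user + kernel) charged to the current process, in microseconds.
// Returns 0.0 on failure; error handling is simplified for the sketch.
static double cputime_total(void) {
    FILETIME ftCreation, ftExit, ftKernel, ftUser;
    if (GetProcessTimes(GetCurrentProcess(), &ftCreation, &ftExit, &ftKernel, &ftUser))
        return filetime_to_us(ftUser) + filetime_to_us(ftKernel);
    return 0.0;
}
Swapping something like this in for cputime() in bench() should make the CPU total track the QueryPerformanceCounter wall-clock total much more closely, since the work done in NtAllocateVirtualMemory is billed to kernel time rather than user time.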