执行我的简单函数需要多少个 CPU 周期?
How many CPU cycles does it take to execute my simple function?
出于学习目的,我想知道一个函数在(不同级别的)优化并编译之后,执行它需要多少个 CPU 周期。有没有办法分析代码或可执行文件,从而得到可重现的答案?我使用的是 Eclipse Luna,在 64 位 Windows 7 Pro 上配合 MinGW。
#include <math.h>
#include "main.h"
#define EPS 1e-15 // small tolerance of the order of machine precision; magnitudes below it are treated as zero
#define R2D 57.295779513082320876798154814105 // radians-to-degrees factor (180/pi): multiply radians by R2D to get degrees
#define D2R 0.01745329251994329576923690768489 // degrees-to-radians factor (pi/180): multiply degrees by D2R to get radians
#define TWO_PI 6.283185307179586476925286766559 // 2*pi; not referenced by the code shown in this listing
// Computes the initial true course from point 1 to point 2.
//
// lat1, lon1, lat2, lon2 point to angles in degrees (each is scaled by D2R
// before use); the pointed-to values are only read.
// Returns the course in degrees, normalised by fmod into [0, 360).
// When the start point is at a pole the course is fixed: 180 for a positive
// start latitude, 0 otherwise.
double _stdcall CourseInitial (double *lat1, double *lon1, double *lat2, double *lon2)
{
    const double startLat = D2R * *lat1;
    const double endLat = D2R * *lat2;
    const double deltaLon = D2R * (*lon2 - *lon1);
    double course;

    if (cos(startLat) < EPS) {
        // cos(latitude) is effectively zero, so the start point is a pole.
        course = (startLat > 0) ? 180.0 : 0.0;
    } else {
        // General case: atan2 gives an angle within +/-180 degrees once scaled by R2D.
        const double y = sin(deltaLon);
        const double x = cos(startLat) * tan(endLat) - sin(startLat) * cos(deltaLon);
        course = R2D * atan2(y, x);
    }

    // A course that is zero within EPS is snapped to exactly 0 (the original
    // author's guard against fmod yielding 360 through rounding); any other
    // course is shifted up by a full turn so the fmod argument is non-negative.
    course = (fabs(course) < EPS) ? 0.0 : course + 360.0;

    return fmod(course, 360.0);
}
// Demo driver: computes the initial course from (lat 67, lon 15) to
// (lat 71, lon 24), both in degrees, and prints it with one decimal place.
// NOTE(review): printf needs <stdio.h>; presumably it arrives via "main.h" - confirm.
int main(void)
{
    double lat1 = 67; // fix: the terminating ';' was missing, which broke compilation
    double lon1 = 15;
    double lat2 = 71;
    double lon2 = 24;

    // CourseInitial takes its arguments by address, so the locals are passed as pointers.
    double tc = CourseInitial(&lat1, &lon1, &lat2, &lon2);

    printf("The course from point 1 to 2 is: %.1f\n", tc);
    return 0;
}
我这样做了:
#include <math.h>
#include <cstdlib>
#include <cstdio>
#define EPS 1e-15 // small tolerance of the order of machine precision; magnitudes below it are treated as zero
#define R2D 57.295779513082320876798154814105 // radians-to-degrees factor (180/pi): multiply radians by R2D to get degrees
#define D2R 0.01745329251994329576923690768489 // degrees-to-radians factor (pi/180): multiply degrees by D2R to get radians
#define TWO_PI 6.283185307179586476925286766559 // 2*pi; not referenced by the code shown in this listing
// Computes the initial true course from point 1 to point 2.
//
// lat1, lon1, lat2, lon2 point to angles in degrees (each is scaled by D2R
// before use); the pointed-to values are only read.
// Returns the course in degrees, normalised by fmod into [0, 360).
// When the start point is at a pole the course is fixed: 180 for a positive
// start latitude, 0 otherwise.
double CourseInitial (double *lat1, double *lon1, double *lat2, double *lon2)
{
    const double startLat = D2R * *lat1;
    const double endLat = D2R * *lat2;
    const double deltaLon = D2R * (*lon2 - *lon1);
    double course;

    if (cos(startLat) < EPS) {
        // cos(latitude) is effectively zero, so the start point is a pole.
        course = (startLat > 0) ? 180.0 : 0.0;
    } else {
        // General case: atan2 gives an angle within +/-180 degrees once scaled by R2D.
        const double y = sin(deltaLon);
        const double x = cos(startLat) * tan(endLat) - sin(startLat) * cos(deltaLon);
        course = R2D * atan2(y, x);
    }

    // A course that is zero within EPS is snapped to exactly 0 (the original
    // author's guard against fmod yielding 360 through rounding); any other
    // course is shifted up by a full turn so the fmod argument is non-negative.
    course = (fabs(course) < EPS) ? 0.0 : course + 360.0;

    return fmod(course, 360.0);
}
// A position given as a latitude/longitude pair. main passes the addresses of
// these fields to CourseInitial, which interprets the values as degrees.
struct LatLon
{
    double lat; // latitude
    double lon; // longitude
};
// One benchmark input: the pair of positions a course is computed between.
struct CoursePoint
{
    LatLon a; // start position; main passes it as (lat1, lon1) to CourseInitial
    LatLon b; // end position; main passes it as (lat2, lon2) to CourseInitial
};
// Number of benchmark samples; every loop in main runs this many iterations.
const int SIZE = 1000000;
// Input position pairs; main fills them with random positions before the timed loop.
CoursePoint cps[SIZE];
// CourseInitial result for each input; main sums and prints them after timing.
double tc[SIZE];
// Builds a pseudo-random position from two consecutive rand() calls:
// the first gives a whole-number latitude in [0, 89], the second a
// whole-number longitude in [0, 59].
LatLon RandomLatLon()
{
    // The latitude is drawn first so the rand() sequence is consumed in a fixed order.
    const double latitude = rand() % 90;
    const double longitude = rand() % 60;

    LatLon position;
    position.lat = latitude;
    position.lon = longitude;
    return position;
}
// Reads the x86 time-stamp counter with the RDTSC instruction and returns it
// as one 64-bit value. The instruction's two 32-bit outputs are taken from
// the "a" and "d" registers, as named by the asm constraints.
static __inline__ unsigned long long rdtsc(void)
{
    unsigned int lowHalf;
    unsigned int highHalf;

    __asm__ __volatile__ ("rdtsc" : "=a"(lowHalf), "=d"(highHalf));

    // Widen before shifting so the upper half is not lost in 32-bit arithmetic.
    unsigned long long ticks = highHalf;
    ticks <<= 32;
    ticks |= lowHalf;
    return ticks;
}
int main(void)
{
for(int i = 0; i < SIZE; i++)
{
cps[i].a = RandomLatLon();
cps[i].b = RandomLatLon();
}
unsigned long long t = rdtsc();
for(int i = 0; i < SIZE; i++)
{
tc[i] = CourseInitial(&cps[i].a.lat, &cps[i].a.lon, &cps[i].b.lat, &cps[i].b.lon);
}
t = rdtsc() - t;
printf("Time=%f\n", t/(double)SIZE);
double tot = 0;
for(int i = 0; i < SIZE; i++)
{
tot += tc[i];
}
printf("Sum of courses: %f\n", tot);
return 0;
}
每次迭代大约需要 850-1000 个周期。
如果不像这样运行一个足够长的循环,各种偶然的干扰因素就会影响测得的性能。编译器优化选项带来的差异很小;添加 -ffast-math 带来的差异比 -O0 与 -O3 之间的差异更大。不同的编译器之间也存在一些差异。
g++ 4.9.2:
-O0 1013 cycles
-O1 879 cycles
-O2 878 cycles
-O3 877 cycles
Add -ffast-math:
-O0 978
-O1 855 (re-run gives 890)
-O2 882
-O3 848 (re-run gives 850)
Clang++(几周前的 3.7 版本):
-O0 998 cycles
-O1 954 cycles
-O2 955 cycles
-O3 957 cycles
Add -ffast-math:
-O0 967
-O1 872
-O2 865
-O3 875
Clang++(截至昨天的最新版本):
-O0 1001 cycles
-O1 956 cycles
-O2 948 cycles
-O3 949 cycles
Add -ffast-math:
-O0 969
-O1 871
-O2 869
-O3 873
请注意,小于 10 个时钟周期的差异在统计上可能并不显著。我报告的虽然是单次运行的结果,但在此之前我已经运行过几次,以确保大多数时候得到的结果大致相同。
请注意,不同的处理器会给出完全不同的结果,不同的编译器版本也明显存在一些差异。
编辑:为了好玩,我打算用 Pascal 重写它,并用我自己的 Pascal 编译器编译运行,看看结果如何。
用我的 Pascal 编译器以 -O2(可用的最高优化级别)编译的代码需要 881 个时钟周期。
FreePascal(即 Linux 上“官方”的 Pascal 编译器)没有可用的时钟周期计数器,所以我只运行了 time ./course,结果大约为 0.44 秒,而我的编译器生成的代码为 0.37 秒。
出于学习目的,我想知道一个函数在(不同级别的)优化并编译之后,执行它需要多少个 CPU 周期。有没有办法分析代码或可执行文件,从而得到可重现的答案?我使用的是 Eclipse Luna,在 64 位 Windows 7 Pro 上配合 MinGW。
#include <math.h>
#include "main.h"
#define EPS 1e-15 // small tolerance of the order of machine precision; magnitudes below it are treated as zero
#define R2D 57.295779513082320876798154814105 // radians-to-degrees factor (180/pi): multiply radians by R2D to get degrees
#define D2R 0.01745329251994329576923690768489 // degrees-to-radians factor (pi/180): multiply degrees by D2R to get radians
#define TWO_PI 6.283185307179586476925286766559 // 2*pi; not referenced by the code shown in this listing
// Computes the initial true course from point 1 to point 2.
//
// lat1, lon1, lat2, lon2 point to angles in degrees (each is scaled by D2R
// before use); the pointed-to values are only read.
// Returns the course in degrees, normalised by fmod into [0, 360).
// When the start point is at a pole the course is fixed: 180 for a positive
// start latitude, 0 otherwise.
double _stdcall CourseInitial (double *lat1, double *lon1, double *lat2, double *lon2)
{
    const double startLat = D2R * *lat1;
    const double endLat = D2R * *lat2;
    const double deltaLon = D2R * (*lon2 - *lon1);
    double course;

    if (cos(startLat) < EPS) {
        // cos(latitude) is effectively zero, so the start point is a pole.
        course = (startLat > 0) ? 180.0 : 0.0;
    } else {
        // General case: atan2 gives an angle within +/-180 degrees once scaled by R2D.
        const double y = sin(deltaLon);
        const double x = cos(startLat) * tan(endLat) - sin(startLat) * cos(deltaLon);
        course = R2D * atan2(y, x);
    }

    // A course that is zero within EPS is snapped to exactly 0 (the original
    // author's guard against fmod yielding 360 through rounding); any other
    // course is shifted up by a full turn so the fmod argument is non-negative.
    course = (fabs(course) < EPS) ? 0.0 : course + 360.0;

    return fmod(course, 360.0);
}
// Demo driver: computes the initial course from (lat 67, lon 15) to
// (lat 71, lon 24), both in degrees, and prints it with one decimal place.
// NOTE(review): printf needs <stdio.h>; presumably it arrives via "main.h" - confirm.
int main(void)
{
    double lat1 = 67; // fix: the terminating ';' was missing, which broke compilation
    double lon1 = 15;
    double lat2 = 71;
    double lon2 = 24;

    // CourseInitial takes its arguments by address, so the locals are passed as pointers.
    double tc = CourseInitial(&lat1, &lon1, &lat2, &lon2);

    printf("The course from point 1 to 2 is: %.1f\n", tc);
    return 0;
}
我这样做了:
#include <math.h>
#include <cstdlib>
#include <cstdio>
#define EPS 1e-15 // small tolerance of the order of machine precision; magnitudes below it are treated as zero
#define R2D 57.295779513082320876798154814105 // radians-to-degrees factor (180/pi): multiply radians by R2D to get degrees
#define D2R 0.01745329251994329576923690768489 // degrees-to-radians factor (pi/180): multiply degrees by D2R to get radians
#define TWO_PI 6.283185307179586476925286766559 // 2*pi; not referenced by the code shown in this listing
// Computes the initial true course from point 1 to point 2.
//
// lat1, lon1, lat2, lon2 point to angles in degrees (each is scaled by D2R
// before use); the pointed-to values are only read.
// Returns the course in degrees, normalised by fmod into [0, 360).
// When the start point is at a pole the course is fixed: 180 for a positive
// start latitude, 0 otherwise.
double CourseInitial (double *lat1, double *lon1, double *lat2, double *lon2)
{
    const double startLat = D2R * *lat1;
    const double endLat = D2R * *lat2;
    const double deltaLon = D2R * (*lon2 - *lon1);
    double course;

    if (cos(startLat) < EPS) {
        // cos(latitude) is effectively zero, so the start point is a pole.
        course = (startLat > 0) ? 180.0 : 0.0;
    } else {
        // General case: atan2 gives an angle within +/-180 degrees once scaled by R2D.
        const double y = sin(deltaLon);
        const double x = cos(startLat) * tan(endLat) - sin(startLat) * cos(deltaLon);
        course = R2D * atan2(y, x);
    }

    // A course that is zero within EPS is snapped to exactly 0 (the original
    // author's guard against fmod yielding 360 through rounding); any other
    // course is shifted up by a full turn so the fmod argument is non-negative.
    course = (fabs(course) < EPS) ? 0.0 : course + 360.0;

    return fmod(course, 360.0);
}
// A position given as a latitude/longitude pair. main passes the addresses of
// these fields to CourseInitial, which interprets the values as degrees.
struct LatLon
{
    double lat; // latitude
    double lon; // longitude
};
// One benchmark input: the pair of positions a course is computed between.
struct CoursePoint
{
    LatLon a; // start position; main passes it as (lat1, lon1) to CourseInitial
    LatLon b; // end position; main passes it as (lat2, lon2) to CourseInitial
};
// Number of benchmark samples; every loop in main runs this many iterations.
const int SIZE = 1000000;
// Input position pairs; main fills them with random positions before the timed loop.
CoursePoint cps[SIZE];
// CourseInitial result for each input; main sums and prints them after timing.
double tc[SIZE];
// Builds a pseudo-random position from two consecutive rand() calls:
// the first gives a whole-number latitude in [0, 89], the second a
// whole-number longitude in [0, 59].
LatLon RandomLatLon()
{
    // The latitude is drawn first so the rand() sequence is consumed in a fixed order.
    const double latitude = rand() % 90;
    const double longitude = rand() % 60;

    LatLon position;
    position.lat = latitude;
    position.lon = longitude;
    return position;
}
// Reads the x86 time-stamp counter with the RDTSC instruction and returns it
// as one 64-bit value. The instruction's two 32-bit outputs are taken from
// the "a" and "d" registers, as named by the asm constraints.
static __inline__ unsigned long long rdtsc(void)
{
    unsigned int lowHalf;
    unsigned int highHalf;

    __asm__ __volatile__ ("rdtsc" : "=a"(lowHalf), "=d"(highHalf));

    // Widen before shifting so the upper half is not lost in 32-bit arithmetic.
    unsigned long long ticks = highHalf;
    ticks <<= 32;
    ticks |= lowHalf;
    return ticks;
}
int main(void)
{
for(int i = 0; i < SIZE; i++)
{
cps[i].a = RandomLatLon();
cps[i].b = RandomLatLon();
}
unsigned long long t = rdtsc();
for(int i = 0; i < SIZE; i++)
{
tc[i] = CourseInitial(&cps[i].a.lat, &cps[i].a.lon, &cps[i].b.lat, &cps[i].b.lon);
}
t = rdtsc() - t;
printf("Time=%f\n", t/(double)SIZE);
double tot = 0;
for(int i = 0; i < SIZE; i++)
{
tot += tc[i];
}
printf("Sum of courses: %f\n", tot);
return 0;
}
每次迭代大约需要 850-1000 个周期。
如果不像这样运行一个足够长的循环,各种偶然的干扰因素就会影响测得的性能。编译器优化选项带来的差异很小;添加 -ffast-math 带来的差异比 -O0 与 -O3 之间的差异更大。不同的编译器之间也存在一些差异。
g++ 4.9.2:
-O0 1013 cycles
-O1 879 cycles
-O2 878 cycles
-O3 877 cycles
Add -ffast-math:
-O0 978
-O1 855 (re-run gives 890)
-O2 882
-O3 848 (re-run gives 850)
Clang++(几周前的 3.7 版本):
-O0 998 cycles
-O1 954 cycles
-O2 955 cycles
-O3 957 cycles
Add -ffast-math:
-O0 967
-O1 872
-O2 865
-O3 875
Clang++(截至昨天的最新版本):
-O0 1001 cycles
-O1 956 cycles
-O2 948 cycles
-O3 949 cycles
Add -ffast-math:
-O0 969
-O1 871
-O2 869
-O3 873
请注意,小于 10 个时钟周期的差异在统计上可能并不显著。我报告的虽然是单次运行的结果,但在此之前我已经运行过几次,以确保大多数时候得到的结果大致相同。
请注意,不同的处理器会给出完全不同的结果,不同的编译器版本也明显存在一些差异。
编辑:为了好玩,我打算用 Pascal 重写它,并用我自己的 Pascal 编译器编译运行,看看结果如何。
用我的 Pascal 编译器以 -O2(可用的最高优化级别)编译的代码需要 881 个时钟周期。
FreePascal(即 Linux 上“官方”的 Pascal 编译器)没有可用的时钟周期计数器,所以我只运行了 time ./course,结果大约为 0.44 秒,而我的编译器生成的代码为 0.37 秒。