执行我的简单函数需要多少个 CPU 周期？

Question

出于学习目的，我想知道一个函数在以不同级别优化并编译之后，执行它需要多少个 CPU 周期。有没有办法通过分析代码或可执行文件得到可重现的答案？我使用的是 Eclipse Luna，在 64 位 Windows 7 Pro 上配合 MinGW。

#include <math.h>
#include "main.h"

#define EPS 1e-15 // Tolerance below which a value is treated as zero (on the order of double machine precision)
#define R2D 57.295779513082320876798154814105   // 180/pi: multiply radians by R2D to get degrees
#define D2R 0.01745329251994329576923690768489  // pi/180: multiply degrees by D2R to get radians
#define TWO_PI 6.283185307179586476925286766559 // 2*pi (not referenced by the code shown in this listing)


// Initial true course, in degrees, for travelling from point 1 to point 2.
//
// lat1/lon1 and lat2/lon2 point to the coordinates of the two points, given
// in degrees (they are converted to radians with D2R). Every pointer is
// dereferenced unconditionally and none of the pointed-to values is modified.
//
// Returns the course normalised into [0, 360).
double _stdcall CourseInitial (double *lat1, double *lon1, double *lat2, double *lon2)
{
    const double phi1    = D2R * *lat1;
    const double phi2    = D2R * *lat2;
    const double dLambda = D2R * (*lon2 - *lon1);
    double course;

    if (cos(phi1) < EPS) {
        // cos(latitude) is (numerically) zero, so the start point is a pole:
        // 180 when leaving the N pole, 0 when leaving the S pole.
        course = (phi1 > 0) ? 180 : 0;
    } else {
        // atan2 yields an angle in [-pi, pi]; R2D scales it to degrees.
        course = R2D * atan2(sin(dLambda),
                             cos(phi1) * tan(phi2) - sin(phi1) * cos(dLambda));
    }

    // A course that is numerically zero is pinned to exactly 0; anything else
    // is shifted up by a full turn so that fmod sees a non-negative argument.
    const double shifted = (fabs(course) < EPS) ? 0 : course + 360;
    return fmod(shifted, 360);
}

// Demo driver: computes the initial course between two fixed sample points
// and prints it with one decimal place.
int main(void)
{
    // Sample coordinates in degrees (point 1 -> point 2).
    double lat1 = 67;
    double lon1 = 15;
    double lat2 = 71;
    double lon2 = 24;

    // CourseInitial takes its arguments by address.
    double tc = CourseInitial(&lat1, &lon1, &lat2, &lon2);
    printf("The course from point 1 to 2 is: %.1f\n", tc);

    return 0;
}

Answer 1

我这样做了：

#include <math.h>
#include <cstdlib>
#include <cstdio>

#define EPS 1e-15 // Tolerance below which a value is treated as zero (on the order of double machine precision)
#define R2D 57.295779513082320876798154814105   // 180/pi: multiply radians by R2D to get degrees
#define D2R 0.01745329251994329576923690768489  // pi/180: multiply degrees by D2R to get radians
#define TWO_PI 6.283185307179586476925286766559 // 2*pi (not referenced by the code shown in this listing)


// Initial true course, in degrees, for travelling from point 1 to point 2.
//
// lat1/lon1 and lat2/lon2 point to the coordinates of the two points, given
// in degrees (they are converted to radians with D2R). Every pointer is
// dereferenced unconditionally and none of the pointed-to values is modified.
//
// Returns the course normalised into [0, 360).
double CourseInitial (double *lat1, double *lon1, double *lat2, double *lon2)
{
    const double startLat = D2R * *lat1;
    const double endLat   = D2R * *lat2;
    const double lonDiff  = D2R * (*lon2 - *lon1);
    double heading;

    if (cos(startLat) < EPS) {
        // cos(latitude) is (numerically) zero, so the start point is a pole:
        // 180 when leaving the N pole, 0 when leaving the S pole.
        heading = (startLat > 0) ? 180 : 0;
    } else {
        // atan2 yields an angle in [-pi, pi]; R2D scales it to degrees.
        heading = R2D * atan2(sin(lonDiff),
                              cos(startLat) * tan(endLat) - sin(startLat) * cos(lonDiff));
    }

    // A heading that is numerically zero is pinned to exactly 0; anything
    // else is shifted up by a full turn so fmod sees a non-negative argument.
    const double shifted = (fabs(heading) < EPS) ? 0 : heading + 360;
    return fmod(shifted, 360);
}

// A geographic position. The benchmark hands these fields to CourseInitial,
// which interprets them as degrees.
struct LatLon
{
    double lat;  // latitude
    double lon;  // longitude
};

// One benchmark input: a pair of positions. main() passes `a` as point 1
// and `b` as point 2 of CourseInitial.
struct CoursePoint
{
    LatLon a;  // start position
    LatLon b;  // destination position
};

// Number of course calculations generated, timed and summed by main().
const int SIZE = 1000000;

// Benchmark inputs: one start/destination pair per calculation. Declared at
// file scope, so the array has static storage instead of living on the stack.
CoursePoint cps[SIZE];
// Benchmark outputs: tc[i] receives the course computed for cps[i].
double tc[SIZE];

// Builds a pseudo-random position from two consecutive rand() draws:
// latitude is a whole number in [0, 89], longitude a whole number in [0, 59].
// The latitude is always drawn first.
LatLon RandomLatLon()
{
    const int latDraw = rand() % 90;
    const int lonDraw = rand() % 60;

    LatLon position;
    position.lat = latDraw;
    position.lon = lonDraw;
    return position;
}

// Reads the x86 time-stamp counter via the RDTSC instruction and returns it
// as a single 64-bit value.
static __inline__ unsigned long long rdtsc(void)
{
    unsigned lowBits;
    unsigned highBits;

    // The "=a" / "=d" constraints bind the outputs to EAX / EDX, the two
    // registers RDTSC writes: low 32 bits in EAX, high 32 bits in EDX.
    __asm__ __volatile__ ("rdtsc" : "=a"(lowBits), "=d"(highBits));

    unsigned long long ticks = highBits;
    ticks <<= 32;
    ticks |= lowBits;
    return ticks;
}

int main(void)
{
    for(int i = 0; i < SIZE; i++)
    {
    cps[i].a = RandomLatLon();
    cps[i].b = RandomLatLon();
    }
    unsigned long long t = rdtsc();
    for(int i = 0; i < SIZE; i++)
    {
    tc[i] = CourseInitial(&cps[i].a.lat, &cps[i].a.lon, &cps[i].b.lat, &cps[i].b.lon);
    }
    t = rdtsc() - t;
    printf("Time=%f\n", t/(double)SIZE);
    double tot = 0;
    for(int i = 0; i < SIZE; i++)
    {
    tot += tc[i];
    }
    printf("Sum of courses: %f\n", tot);

    return 0;
}

每次迭代大约需要 850-1000 个周期。

如果不运行这样的长循环，各种偶然的干扰因素就会影响测得的性能。编译器优化选项带来的差异很小，添加 -ffast-math 带来的差异比 -O0 与 -O3 之间的差异还大。不同的编译器之间也有一些差异。

g++ 4.9.2:

-O0    1013 cycles
-O1     879 cycles
-O2     878 cycles
-O3     877 cycles
Add -ffast-math:
-O0     978
-O1     855  (re-run gives 890)
-O2     882
-O3     848  (re-run gives 850)

Clang++（几周前的 3.7 版本）：

-O0     998 cycles
-O1     954 cycles
-O2     955 cycles
-O3     957 cycles
Add -ffast-math:
-O0     967
-O1     872
-O2     865
-O3     875

Clang++（截至昨天的最新版本）：

-O0    1001 cycles
-O1     956 cycles
-O2     948 cycles
-O3     949 cycles
Add -ffast-math:
-O0     969
-O1     871
-O2     869
-O3     873

请注意，小于 10 个时钟周期的差异在统计上可能并不显著。我只报告了一次运行的结果，但在此之前我试了几次，以确保大多数时候得到的答案（大致）相同。

请注意，不同的处理器会给出完全不同的结果，不同的编译器版本也明显存在一些差异。

编辑：为了好玩，我把它改写成了 Pascal，并用我的 Pascal 编译器编译运行，看看结果如何。

使用我的 Pascal 编译器编译的代码使用 -O2（可用的最高级别）需要 881 个时钟周期。

FreePascal（即 Linux 上“官方”的 Pascal 编译器）没有可用的时钟周期计数器，所以我只运行了 time ./course，结果大约为 0.44 秒，而我的编译器生成的代码是 0.37 秒。

执行我的简单函数需要多少个 CPU 周期？

How many CPU cycles does it take to execute my simple function?

c

c++

optimization

profiling