如何测量 x86 纳秒以下的运行时间？

Question

我搜索并尝试了很多测量运行时间的方法，这方面的问题也已经有很多。比如这个问题就很好，但是当你需要一个精确的计时器时，我找不到好的方法。因此，我想在这里分享我的方法，供大家使用；如果有错误，请指正。

更新&注意：这个问题针对的是精度在一纳秒以下的基准测试。这与使用 clock_gettime(CLOCK_MONOTONIC,&start); 完全不同，后者记录的是一纳秒以上的时间。

更新： 衡量加速比的常用方法是重复程序中应该进行基准测试的部分。但是，正如评论中提到的，当研究人员依赖自动矢量化时，它可能会显示不同的优化。

注意测量一次重复的经过时间不够准确。在某些情况下，我的结果表明该部分必须重复超过 1K 或 1M 才能获得最短时间。

建议：我不熟悉 shell 编程（只知道一些基本命令……），但是，也许可以在不于程序内部重复代码的情况下测量最短时间。

我当前的解决方案：为了避免引入分支，我使用宏 #define REP_CODE(X) X X X... X X 来重复代码部分，其中 X 是我要进行基准测试的代码，如下所示：

// Problem-size constants and the statically allocated buffers of the kernel.
// MAX1 is not defined in this snippet; it has to be defined before this point.
// The expansion is fully parenthesized so FMAX1 behaves as a single operand in
// any expression (e.g. `x / FMAX1` or `FMAX1 * 2`).
#define FMAX1 ((MAX1)*(MAX1))
#define COEFF 8 // number of filter coefficients
// input carries COEFF extra elements because the kernel reads input[i+j]
// for i < FMAX1 and j < COEFF.
int __attribute__(( aligned(32))) input[FMAX1+COEFF];   // sample data, e.g. {1,2,3,...,17}
int __attribute__(( aligned(32))) output[FMAX1];
int __attribute__(( aligned(32))) coeff[COEFF] = {1,2,3,4,5,6,7,8};   // alternative pattern: {1,1,1,1,1,1,1,1}

// Benchmark driver: the timed kernel below is passed to REP_CODE, which the
// accompanying text describes as pasting its argument several times in a row.
// REP_CODE and the variables used here (i, j, ii, t1_rdtsc, t2_rdtsc,
// ttotal_rdtsc) are not declared in this snippet and must come from elsewhere.
int main()
{
    REP_CODE(
        t1_rdtsc=_rdtsc(); // counter value before the measured section
        //Code
        for(i = 0; i < FMAX1; i++){
            for(j = 0; j < COEFF; j++){//IACA_START
                output[i] += coeff[j] * input[i+j]; 

            }//IACA_END
        }
        t2_rdtsc=_rdtsc(); // counter value after the measured section
        ttotal_rdtsc[ii++]=t2_rdtsc-t1_rdtsc; // store this repetition's tick count in the next slot
        )
    // The smallest element in `ttotal_rdtsc` is the answer
}

这不影响优化，但在某些情况下受代码大小和编译时间过多的限制。

有什么建议和更正吗？

提前致谢。

Answer 1

我建议在 x86 微架构上使用此方法。

注意：

NUM_LOOP 是重复执行您的代码的次数；重复次数越多，记录到的最佳时间就越准确。
ttbest_rdtsc 的初始值必须大于最坏情况下的时间，我建议把它设为尽可能大的值。
我使用 OVERAL_TIME（你可能不需要它）作为另一条检查规则，因为我把它用于许多内核，在某些情况下 NUM_LOOP 非常大，而我不想改变它。我打算用 OVERAL_TIME 限制迭代，使其在累计到特定时间后停止。

更新：整个程序是这样的：

#include <stdio.h>
#include <x86intrin.h>

// Number of timed runs. The smallest run is reported, which filters out
// one-off overheads such as cache misses.
#define NUM_LOOP 100

/*
 * Times the marked section NUM_LOOP times with the time-stamp counter and
 * prints the smallest tick count observed.
 */
int main()
{
    // ttbest_rdtsc starts at a sentinel that must exceed any tick count a
    // single run can produce.
    long long t1_rdtsc, t2_rdtsc, ttotal_rdtsc, ttbest_rdtsc = 99999999999999999;

    // Counted loop: the body runs exactly NUM_LOOP times, matching the
    // repetition count printed below.
    for (int run = 0; run < NUM_LOOP; run++) {

        t1_rdtsc = _rdtsc();
            //put your code here
        t2_rdtsc = _rdtsc();

        ttotal_rdtsc = t2_rdtsc - t1_rdtsc;

        // keep the smallest time seen so far
        if (ttotal_rdtsc < ttbest_rdtsc)
            ttbest_rdtsc = ttotal_rdtsc;

    }

    printf("\nthe best is %lld in %d repetitions\n", ttbest_rdtsc, NUM_LOOP );

    return 0;
}

我已经对此进行了更改并为我自己添加了一个 header 然后我可以在我的程序中简单地使用它。

#include <x86intrin.h>
// do_while is the requested repetition count. NUM_LOOP is not defined in this
// header, so the including file has to define it first.
#define do_while NUM_LOOP
// Budget of accumulated measured ticks: the loop closed by end_rdtsc stops
// once ttime is no longer below this value.
#define OVERAL_TIME 999999999
// Shared state of the begin_rdtsc / end_rdtsc pair:
//   t1_rdtsc, t2_rdtsc : counter values read before / after the measured section
//   ttotal_rdtsc       : ticks of the current run
//   ttbest_rdtsc       : smallest run so far (starts at a huge sentinel)
//   elapsed            : (do_while - elapsed_rdtsc) at the moment the best run was recorded
//   elapsed_rdtsc      : countdown of remaining runs, starts at do_while
//   overal_time, ttime : tick budget and ticks accumulated so far
long long t1_rdtsc, t2_rdtsc, ttotal_rdtsc, ttbest_rdtsc = 99999999999999999, elapsed, elapsed_rdtsc=do_while, overal_time = OVERAL_TIME, ttime=0;
// Opens the measurement loop and reads the start counter. It must be followed
// by end_rdtsc, which supplies the closing `}while(...)`.
#define begin_rdtsc\
                do{\
                    t1_rdtsc=_rdtsc();

// Reads the end counter, updates the best time, adds the run to ttime, and
// repeats while the countdown is non-zero and ttime is below overal_time.
// After the loop it prints the best time, the iteration that produced it and
// the number of runs performed.
// NOTE(review): the countdown is tested with a post-decrement, so when the
// tick budget is not hit the body runs do_while+1 times - confirm this is intended.
// NOTE(review): uses printf, but this header only includes <x86intrin.h>; the
// including file presumably provides <stdio.h>.
#define end_rdtsc\
                    t2_rdtsc=_rdtsc();\
                    ttotal_rdtsc=t2_rdtsc-t1_rdtsc;\
                    if (ttotal_rdtsc<ttbest_rdtsc){\
                        ttbest_rdtsc = ttotal_rdtsc;\
                        elapsed=(do_while-elapsed_rdtsc);}\
                    ttime+=ttotal_rdtsc;\
                }while (elapsed_rdtsc-- && (ttime<overal_time));\
                printf("\nthe best is %lld in %lldth iteration and %lld repetitions\n", ttbest_rdtsc, elapsed, (do_while-elapsed_rdtsc));

如何使用这个方法？嗯，很简单！

// Usage example: the code to be measured goes between begin_rdtsc and
// end_rdtsc, which together form the timing loop and print the result.
int main()
{
    //before the section
    begin_rdtsc
       //put your code here to measure the clocks.
    end_rdtsc
    return 0;
}

要有创意，您可以更改它来衡量程序的加速比等。输出示例为：

the best is 9600 in 384751th iteration and 569179 repetitions

我的测试代码得到的结果是 9600 个时钟周期，最佳记录出现在第 384751 次迭代，我的代码总共被测试了 569179 次。

我已经在 GCC 和 Clang 上测试了它们。

Answer 2

我在第一个答案的基础上做了改进，得到了下面这个方案。不过，我仍然希望找到更好的解决方案，因为准确且影响最小的时间测量非常重要。我把这部分放在头文件中，并在主程序文件中包含它。

//Header file header.h
// rdtsc micro-benchmark helpers: wrap the code to be measured between
// begin_rdtsc and end_rdtsc. The section is run `count` times, every run's
// tick count is stored, and the smallest one is printed.
#include <stdio.h>      // printf
#include <x86intrin.h>  // _rdtsc
#define count 1000 // number of repetitions
#ifndef OVERAL_TIME
#define OVERAL_TIME 999999999 // tick budget; only needed to initialise overal_time
#endif
// t1_rdtsc / t2_rdtsc : counter before / after a run
// ttotal_rdtsc[k]     : ticks of run k, 0 <= k < count
// ttbest_rdtsc        : smallest run; elapsed: 0-based index of that run
long long t1_rdtsc, t2_rdtsc, ttotal_rdtsc[count], ttbest_rdtsc = 99999999999999999, elapsed,  elapsed_rdtsc=count, overal_time = OVERAL_TIME, ttime=0;
int ii=0;
// Starts the measurement loop. ii is reset first so the pair can be used more
// than once in a program without writing past the sample array.
#define begin_rdtsc\
                    ii=0;\
                    do{\
                        t1_rdtsc=_rdtsc();

// Closes the loop after exactly `count` runs (indices 0..count-1), then scans
// the samples for the minimum, remembers its index in `elapsed`, and prints both.
#define end_rdtsc\
                        t2_rdtsc=_rdtsc();\
                        ttotal_rdtsc[ii]=t2_rdtsc-t1_rdtsc;\
                    }while (++ii<count);\
                    ttbest_rdtsc=ttotal_rdtsc[0];\
                    elapsed=0;\
                    for(ii=1; ii<count; ii++){\
                        if (ttotal_rdtsc[ii]<ttbest_rdtsc){\
                            ttbest_rdtsc = ttotal_rdtsc[ii];\
                            elapsed=ii;}}\
                    printf("\nthe best is %lld in %lldth iteration \n", ttbest_rdtsc, elapsed);

//Main program
#include "header.h"
.
.
.
// Usage example: the code to be measured goes between begin_rdtsc and
// end_rdtsc, which together form the timing loop and print the result.
int main()
{
    //before the section
    begin_rdtsc
       //put your code here to measure the clocks.
    end_rdtsc
    return 0;
}

Answer 3

如果您在自动矢量化器上遇到问题并想限制它，只需在 begin_rdtsc 之后添加一个 asm("#something");，它会把 do-while 循环与被测代码分隔开。我刚刚检查过：加上它之后，您发布的代码被矢量化了，而此前自动矢量化器无法对其进行矢量化。我修改了您的宏，您可以使用它……

// Globals and macros of the rdtsc timing pair, storing one sample per run.
// do_while (repetition count) and OVERAL_TIME are not defined in this snippet
// and must be defined before this point.
//   ttotal_rdtsc[k] : ticks of run k, 0 <= k < do_while
//   ttbest_rdtsc    : smallest run; elapsed: 0-based index of that run
long long t1_rdtsc, t2_rdtsc, ttotal_rdtsc[do_while], ttbest_rdtsc = 99999999999999999, elapsed,  elapsed_rdtsc=do_while, overal_time = OVERAL_TIME, ttime=0;
int ii=0;
    // Starts the measurement loop; ii is reset so the pair can be reused.
    // The asm statement contains only an assembler comment; per the
    // accompanying answer it is there to separate the timing loop from the
    // measured code. It is spelled __asm__ so it also compiles in strict ISO
    // modes such as -std=c11, where the plain `asm` keyword is unavailable.
    #define begin_rdtsc\
                    ii=0;\
                    do{\
                        __asm__("#mmmmmmmmmmm");\
                        t1_rdtsc=_rdtsc();

    // Closes the loop after exactly do_while runs (indices 0..do_while-1),
    // then scans the samples for the minimum, remembers its index in
    // `elapsed`, and prints both.
    #define end_rdtsc\
                        t2_rdtsc=_rdtsc();\
                        __asm__("#mmmmmmmmmmm");\
                        ttotal_rdtsc[ii]=t2_rdtsc-t1_rdtsc;\
                    }while (++ii<do_while);\
                    ttbest_rdtsc=ttotal_rdtsc[0];\
                    elapsed=0;\
                    for(ii=1; ii<do_while; ii++){\
                        if (ttotal_rdtsc[ii]<ttbest_rdtsc){\
                            ttbest_rdtsc = ttotal_rdtsc[ii];\
                            elapsed=ii;}}\
                    printf("\nthe best is %lld in %lld iteration\n", ttbest_rdtsc, elapsed);

如何测量 x86 纳秒以下的运行时间？

How to measure the elapsed time below a nanosecond for x86?

c

performance

x86

intrinsics