C++ - Fastor vs. Eigen vs C-tran: Speed differences in tensor contraction

I am currently looking for a C++ library (ideally header-only) for higher-order tensor contractions using Einstein summation notation.

Fastor (https://github.com/romeric/Fastor) seems to be a very good fit, and since Eigen (which I use a lot) also has a tensor module, I put together a small comparison that includes a benchmark of a plain-loop implementation:

#include <Fastor.h>
#include "unsupported/Eigen/CXX11/Tensor"
#include <iostream>
#include <ctime>

int main() {
clock_t tstart, tend; 
 {
    using namespace Fastor;
    Tensor<double,20,50,10> A; 
    Tensor<double,50,10,20,4> B;
    Tensor<double, 4> C;
    Tensor<double, 10, 10, 4> D;

    A.ones();
    B.ones();
    C.zeros();
    D.zeros();
    enum {I,J,K,L,M,N};

    tstart = clock();

    C += einsum<Index<I,J,K>,Index<J,K,I,M>>(A,B);

    tend = clock(); 
    std::cout << "Fastor, C_M = A_IJK * B_JKIM:\t"<< tend - tstart << std::endl;

    tstart = clock();

    D += einsum<Index<I,J,K>,Index<J,M,I,N>>(A,B);

    tend = clock(); 
    std::cout << "Fastor, D_KMN = A_IJ * B_JMIN:\t"<< tend - tstart << std::endl;
 }

 {
     using namespace Eigen;

    TensorFixedSize<double, Sizes<20, 50, 10>> A;
    TensorFixedSize<double, Sizes<50,10, 20, 4>> B;
    TensorFixedSize<double, Sizes<4>> C;
    TensorFixedSize<double, Sizes<10,10,4>> D;

    A.setConstant(1);
    B.setConstant(1);
    C.setConstant(0);
    D.setConstant(0);

     array<IndexPair<int>,3> IJK_JKIM = {
         IndexPair<int>(0, 2),
         IndexPair<int>(1, 0),
         IndexPair<int>(2, 1),
     };

     array<IndexPair<int>,2> IJK_JMIN = {
         IndexPair<int>(0, 2),
         IndexPair<int>(1, 0),
     };

    tstart = clock();
    C += A.contract(  B,  IJK_JKIM) ;
    tend = clock(); 

    std::cout << "Eigen, C_M = A_IJK * B_JKIM:\t"<< tend - tstart << std::endl;

    tstart = clock();
     D += A.contract(  B,  IJK_JMIN) ;
    tend = clock(); 

    std::cout << "Eigen, D_KMN = A_IJ * B_JMIN:\t"<< tend - tstart << std::endl;


 }

 {

     double A [20][50][10]  = {1} ;
     double B [50][10][20][4]  = {1};
     double C [4]  = {};
     double D [10][10][4]  = {};

    for ( int i = 0; i < 20; i++)
        for ( int j = 0; j < 50; j++)
            for ( int k = 0; k < 10; k++)
                A[i][j][k] = 1.0;

    for ( int i = 0; i < 20; i++)
        for ( int j = 0; j < 50; j++)
            for ( int k = 0; k < 10; k++)
                for ( int l = 0; l < 4; l++)
                    B[j][k][i][l] = 1.0;


    tstart = clock();

    for ( int i = 0; i < 20; i++)
        for ( int j = 0; j < 50; j++)
            for ( int k = 0; k < 10; k++)
                for ( int l = 0; l < 4; l++)
                    C[l] += A[i][j][k] * B [j][k][i][l];

    tend = clock(); 
    std::cout << "CTran, C_M = A_IJK * B_JKIM:\t"<< tend - tstart << std::endl;

    tstart = clock();
    for ( int i = 0; i < 20; i++)
        for ( int j = 0; j < 50; j++)
            for ( int k = 0; k < 10; k++)
                for ( int m = 0; m < 10; m++)
                    for ( int n = 0; n < 4; n++)
                        D[k][m][n] += A[i][j][k] * B [j][m][i][n];

    tend = clock(); 

    std::cout << "CTran, D_KMN = A_IJ * B_JMIN:\t"<< tend - tstart << std::endl;

 }

return 0;
}

My output is:

Fastor, C_M = A_IJK * B_JKIM:   206
Fastor, D_KMN = A_IJK * B_JMIN: 2230
Eigen, C_M = A_IJK * B_JKIM:    1286
Eigen, D_KMN = A_IJK * B_JMIN:  898
CTran, C_M = A_IJK * B_JKIM:    2
CTran, D_KMN = A_IJK * B_JMIN:  2

Compiled with g++ 9.1.0 (Arch Linux):
g++ test.cpp -O3 -std=c++17  -I Fastor -isystem eigen-eigen-323c052e1731 -o test

So for example 1 Fastor seems to be much faster than Eigen, but for example 2 it is much slower. However, both libraries are incredibly slow compared to the simple loop implementation! Is there an explanation for these contradictory results? Thanks in advance!

对于您 "simple loop" 基准测试,GCC 可以查看所有计算并完全删除所有计算。在 Godbolt 上查看互动:https://www.godbolt.org/z/-gPPVH.

您的 "simple loop" 片段的程序集如下:

foo():
        push    rbp
        push    rbx
        sub     rsp, 8
        call    clock
        mov     rbp, rax
        call    clock
        mov     edx, 29
        mov     esi, OFFSET FLAT:.LC0
        mov     edi, OFFSET FLAT:_ZSt4cout
        mov     rbx, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        sub     rbx, rbp
        mov     edi, OFFSET FLAT:_ZSt4cout
        mov     rsi, rbx
        call    std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<long>(long)
        mov     rbp, rax
        mov     rax, QWORD PTR [rax]
        mov     rax, QWORD PTR [rax-24]
        mov     rbx, QWORD PTR [rbp+240+rax]
        test    rbx, rbx
        je      .L7
        cmp     BYTE PTR [rbx+56], 0
        je      .L5
        movsx   esi, BYTE PTR [rbx+67]
.L6:
        mov     rdi, rbp
        call    std::basic_ostream<char, std::char_traits<char> >::put(char)
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >::flush()
        call    clock
        mov     rbp, rax
        call    clock
        mov     edx, 30
        mov     esi, OFFSET FLAT:.LC1
        mov     edi, OFFSET FLAT:_ZSt4cout
        mov     rbx, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        mov     rsi, rbx
        mov     edi, OFFSET FLAT:_ZSt4cout
        sub     rsi, rbp
        call    std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<long>(long)
        mov     rbp, rax
        mov     rax, QWORD PTR [rax]
        mov     rax, QWORD PTR [rax-24]
        mov     rbx, QWORD PTR [rbp+240+rax]
        test    rbx, rbx
        je      .L7
        cmp     BYTE PTR [rbx+56], 0
        je      .L8
        movsx   esi, BYTE PTR [rbx+67]
.L9:
        mov     rdi, rbp
        call    std::basic_ostream<char, std::char_traits<char> >::put(char)
        add     rsp, 8
        pop     rbx
        mov     rdi, rax
        pop     rbp
        jmp     std::basic_ostream<char, std::char_traits<char> >::flush()
.L5:
        mov     rdi, rbx
        call    std::ctype<char>::_M_widen_init() const
        mov     rax, QWORD PTR [rbx]
        mov     esi, 10
        mov     rax, QWORD PTR [rax+48]
        cmp     rax, OFFSET FLAT:std::ctype<char>::do_widen(char) const
        je      .L6
        mov     rdi, rbx
        call    rax
        movsx   esi, al
        jmp     .L6
.L8:
        mov     rdi, rbx
        call    std::ctype<char>::_M_widen_init() const
        mov     rax, QWORD PTR [rbx]
        mov     esi, 10
        mov     rax, QWORD PTR [rax+48]
        cmp     rax, OFFSET FLAT:std::ctype<char>::do_widen(char) const
        je      .L9
        mov     rdi, rbx
        call    rax
        movsx   esi, al
        jmp     .L9
.L7:
        call    std::__throw_bad_cast()

As you can see, the compiler has removed the computation entirely.

The biggest factor is that the result of the computation is never used anywhere, which allows the compiler to eliminate it as dead code.

Curiously, neither Clang nor MSVC (see the Godbolt link above) manages to remove the computation completely: Clang eliminates the first contraction but not the second.
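
The usual way to keep such a computation alive is to let the result "escape" through an empty inline-assembly statement, which, as far as I can tell, is essentially what Fastor's unused (used in the code below) does under the hood. A minimal sketch for GCC/Clang; the helper name escape is my own:

// GCC/Clang only: the empty asm block tells the optimizer that the memory
// reachable through p may be read by code it cannot see, so stores into *p
// (and the computation feeding them) can no longer be removed as dead code.
inline void escape(void const* p) {
    asm volatile("" : : "g"(p) : "memory");
}

// Usage after the contraction loop:
//     escape(C);   // C is the result array; the loop now has an observable effect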


This is a common pitfall when micro-benchmarking. Mature benchmarking frameworks such as Google Benchmark mitigate it by letting you mark certain values as non-optimizable, forcing the compiler to actually compute them. I strongly recommend using one of these frameworks.
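
For example, here is a minimal sketch of how the naive loop computing C could be measured with Google Benchmark (the sizes are taken from your code; the benchmark name BM_CTran_C is mine). DoNotOptimize and ClobberMemory make the result observable, so the contraction cannot be dropped:

#include <benchmark/benchmark.h>

static void BM_CTran_C(benchmark::State& state) {
    // Same sizes as in the question; static storage so the ~400 kB of data
    // does not live on the stack.
    static double A[20][50][10], B[50][10][20][4];
    for (int i = 0; i < 20; i++)
        for (int j = 0; j < 50; j++)
            for (int k = 0; k < 10; k++) {
                A[i][j][k] = 1.0;
                for (int l = 0; l < 4; l++)
                    B[j][k][i][l] = 1.0;
            }

    for (auto _ : state) {
        double C[4] = {};
        for (int i = 0; i < 20; i++)
            for (int j = 0; j < 50; j++)
                for (int k = 0; k < 10; k++)
                    for (int l = 0; l < 4; l++)
                        C[l] += A[i][j][k] * B[j][k][i][l];
        benchmark::DoNotOptimize(&C[0]); // the address of C escapes ...
        benchmark::ClobberMemory();      // ... so the stores into C must really happen
    }
}
BENCHMARK(BM_CTran_C);
BENCHMARK_MAIN();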

Alternatively, you can use Fastor's own unused and timeit functions to do proper benchmarking without having to worry about the compiler eliminating your code as dead code.

I would write your benchmark as follows:

#include <Fastor.h>
#include "unsupported/Eigen/CXX11/Tensor"

constexpr int FIFTY =  50;
constexpr int TWETY =  20;
constexpr int TEN =  10;
constexpr int FOUR =  4;

using real_ = double;

real_ A [TWETY][FIFTY][TEN]  = {1} ;
real_ B [FIFTY][TEN][TWETY][FOUR]  = {1};
real_ C [FOUR]  = {};
real_ D [TEN][TEN][FOUR]  = {};


Fastor::Tensor<real_,TWETY,FIFTY,TEN> fA;
Fastor::Tensor<real_,FIFTY,TEN,TWETY,FOUR> fB;

Eigen::TensorFixedSize<real_, Eigen::Sizes<TWETY, FIFTY, TEN>> eA;
Eigen::TensorFixedSize<real_, Eigen::Sizes<FIFTY,TEN, TWETY, FOUR>> eB;
Eigen::TensorFixedSize<real_, Eigen::Sizes<FOUR>> eC;
Eigen::TensorFixedSize<real_, Eigen::Sizes<TEN,TEN,FOUR>> eD;


void CTran_C() {

    for ( int i = 0; i < TWETY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                A[i][j][k] = 1.0;

    for ( int i = 0; i < TWETY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                for ( int l = 0; l < FOUR; l++)
                    B[j][k][i][l] = 1.0;

    for ( int i = 0; i < TWETY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                for ( int l = 0; l < FOUR; l++)
                    C[l] += A[i][j][k] * B[j][k][i][l];

    Fastor::unused(A);
    Fastor::unused(B);
    Fastor::unused(C);
}

void CTran_D() {

    for ( int i = 0; i < TWETY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                A[i][j][k] = 1.0;

    for ( int i = 0; i < TWETY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                for ( int l = 0; l < FOUR; l++)
                    B[j][k][i][l] = 1.0;

    for ( int i = 0; i < TWETY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                for ( int m = 0; m < TEN; m++)
                    for ( int n = 0; n < FOUR; n++)
                        D[k][m][n] += A[i][j][k] * B [j][m][i][n];

    Fastor::unused(A);
    Fastor::unused(B);
    Fastor::unused(D);
}


void Fastor_C() {

    using namespace Fastor;
    enum {I,J,K,L,M,N};

    fA.ones();
    fB.ones();

    auto fC = einsum<Index<I,J,K>,Index<J,K,I,M>>(fA,fB);

    unused(fA);
    unused(fB);
    unused(fC);
}

void Fastor_D() {

    using namespace Fastor;
    enum {I,J,K,L,M,N};

    fA.ones();
    fB.ones();

    auto fD = einsum<Index<I,J,K>,Index<J,M,I,N>>(fA,fB);

    unused(fA);
    unused(fB);
    unused(fD);
}


void Eigen_C() {

    using namespace Eigen;

    eA.setConstant(1);
    eB.setConstant(1);

    array<IndexPair<int>,3> IJK_JKIM = {
        IndexPair<int>(0, 2),
        IndexPair<int>(1, 0),
        IndexPair<int>(2, 1),
    };

    eC = eA.contract(  eB,  IJK_JKIM) ;

    Fastor::unused(eA);
    Fastor::unused(eB);
    Fastor::unused(eC);
}


void Eigen_D() {
    using namespace Eigen;

    eA.setConstant(1);
    eB.setConstant(1);

     array<IndexPair<int>,2> IJK_JMIN = {
         IndexPair<int>(0, 2),
         IndexPair<int>(1, 0),
     };

    eD = eA.contract(  eB,  IJK_JMIN) ;

    Fastor::unused(eA);
    Fastor::unused(eB);
    Fastor::unused(eD);
}


int main() {
    using namespace Fastor;

    print("Time for computing tensor C (CTran, Fastor, Eigen):");
    timeit(CTran_C);
    timeit(Fastor_C);
    timeit(Eigen_C);
    print("Time for computing tensor D (CTran, Fastor, Eigen):");
    timeit(CTran_D);
    timeit(Fastor_D);
    timeit(Eigen_D);

    return 0;
}

Compiling with GCC 9 (g++-9 -O3 -DNDEBUG -mfma), Eigen 3.3.7, and the latest version of Fastor from GitHub, this is what I get:

Time for computing tensor C (CTran, Fastor, Eigen):
32305 runs, average elapsed time is 30.9556 µs. No of CPU cycles 113604
42325 runs, average elapsed time is 23.6269 µs. No of CPU cycles 86643
2585 runs, average elapsed time is 387.003 µs. No of CPU cycles 1429229
Time for computing tensor D (CTran, Fastor, Eigen):
8086 runs, average elapsed time is 123.674 µs. No of CPU cycles 455798
21246 runs, average elapsed time is 47.0713 µs. No of CPU cycles 173044
5890 runs, average elapsed time is 169.793 µs. No of CPU cycles 626651

Note that the compiler auto-vectorizes your CTran code, since it is a straightforward function with for loops, so the generated code is pretty much optimal. As you can see, Fastor outperforms the others in both cases.

However, if you really want to benchmark stack-allocated data, you should reduce the tensor sizes, since these dimensions are quite unrealistic for stack data. If I redefine the tensor sizes as

constexpr int FIFTY =  5;
constexpr int TWETY =  2;
constexpr int TEN =  8;
constexpr int FOUR =  4;

this is what I get:

Time for computing tensor C (CTran, Fastor, Eigen):
5493029 runs, average elapsed time is 182.049 ns. No of CPU cycles 486
5865156 runs, average elapsed time is 170.498 ns. No of CPU cycles 442
301422 runs, average elapsed time is 3.31761 µs. No of CPU cycles 12032
Time for computing tensor D (CTran, Fastor, Eigen):
207912 runs, average elapsed time is 4.80974 µs. No of CPU cycles 17574
2733602 runs, average elapsed time is 365.818 ns. No of CPU cycles 1164
514351 runs, average elapsed time is 1.94421 µs. No of CPU cycles 6977

For computing tensor D, the compiler clearly does a poor job with your CTran code at this size. Note the difference between microseconds (µs) and nanoseconds (ns).