C++ - Fastor vs. Eigen vs C-tran: Speed differences in tensor contraction
I am currently looking for a C++ library (ideally header-only) for higher-order tensor contraction using Einstein summation notation.
Fastor (https://github.com/romeric/Fastor) seems to be a very good fit, and since Eigen (which I use a lot) also has a tensor module, I put together a small comparison, including a simple loop implementation as a baseline:
#include <Fastor.h>
#include "unsupported/Eigen/CXX11/Tensor"
#include <ctime>

int main() {
    clock_t tstart, tend;
    {
        using namespace Fastor;
        Tensor<double,20,50,10> A;
        Tensor<double,50,10,20,4> B;
        Tensor<double,4> C;
        Tensor<double,10,10,4> D;
        A.ones();
        B.ones();
        C.zeros();
        D.zeros();
        enum {I,J,K,L,M,N};

        tstart = clock();
        C += einsum<Index<I,J,K>,Index<J,K,I,M>>(A,B);
        tend = clock();
        std::cout << "Fastor, C_M = A_IJK * B_JKIM:\t" << tend - tstart << std::endl;

        tstart = clock();
        D += einsum<Index<I,J,K>,Index<J,M,I,N>>(A,B);
        tend = clock();
        std::cout << "Fastor, D_KMN = A_IJK * B_JMIN:\t" << tend - tstart << std::endl;
    }
    {
        using namespace Eigen;
        TensorFixedSize<double, Sizes<20,50,10>> A;
        TensorFixedSize<double, Sizes<50,10,20,4>> B;
        TensorFixedSize<double, Sizes<4>> C;
        TensorFixedSize<double, Sizes<10,10,4>> D;
        A.setConstant(1);
        B.setConstant(1);
        C.setConstant(0);
        D.setConstant(0);
        array<IndexPair<int>,3> IJK_JKIM = {
            IndexPair<int>(0, 2),
            IndexPair<int>(1, 0),
            IndexPair<int>(2, 1),
        };
        array<IndexPair<int>,2> IJK_JMIN = {
            IndexPair<int>(0, 2),
            IndexPair<int>(1, 0),
        };

        tstart = clock();
        C += A.contract(B, IJK_JKIM);
        tend = clock();
        std::cout << "Eigen, C_M = A_IJK * B_JKIM:\t" << tend - tstart << std::endl;

        tstart = clock();
        D += A.contract(B, IJK_JMIN);
        tend = clock();
        std::cout << "Eigen, D_KMN = A_IJK * B_JMIN:\t" << tend - tstart << std::endl;
    }
    {
        double A[20][50][10] = {1};
        double B[50][10][20][4] = {1};
        double C[4] = {};
        double D[10][10][4] = {};
        for (int i = 0; i < 20; i++)
            for (int j = 0; j < 50; j++)
                for (int k = 0; k < 10; k++)
                    A[i][j][k] = 1.0;
        for (int i = 0; i < 20; i++)
            for (int j = 0; j < 50; j++)
                for (int k = 0; k < 10; k++)
                    for (int l = 0; l < 4; l++)
                        B[j][k][i][l] = 1.0;

        tstart = clock();
        for (int i = 0; i < 20; i++)
            for (int j = 0; j < 50; j++)
                for (int k = 0; k < 10; k++)
                    for (int l = 0; l < 4; l++)
                        C[l] += A[i][j][k] * B[j][k][i][l];
        tend = clock();
        std::cout << "CTran, C_M = A_IJK * B_JKIM:\t" << tend - tstart << std::endl;

        tstart = clock();
        for (int i = 0; i < 20; i++)
            for (int j = 0; j < 50; j++)
                for (int k = 0; k < 10; k++)
                    for (int m = 0; m < 10; m++)
                        for (int n = 0; n < 4; n++)
                            D[k][m][n] += A[i][j][k] * B[j][m][i][n];
        tend = clock();
        std::cout << "CTran, D_KMN = A_IJK * B_JMIN:\t" << tend - tstart << std::endl;
    }
    return 0;
}
My output is:
Fastor, C_M = A_IJK * B_JKIM: 206
Fastor, D_KMN = A_IJK * B_JMIN: 2230
Eigen, C_M = A_IJK * B_JKIM: 1286
Eigen, D_KMN = A_IJK * B_JMIN: 898
CTran, C_M = A_IJK * B_JKIM: 2
CTran, D_KMN = A_IJK * B_JMIN: 2
Compiled with g++ 9.1.0 (Arch Linux) using:
g++ test.cpp -O3 -std=c++17 -I Fastor -isystem eigen-eigen-323c052e1731 -o test
So for example 1, Fastor seems to be much faster than Eigen, while for example 2 it is much slower. However, both libraries are incredibly slow compared to the simple loop implementation! Is there an explanation for these contradictory results? Thanks in advance!
For your "simple loop" benchmark, GCC can see through the whole computation and removes it completely. See it interactively on Godbolt: https://www.godbolt.org/z/-gPPVH.
The assembly generated for your "simple loop" snippet is the following:
foo():
push rbp
push rbx
sub rsp, 8
call clock
mov rbp, rax
call clock
mov edx, 29
mov esi, OFFSET FLAT:.LC0
mov edi, OFFSET FLAT:_ZSt4cout
mov rbx, rax
call std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
sub rbx, rbp
mov edi, OFFSET FLAT:_ZSt4cout
mov rsi, rbx
call std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<long>(long)
mov rbp, rax
mov rax, QWORD PTR [rax]
mov rax, QWORD PTR [rax-24]
mov rbx, QWORD PTR [rbp+240+rax]
test rbx, rbx
je .L7
cmp BYTE PTR [rbx+56], 0
je .L5
movsx esi, BYTE PTR [rbx+67]
.L6:
mov rdi, rbp
call std::basic_ostream<char, std::char_traits<char> >::put(char)
mov rdi, rax
call std::basic_ostream<char, std::char_traits<char> >::flush()
call clock
mov rbp, rax
call clock
mov edx, 30
mov esi, OFFSET FLAT:.LC1
mov edi, OFFSET FLAT:_ZSt4cout
mov rbx, rax
call std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
mov rsi, rbx
mov edi, OFFSET FLAT:_ZSt4cout
sub rsi, rbp
call std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<long>(long)
mov rbp, rax
mov rax, QWORD PTR [rax]
mov rax, QWORD PTR [rax-24]
mov rbx, QWORD PTR [rbp+240+rax]
test rbx, rbx
je .L7
cmp BYTE PTR [rbx+56], 0
je .L8
movsx esi, BYTE PTR [rbx+67]
.L9:
mov rdi, rbp
call std::basic_ostream<char, std::char_traits<char> >::put(char)
add rsp, 8
pop rbx
mov rdi, rax
pop rbp
jmp std::basic_ostream<char, std::char_traits<char> >::flush()
.L5:
mov rdi, rbx
call std::ctype<char>::_M_widen_init() const
mov rax, QWORD PTR [rbx]
mov esi, 10
mov rax, QWORD PTR [rax+48]
cmp rax, OFFSET FLAT:std::ctype<char>::do_widen(char) const
je .L6
mov rdi, rbx
call rax
movsx esi, al
jmp .L6
.L8:
mov rdi, rbx
call std::ctype<char>::_M_widen_init() const
mov rax, QWORD PTR [rbx]
mov esi, 10
mov rax, QWORD PTR [rax+48]
cmp rax, OFFSET FLAT:std::ctype<char>::do_widen(char) const
je .L9
mov rdi, rbx
call rax
movsx esi, al
jmp .L9
.L7:
call std::__throw_bad_cast()
As you can see, the compiler has removed the computations completely.
The biggest contributing factor is that the results of the computations are never used anywhere. This allows the compiler to eliminate them as dead code.
Curiously, neither Clang nor MSVC (see the Godbolt link above) manages to remove the computations entirely: Clang eliminates the first contraction, but not the second one.
This is a common pitfall when writing microbenchmarks. Mature benchmarking frameworks such as Google Benchmark mitigate it by letting you mark certain values as non-optimizable, forcing the compiler to actually compute them. I strongly recommend using one of these frameworks.
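For illustration, here is a minimal sketch (not part of the original code, and assuming Google Benchmark is installed and linked with -lbenchmark) of how the simple-loop contraction for C could be wrapped so its result cannot be discarded:

#include <benchmark/benchmark.h>

static void BM_CTran_C(benchmark::State& state) {
    // Hypothetical wrapper around the "simple loop" contraction from the question.
    static double A[20][50][10];
    static double B[50][10][20][4];
    for (int i = 0; i < 20; i++)
        for (int j = 0; j < 50; j++)
            for (int k = 0; k < 10; k++) {
                A[i][j][k] = 1.0;
                for (int l = 0; l < 4; l++)
                    B[j][k][i][l] = 1.0;
            }
    for (auto _ : state) {
        double C[4] = {};
        for (int i = 0; i < 20; i++)
            for (int j = 0; j < 50; j++)
                for (int k = 0; k < 10; k++)
                    for (int l = 0; l < 4; l++)
                        C[l] += A[i][j][k] * B[j][k][i][l];
        // Mark the result as observed so the contraction cannot be
        // eliminated as dead code.
        double sum = C[0] + C[1] + C[2] + C[3];
        benchmark::DoNotOptimize(sum);
    }
}
BENCHMARK(BM_CTran_C);
BENCHMARK_MAIN();

benchmark::DoNotOptimize forces the summed result to be treated as used, so the loop nest has to be executed on every iteration and the reported time per iteration is meaningful.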
Alternatively, you can use Fastor's own unused and timeit functions to benchmark properly, without having to worry about the compiler eliminating your dead code.
I would write your benchmark as follows:
#include <Fastor.h>
#include "unsupported/Eigen/CXX11/Tensor"

constexpr int FIFTY = 50;
constexpr int TWETY = 20;
constexpr int TEN   = 10;
constexpr int FOUR  = 4;

using real_ = double;

real_ A[TWETY][FIFTY][TEN] = {1};
real_ B[FIFTY][TEN][TWETY][FOUR] = {1};
real_ C[FOUR] = {};
real_ D[TEN][TEN][FOUR] = {};

Fastor::Tensor<real_,TWETY,FIFTY,TEN> fA;
Fastor::Tensor<real_,FIFTY,TEN,TWETY,FOUR> fB;

Eigen::TensorFixedSize<real_, Eigen::Sizes<TWETY,FIFTY,TEN>> eA;
Eigen::TensorFixedSize<real_, Eigen::Sizes<FIFTY,TEN,TWETY,FOUR>> eB;
Eigen::TensorFixedSize<real_, Eigen::Sizes<FOUR>> eC;
Eigen::TensorFixedSize<real_, Eigen::Sizes<TEN,TEN,FOUR>> eD;

void CTran_C() {
    for (int i = 0; i < TWETY; i++)
        for (int j = 0; j < FIFTY; j++)
            for (int k = 0; k < TEN; k++)
                A[i][j][k] = 1.0;
    for (int i = 0; i < TWETY; i++)
        for (int j = 0; j < FIFTY; j++)
            for (int k = 0; k < TEN; k++)
                for (int l = 0; l < FOUR; l++)
                    B[j][k][i][l] = 1.0;
    for (int i = 0; i < TWETY; i++)
        for (int j = 0; j < FIFTY; j++)
            for (int k = 0; k < TEN; k++)
                for (int l = 0; l < FOUR; l++)
                    C[l] += A[i][j][k] * B[j][k][i][l];
    Fastor::unused(A);
    Fastor::unused(B);
    Fastor::unused(C);
}

void CTran_D() {
    for (int i = 0; i < TWETY; i++)
        for (int j = 0; j < FIFTY; j++)
            for (int k = 0; k < TEN; k++)
                A[i][j][k] = 1.0;
    for (int i = 0; i < TWETY; i++)
        for (int j = 0; j < FIFTY; j++)
            for (int k = 0; k < TEN; k++)
                for (int l = 0; l < FOUR; l++)
                    B[j][k][i][l] = 1.0;
    for (int i = 0; i < TWETY; i++)
        for (int j = 0; j < FIFTY; j++)
            for (int k = 0; k < TEN; k++)
                for (int m = 0; m < TEN; m++)
                    for (int n = 0; n < FOUR; n++)
                        D[k][m][n] += A[i][j][k] * B[j][m][i][n];
    Fastor::unused(A);
    Fastor::unused(B);
    Fastor::unused(D);
}

void Fastor_C() {
    using namespace Fastor;
    enum {I,J,K,L,M,N};
    fA.ones();
    fB.ones();
    auto fC = einsum<Index<I,J,K>,Index<J,K,I,M>>(fA,fB);
    unused(fA);
    unused(fB);
    unused(fC);
}

void Fastor_D() {
    using namespace Fastor;
    enum {I,J,K,L,M,N};
    fA.ones();
    fB.ones();
    auto fD = einsum<Index<I,J,K>,Index<J,M,I,N>>(fA,fB);
    unused(fA);
    unused(fB);
    unused(fD);
}

void Eigen_C() {
    using namespace Eigen;
    eA.setConstant(1);
    eB.setConstant(1);
    array<IndexPair<int>,3> IJK_JKIM = {
        IndexPair<int>(0, 2),
        IndexPair<int>(1, 0),
        IndexPair<int>(2, 1),
    };
    eC = eA.contract(eB, IJK_JKIM);
    Fastor::unused(eA);
    Fastor::unused(eB);
    Fastor::unused(eC);
}

void Eigen_D() {
    using namespace Eigen;
    eA.setConstant(1);
    eB.setConstant(1);
    array<IndexPair<int>,2> IJK_JMIN = {
        IndexPair<int>(0, 2),
        IndexPair<int>(1, 0),
    };
    eD = eA.contract(eB, IJK_JMIN);
    Fastor::unused(eA);
    Fastor::unused(eB);
    Fastor::unused(eD);
}

int main() {
    using namespace Fastor;
    print("Time for computing tensor C (CTran, Fastor, Eigen):");
    timeit(CTran_C);
    timeit(Fastor_C);
    timeit(Eigen_C);
    print("Time for computing tensor D (CTran, Fastor, Eigen):");
    timeit(CTran_D);
    timeit(Fastor_D);
    timeit(Eigen_D);
    return 0;
}
Compiling with GCC 9 (g++-9 -O3 -DNDEBUG -mfma), using Eigen 3.3.7 and the latest version of Fastor from GitHub, this is what I get:
Time for computing tensor C (CTran, Fastor, Eigen):
32305 runs, average elapsed time is 30.9556 µs. No of CPU cycles 113604
42325 runs, average elapsed time is 23.6269 µs. No of CPU cycles 86643
2585 runs, average elapsed time is 387.003 µs. No of CPU cycles 1429229
Time for computing tensor D (CTran, Fastor, Eigen):
8086 runs, average elapsed time is 123.674 µs. No of CPU cycles 455798
21246 runs, average elapsed time is 47.0713 µs. No of CPU cycles 173044
5890 runs, average elapsed time is 169.793 µs. No of CPU cycles 626651
Note that the compiler auto-vectorizes your CTran code, since it is a straightforward function with plain for loops, so the generated code is pretty much optimal. Yet, as you can see, Fastor outperforms the others in both cases.
However, if you really want to benchmark stack-allocated data, you should reduce the tensor sizes, since these dimensions are quite unrealistic for stack data. If I redefine the tensor sizes as
constexpr int FIFTY = 5;
constexpr int TWETY = 2;
constexpr int TEN = 8;
constexpr int FOUR = 4;
this is what I get:
Time for computing tensor C (CTran, Fastor, Eigen):
5493029 runs, average elapsed time is 182.049 ns. No of CPU cycles 486
5865156 runs, average elapsed time is 170.498 ns. No of CPU cycles 442
301422 runs, average elapsed time is 3.31761 µs. No of CPU cycles 12032
Time for computing tensor D (CTran, Fastor, Eigen):
207912 runs, average elapsed time is 4.80974 µs. No of CPU cycles 17574
2733602 runs, average elapsed time is 365.818 ns. No of CPU cycles 1164
514351 runs, average elapsed time is 1.94421 µs. No of CPU cycles 6977
It seems that for computing tensor D, the compiler mis-compiles (generates poor code for) the CTran version at this size. Note the difference between microseconds (µs) and nanoseconds (ns).
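If the original, larger dimensions are what you actually care about, heap-allocated tensors are the more realistic setup; a minimal sketch using Eigen's dynamically sized Tensor (same contraction indices as above, for illustration only):

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
    // Dynamically sized (heap-allocated) tensors with the original dimensions.
    Eigen::Tensor<double, 3> A(20, 50, 10);
    Eigen::Tensor<double, 4> B(50, 10, 20, 4);
    A.setConstant(1.0);
    B.setConstant(1.0);

    // Contract A_IJK with B_JKIM over I, J and K, leaving C_M (rank 1).
    Eigen::array<Eigen::IndexPair<int>, 3> IJK_JKIM = {
        Eigen::IndexPair<int>(0, 2),
        Eigen::IndexPair<int>(1, 0),
        Eigen::IndexPair<int>(2, 1),
    };
    Eigen::Tensor<double, 1> C = A.contract(B, IJK_JKIM);
    std::cout << C << std::endl;  // using the result keeps it from being optimized away
    return 0;
}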