使 C++ Eigen LU 更快(我的测试显示比 GSL 慢 2 倍)
Making C++ Eigen LU faster (my tests show 2x slower than GSL)
我正在将 Eigen 的 LU decomposition/solve 与 GSL 进行比较,发现在 g++/OSX 上使用 -O3 优化,Eigen 的速度大约慢 2 倍。我分离了分解和求解的时间,但发现两者都比它们的 GSL 对应物慢得多。我是在做一些愚蠢的事情,还是 Eigen 在这个用例中表现不佳(例如非常小的系统?)使用 Eigen 3.2.8 和较旧的 GSL 1.15 构建。测试用例非常人为设计,但反映了我正在编写的一些非线性拟合软件的结果——本征在任何地方都比总 LU/solve 操作慢 1.5 到 2 倍。
#define NDEBUG
#include "sys/time.h"
#include "gsl/gsl_linalg.h"
#include <Eigen/LU>
// Ax=b is a 3x3 system for which soln is x=[8,2,3]
//
double avals_col[9] = { 4, 2, 2, 7, 5, 5, 7, 5, 9 };
// col major
double avals_row[9] = { 4, 7, 7, 2, 5, 5, 2, 5, 9 };
// row major
double bvals[9] = { 67, 41, 53 };
//----------- helpers
void print_solution( double *x, int dim, char *which ) {
printf( "%s solve for x:\n", which );
for( int j=0; j<3; j++ ) {
printf( "%g ", x[j] );
}
printf( "\n" );
}
struct timeval tv;
struct timezone tz;
double timeNow() {
gettimeofday( &tv, &tz );
int _mils = tv.tv_usec/1000;
int _secs = tv.tv_sec;
return (double)_secs + ((double)_mils/1000.0);
}
//-----------
void run_gsl( double *A, double *b, double *x, int dim, int reps ) {
gsl_matrix_view gslA;
gsl_vector_view gslB;
gsl_vector_view gslX;
gsl_permutation *gslP;
int sign;
gslA = gsl_matrix_view_array( A, dim, dim );
gslP = gsl_permutation_alloc( dim );
gslB = gsl_vector_view_array( b, dim );
gslX = gsl_vector_view_array( x, dim );
int err;
double t, elapsed;
t = timeNow();
for( int i=0; i<reps; i++ ) {
// gsl overwrites A during decompose, so we must copy the initial A each time.
memcpy( A, avals_row, sizeof(avals_row) );
err = gsl_linalg_LU_decomp( &gslA.matrix, gslP, &sign );
}
elapsed = timeNow() - t;
printf( "GSL decompose (%dx) time = %g\n", reps, elapsed );
t = timeNow();
for( int i=0; i<reps; i++ ) {
err = gsl_linalg_LU_solve( &gslA.matrix, gslP, &gslB.vector, &gslX.vector );
}
elapsed = timeNow() - t;
printf( "GSL solve (%dx) time = %g\n", reps, elapsed );
gsl_permutation_free( gslP );
}
void run_eigen( double *A, double *b, double *x, int dim, int reps ) {
Eigen::PartialPivLU<Eigen::MatrixXd> eigenA_lu;
Eigen::Map< Eigen::Matrix < double, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor > > ma( A, dim, dim );
Eigen::Map<Eigen::MatrixXd> mb( b, dim, 1 );
int err;
double t, elapsed;
t = timeNow();
for( int i=0; i<reps; i++ ) {
// This memcpy is not necessary for Eigen, which does not overwrite A in the
// decompose, but do it so that the time is directly comparable to GSL.
memcpy( A, avals_col, sizeof(avals_col) );
eigenA_lu.compute( ma );
}
elapsed = timeNow() - t;
printf( "Eigen decompose (%dx) time = %g\n", reps, elapsed );
t = timeNow();
Eigen::VectorXd _x;
for( int i=0; i<reps; i++ ) {
_x = eigenA_lu.solve( mb );
}
elapsed = timeNow() - t;
printf( "Eigen solve (%dx) time = %g\n", reps, elapsed );
// copy soln to passed x
for( int i=0; i<dim; i++ ) {
x[i] = _x(i);
}
}
int main() {
// solve a 3x3 system many times
double A[9], b[3], x[3];
int dim = 3;
int reps = 1000000;
memcpy( b, bvals, sizeof(bvals) );
// init b vector, A is copied multiple times in run_gsl/run_eigen
run_eigen( A, b, x, dim, reps );
print_solution( x, dim, "Eigen" );
run_gsl( A, b, x, dim, reps );
print_solution( x, dim, "GSL" );
return 0;
}
这会产生,例如:
Eigen decompose (1000000x) time = 0.242
Eigen solve (1000000x) time = 0.108
Eigen solve for x:
8 2 3
GSL decompose (1000000x) time = 0.049
GSL solve (1000000x) time = 0.075
GSL solve for x:
8 2 3
您的基准测试并不公平,因为您在 Eigen 版本中复制了两次输入矩阵:一次通过 memcpy
手动复制,一次在 PartialPivLU
中进行。您还通过将其声明为 Map<Eigen::Vectord>
让 Eigen 知道 mb 是一个向量。然后我得到 (GCC5,-O3,Eigen3.3):
Eigen decompose (1000000x) time = 0.087
Eigen solve (1000000x) time = 0.036
Eigen solve for x:
8 2 3
GSL decompose (1000000x) time = 0.032
GSL solve (1000000x) time = 0.062
GSL solve for x:
8 2 3
此外,Eigen 的 PartialPivLU 并不是真正为这种极小的矩阵设计的(见下文)。对于 3x3 矩阵,最好显式计算逆(对于最大 4x4 的矩阵,通常可以,但对于更大的矩阵则不然!)。在这种情况下,您必须在编译时修复大小:
Eigen::PartialPivLU<Eigen::Matrix3d> eigenA_lu;
Eigen::Map<Eigen::Matrix3d> ma(avals_col);
Eigen::Map<Eigen::Vector3d> mb(b);
Eigen::Matrix3d inv;
Eigen::Vector3d _x;
double t, elapsed;
t = timeNow();
for( int i=0; i<reps; i++ ) {
inv = ma.inverse();
}
elapsed = timeNow() - t;
printf( "Eigen decompose (%dx) time = %g\n", reps, elapsed );
t = timeNow();
for( int i=0; i<reps; i++ ) {
_x.noalias() = inv * mb;
}
elapsed = timeNow() - t;
printf( "Eigen solve (%dx) time = %g\n", reps, elapsed );
这给了我:
Eigen inverse and solve (1000000x) time = 0.0209999
Eigen solve (1000000x) time = 0.000999928
快多了。
现在,如果我们尝试更大的问题,例如 3000 x 3000,我们会得到超过一个数量级的差异,有利于 Eigen:
Eigen decompose (1x) time = 0.411
GSL decompose (1x) time = 6.073
这通常是针对大型问题允许这种性能的优化,同时也会为非常小的矩阵引入一些开销。
我正在将 Eigen 的 LU decomposition/solve 与 GSL 进行比较,发现在 g++/OSX 上使用 -O3 优化,Eigen 的速度大约慢 2 倍。我分离了分解和求解的时间,但发现两者都比它们的 GSL 对应物慢得多。我是在做一些愚蠢的事情,还是 Eigen 在这个用例中表现不佳(例如非常小的系统?)使用 Eigen 3.2.8 和较旧的 GSL 1.15 构建。测试用例非常人为设计,但反映了我正在编写的一些非线性拟合软件的结果——本征在任何地方都比总 LU/solve 操作慢 1.5 到 2 倍。
#define NDEBUG
#include "sys/time.h"
#include "gsl/gsl_linalg.h"
#include <Eigen/LU>
// Ax=b is a 3x3 system for which soln is x=[8,2,3]
//
double avals_col[9] = { 4, 2, 2, 7, 5, 5, 7, 5, 9 };
// col major
double avals_row[9] = { 4, 7, 7, 2, 5, 5, 2, 5, 9 };
// row major
double bvals[9] = { 67, 41, 53 };
//----------- helpers
void print_solution( double *x, int dim, char *which ) {
printf( "%s solve for x:\n", which );
for( int j=0; j<3; j++ ) {
printf( "%g ", x[j] );
}
printf( "\n" );
}
struct timeval tv;
struct timezone tz;
double timeNow() {
gettimeofday( &tv, &tz );
int _mils = tv.tv_usec/1000;
int _secs = tv.tv_sec;
return (double)_secs + ((double)_mils/1000.0);
}
//-----------
void run_gsl( double *A, double *b, double *x, int dim, int reps ) {
gsl_matrix_view gslA;
gsl_vector_view gslB;
gsl_vector_view gslX;
gsl_permutation *gslP;
int sign;
gslA = gsl_matrix_view_array( A, dim, dim );
gslP = gsl_permutation_alloc( dim );
gslB = gsl_vector_view_array( b, dim );
gslX = gsl_vector_view_array( x, dim );
int err;
double t, elapsed;
t = timeNow();
for( int i=0; i<reps; i++ ) {
// gsl overwrites A during decompose, so we must copy the initial A each time.
memcpy( A, avals_row, sizeof(avals_row) );
err = gsl_linalg_LU_decomp( &gslA.matrix, gslP, &sign );
}
elapsed = timeNow() - t;
printf( "GSL decompose (%dx) time = %g\n", reps, elapsed );
t = timeNow();
for( int i=0; i<reps; i++ ) {
err = gsl_linalg_LU_solve( &gslA.matrix, gslP, &gslB.vector, &gslX.vector );
}
elapsed = timeNow() - t;
printf( "GSL solve (%dx) time = %g\n", reps, elapsed );
gsl_permutation_free( gslP );
}
void run_eigen( double *A, double *b, double *x, int dim, int reps ) {
Eigen::PartialPivLU<Eigen::MatrixXd> eigenA_lu;
Eigen::Map< Eigen::Matrix < double, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor > > ma( A, dim, dim );
Eigen::Map<Eigen::MatrixXd> mb( b, dim, 1 );
int err;
double t, elapsed;
t = timeNow();
for( int i=0; i<reps; i++ ) {
// This memcpy is not necessary for Eigen, which does not overwrite A in the
// decompose, but do it so that the time is directly comparable to GSL.
memcpy( A, avals_col, sizeof(avals_col) );
eigenA_lu.compute( ma );
}
elapsed = timeNow() - t;
printf( "Eigen decompose (%dx) time = %g\n", reps, elapsed );
t = timeNow();
Eigen::VectorXd _x;
for( int i=0; i<reps; i++ ) {
_x = eigenA_lu.solve( mb );
}
elapsed = timeNow() - t;
printf( "Eigen solve (%dx) time = %g\n", reps, elapsed );
// copy soln to passed x
for( int i=0; i<dim; i++ ) {
x[i] = _x(i);
}
}
int main() {
// solve a 3x3 system many times
double A[9], b[3], x[3];
int dim = 3;
int reps = 1000000;
memcpy( b, bvals, sizeof(bvals) );
// init b vector, A is copied multiple times in run_gsl/run_eigen
run_eigen( A, b, x, dim, reps );
print_solution( x, dim, "Eigen" );
run_gsl( A, b, x, dim, reps );
print_solution( x, dim, "GSL" );
return 0;
}
这会产生,例如:
Eigen decompose (1000000x) time = 0.242
Eigen solve (1000000x) time = 0.108
Eigen solve for x:
8 2 3
GSL decompose (1000000x) time = 0.049
GSL solve (1000000x) time = 0.075
GSL solve for x:
8 2 3
您的基准测试并不公平,因为您在 Eigen 版本中复制了两次输入矩阵:一次通过 memcpy
手动复制,一次在 PartialPivLU
中进行。您还通过将其声明为 Map<Eigen::Vectord>
让 Eigen 知道 mb 是一个向量。然后我得到 (GCC5,-O3,Eigen3.3):
Eigen decompose (1000000x) time = 0.087
Eigen solve (1000000x) time = 0.036
Eigen solve for x:
8 2 3
GSL decompose (1000000x) time = 0.032
GSL solve (1000000x) time = 0.062
GSL solve for x:
8 2 3
此外,Eigen 的 PartialPivLU 并不是真正为这种极小的矩阵设计的(见下文)。对于 3x3 矩阵,最好显式计算逆(对于最大 4x4 的矩阵,通常可以,但对于更大的矩阵则不然!)。在这种情况下,您必须在编译时修复大小:
Eigen::PartialPivLU<Eigen::Matrix3d> eigenA_lu;
Eigen::Map<Eigen::Matrix3d> ma(avals_col);
Eigen::Map<Eigen::Vector3d> mb(b);
Eigen::Matrix3d inv;
Eigen::Vector3d _x;
double t, elapsed;
t = timeNow();
for( int i=0; i<reps; i++ ) {
inv = ma.inverse();
}
elapsed = timeNow() - t;
printf( "Eigen decompose (%dx) time = %g\n", reps, elapsed );
t = timeNow();
for( int i=0; i<reps; i++ ) {
_x.noalias() = inv * mb;
}
elapsed = timeNow() - t;
printf( "Eigen solve (%dx) time = %g\n", reps, elapsed );
这给了我:
Eigen inverse and solve (1000000x) time = 0.0209999
Eigen solve (1000000x) time = 0.000999928
快多了。
现在,如果我们尝试更大的问题,例如 3000 x 3000,我们会得到超过一个数量级的差异,有利于 Eigen:
Eigen decompose (1x) time = 0.411
GSL decompose (1x) time = 6.073
这通常是针对大型问题允许这种性能的优化,同时也会为非常小的矩阵引入一些开销。