How to write a matrix-matrix product that can compete with Eigen?
Below is a C++ program that compares the time taken by Eigen and by hand-written for loops to perform a matrix-matrix product. The for loops have been optimized to minimize cache misses. The for loops are faster than Eigen at first, but eventually become slower (up to a factor of 2 for 500 x 500 matrices). What else should I do to compete with Eigen? Is blocking the reason for Eigen's better performance? If so, how should I go about adding blocking to the for loops?
#include <iostream>
#include <Eigen/Dense>
#include <cstdlib>
#include <ctime>
#include <vector>

int main(int argc, char* argv[]) {
    srand(time(NULL));
    // Input the size of the matrix from the user
    int N = atoi(argv[1]);
    int M = N*N;
    // The matrices stored in row-major order (std::vector rather than a
    // variable-length array, which is not standard C++ and overflows the
    // stack for large N)
    std::vector<double> a(M);
    std::vector<double> b(M);
    std::vector<double> c(M);
    // Initializing the Eigen matrices
    Eigen::MatrixXd a_E = Eigen::MatrixXd::Random(N,N);
    Eigen::MatrixXd b_E = Eigen::MatrixXd::Random(N,N);
    Eigen::MatrixXd c_E(N,N);
    double CPS = CLOCKS_PER_SEC;
    clock_t start, end;
    // Matrix-matrix product by Eigen
    start = clock();
    c_E = a_E*b_E;
    end = clock();
    std::cout << "\nTime taken by Eigen is: " << (end-start)/CPS << "\n";
    // Copying the Eigen matrices into the plain arrays
    int count = 0;
    for (int j=0; j<N; ++j) {
        for (int k=0; k<N; ++k) {
            a[count] = a_E(j,k);
            b[count] = b_E(j,k);
            ++count;
        }
    }
    // Matrix-matrix product by for loops: the l=0 term initializes c,
    // then rank-1 updates accumulate the remaining terms
    start = clock();
    count = 0;
    int count1, count2;
    for (int j=0; j<N; ++j) {
        count1 = j*N;
        for (int k=0; k<N; ++k) {
            c[count] = a[count1]*b[k];
            ++count;
        }
    }
    for (int j=0; j<N; ++j) {
        count2 = N;
        for (int l=1; l<N; ++l) {
            count = j*N;
            count1 = count+l;
            for (int k=0; k<N; ++k) {
                c[count] += a[count1]*b[count2];
                ++count;
                ++count2;
            }
        }
    }
    end = clock();
    std::cout << "\nTime taken by for-loop is: " << (end-start)/CPS << "\n";
}
I may suggest two simple optimizations (a combined sketch follows below).
1) Vectorize it. It is best done with inline assembly or a hand-written assembly routine, but you can also use compiler intrinsics. You can even let the compiler vectorize the loops, although it is sometimes hard to write loops in a form the compiler will actually vectorize.
2) Make it parallel. Try OpenMP.
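To make both points concrete, here is a minimal sketch of mine (not the answerer's code; the name myprod_omp is made up for illustration). It restructures the OP's two loop nests so that each row of c is computed independently: OpenMP distributes the rows across threads, and the unit-stride inner loop is a good candidate for compiler auto-vectorization at -O3:

// Sketch only: the OP's loops merged per row and parallelized with OpenMP.
// Compile with e.g. g++ -O3 -fopenmp; without -fopenmp the pragma is ignored.
void myprod_omp(double* c, const double* a, const double* b, int N) {
    #pragma omp parallel for
    for (int j=0; j<N; ++j) {              // rows of c are independent
        for (int k=0; k<N; ++k)
            c[j*N+k] = a[j*N]*b[k];        // l=0 term, as in the original
        for (int l=1; l<N; ++l) {
            const double ajl = a[j*N+l];   // scalar reused along the row
            for (int k=0; k<N; ++k)        // unit stride in b and c: auto-vectorizable
                c[j*N+k] += ajl*b[l*N+k];
        }
    }
}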
The compiler already vectorizes your code quite well. The key to higher performance is hierarchical blocking, to optimize the use of the registers and of the different levels of the caches. Partial loop unrolling is also crucial to improve instruction pipelining. Reaching the performance of Eigen's product requires a lot of effort and tuning.
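To make "blocking" concrete, here is an illustrative one-level cache-blocking sketch of mine (not part of this answer; KB and NB are made-up tile sizes that would need tuning). The idea is that a KB x NB tile of b stays resident in cache while every row of c is updated against it; c is assumed to be zero-initialized:

#include <algorithm>  // std::min

// c += a*b for row-major N x N matrices, tiled over the depth (l) and
// column (k) dimensions so the current b tile stays hot in cache.
void gemm_blocked(double* c, const double* a, const double* b, int N) {
    const int KB = 256, NB = 128;  // illustrative tile sizes (tuning parameters)
    for (int l0=0; l0<N; l0+=KB)
        for (int k0=0; k0<N; k0+=NB)
            for (int j=0; j<N; ++j)
                for (int l=l0; l<std::min(l0+KB, N); ++l) {
                    const double ajl = a[j*N+l];
                    for (int k=k0; k<std::min(k0+NB, N); ++k)
                        c[j*N+k] += ajl*b[l*N+k];
                }
}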
It should also be noted that your benchmark is slightly biased and not fully reliable. Here is a more reliable version (you need the full Eigen sources to get bench/BenchTimer.h):
#include <iostream>
#include <Eigen/Dense>
#include <bench/BenchTimer.h>
#include <cstdlib>  // atoi

void myprod(double *c, const double* a, const double* b, int N) {
    int count = 0;
    int count1, count2;
    for (int j=0; j<N; ++j) {
        count1 = j*N;
        for (int k=0; k<N; ++k) {
            c[count] = a[count1]*b[k];
            ++count;
        }
    }
    for (int j=0; j<N; ++j) {
        count2 = N;
        for (int l=1; l<N; ++l) {
            count = j*N;
            count1 = count+l;
            for (int k=0; k<N; ++k) {
                c[count] += a[count1]*b[count2];
                ++count;
                ++count2;
            }
        }
    }
}

int main(int argc, char* argv[]) {
    int N = atoi(argv[1]);
    int tries = 4;
    int rep = std::max<int>(1, 10000000/N/N/N);

    Eigen::MatrixXd a_E = Eigen::MatrixXd::Random(N,N);
    Eigen::MatrixXd b_E = Eigen::MatrixXd::Random(N,N);
    Eigen::MatrixXd c_E(N,N);

    Eigen::BenchTimer t1, t2;
    BENCH(t1, tries, rep, c_E.noalias() = a_E*b_E);
    BENCH(t2, tries, rep, myprod(c_E.data(), a_E.data(), b_E.data(), N));

    std::cout << "\nTime taken by Eigen is: " << t1.best() << "\n";
    std::cout << "\nTime taken by for-loop is: " << t2.best() << "\n";
}
Compiling with Eigen 3.3-beta1 and FMA enabled (-mfma), the gap becomes even larger, almost one order of magnitude at N=2000.
There is no need to keep it a mystery how a high-performance implementation of the matrix-matrix product can be achieved. On the contrary, we need more people who understand it, to meet the challenges of future high-performance computing. A good starting point for getting into the topic is reading BLIS: A Framework for Rapidly Instantiating BLAS Functionality.
So, to demystify things and answer the question (how to write a matrix-matrix product that can compete with Eigen), I extended the code ggael posted to a total of about 400 lines. I just tested it on an AVX machine (Intel(R) Core(TM) i5-3470 CPU @ 3.20GHz). Here are some results:
g++-5.3 -O3 -DNDEBUG -std=c++11 -mavx -m64 -I ../eigen.3.2.8/ gemm.cc -lrt
lehn@heim:~/work/test_eigen$ ./a.out 500
Time taken by Eigen is: 0.0190425
Time taken by for-loop is: 0.0121688
lehn@heim:~/work/test_eigen$ ./a.out 1000
Time taken by Eigen is: 0.147991
Time taken by for-loop is: 0.0959097
lehn@heim:~/work/test_eigen$ ./a.out 1500
Time taken by Eigen is: 0.492858
Time taken by for-loop is: 0.322442
lehn@heim:~/work/test_eigen$ ./a.out 5000
Time taken by Eigen is: 18.3666
Time taken by for-loop is: 12.1023
If you have FMA, you can compile with:
g++-5.3 -O3 -DNDEBUG -std=c++11 -mfma -m64 -I ../eigen.3.2.8/ -DHAVE_FMA gemm.cc -lrt
If you also want OpenMP multithreading, additionally compile with -fopenmp.
Here is the complete code, based on the ideas of the BLIS paper. It is self-contained, except that (as ggael already pointed out) it needs the full Eigen sources for bench/BenchTimer.h:
#include <iostream>
#include <Eigen/Dense>
#include <bench/BenchTimer.h>
#include <algorithm>     // std::max
#include <cstdint>       // uintptr_t
#include <cstdlib>       // std::malloc, std::free, atoi
#include <type_traits>   // std::enable_if, std::common_type
#if defined(_OPENMP)
#include <omp.h>
#endif
//-- malloc with alignment --------------------------------------------------------
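// Over-allocate by `alignment` bytes, round the pointer up to the next
// multiple of `alignment` (a power of two), and stash the pointer returned
// by std::malloc in the slot just below the aligned address so that free_()
// can recover and release it.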
void *
malloc_(std::size_t alignment, std::size_t size)
{
    alignment = std::max(alignment, alignof(void *));
    size += alignment;

    void *ptr  = std::malloc(size);
    void *ptr2 = (void *)(((uintptr_t)ptr + alignment) & ~(alignment-1));
    void **vp  = (void**) ptr2 - 1;
    *vp = ptr;
    return ptr2;
}

void
free_(void *ptr)
{
    std::free(*((void**)ptr-1));
}
//-- Config --------------------------------------------------------------------
// SIMD-Register width in bits
// SSE: 128
// AVX/FMA: 256
// AVX-512: 512
#ifndef SIMD_REGISTER_WIDTH
#define SIMD_REGISTER_WIDTH 256
#endif
#ifdef HAVE_FMA
# ifndef BS_D_MR
# define BS_D_MR 4
# endif
# ifndef BS_D_NR
# define BS_D_NR 12
# endif
# ifndef BS_D_MC
# define BS_D_MC 256
# endif
# ifndef BS_D_KC
# define BS_D_KC 512
# endif
# ifndef BS_D_NC
# define BS_D_NC 4092
# endif
#endif
#ifndef BS_D_MR
#define BS_D_MR 4
#endif
#ifndef BS_D_NR
#define BS_D_NR 8
#endif
#ifndef BS_D_MC
#define BS_D_MC 256
#endif
#ifndef BS_D_KC
#define BS_D_KC 256
#endif
#ifndef BS_D_NC
#define BS_D_NC 4096
#endif
template <typename T>
struct BlockSize
{
    static constexpr int MC     = 64;
    static constexpr int KC     = 64;
    static constexpr int NC     = 256;
    static constexpr int MR     = 8;
    static constexpr int NR     = 8;
    static constexpr int rwidth = 0;
    static constexpr int align  = alignof(T);
    static constexpr int vlen   = 0;

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};

template <>
struct BlockSize<double>
{
    static constexpr int MC     = BS_D_MC;
    static constexpr int KC     = BS_D_KC;
    static constexpr int NC     = BS_D_NC;
    static constexpr int MR     = BS_D_MR;
    static constexpr int NR     = BS_D_NR;
    static constexpr int rwidth = SIMD_REGISTER_WIDTH;
    static constexpr int align  = rwidth / 8;
    static constexpr int vlen   = rwidth / (8*sizeof(double));

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
    static_assert(rwidth % sizeof(double) == 0, "SIMD register width not sane.");
};
//-- aux routines --------------------------------------------------------------
template <typename Index, typename Alpha, typename TX, typename TY>
void
geaxpy(Index m, Index n,
       const Alpha &alpha,
       const TX *X, Index incRowX, Index incColX,
       TY *Y, Index incRowY, Index incColY)
{
    // Y += alpha*X for general m x n matrices with arbitrary strides
    for (Index j=0; j<n; ++j) {
        for (Index i=0; i<m; ++i) {
            Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
        }
    }
}

template <typename Index, typename Alpha, typename TX>
void
gescal(Index m, Index n,
       const Alpha &alpha,
       TX *X, Index incRowX, Index incColX)
{
    if (alpha!=Alpha(0)) {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] *= alpha;
            }
        }
    } else {
        // with alpha==0 the old content of X must not be read (NaN-safe)
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] = Alpha(0);
            }
        }
    }
}
//-- Micro Kernel --------------------------------------------------------------
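// Computes C <- beta*C + alpha*A*B for a single MR x NR tile, where A and B
// point into the packed buffers produced by pack_A/pack_B. The accumulator P
// lives in SIMD registers via the GCC/Clang vector extension `vx`; each pass
// of the l-loop is one rank-1 update of the tile.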
template <typename Index, typename T>
typename std::enable_if<BlockSize<T>::vlen != 0,
                        void>::type
ugemm(Index kc, T alpha, const T *A, const T *B, T beta,
      T *C, Index incRowC, Index incColC)
{
    typedef T vx __attribute__((vector_size (BlockSize<T>::rwidth/8)));

    static constexpr Index vlen = BlockSize<T>::vlen;
    static constexpr Index MR   = BlockSize<T>::MR;
    static constexpr Index NR   = BlockSize<T>::NR/vlen;   // NR counted in SIMD vectors

    A = (const T*) __builtin_assume_aligned (A, BlockSize<T>::align);
    B = (const T*) __builtin_assume_aligned (B, BlockSize<T>::align);

    vx P[MR*NR] = {};
    for (Index l=0; l<kc; ++l) {
        const vx *b = (const vx *)B;
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR+j] += A[i]*b[j];
            }
        }
        A += MR;
        B += vlen*NR;
    }

    if (alpha!=T(1)) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR+j] *= alpha;
            }
        }
    }

    if (beta!=T(0)) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                const T *p = (const T *) &P[i*NR+j];
                for (Index j1=0; j1<vlen; ++j1) {
                    C[i*incRowC+(j*vlen+j1)*incColC] *= beta;
                    C[i*incRowC+(j*vlen+j1)*incColC] += p[j1];
                }
            }
        }
    } else {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                const T *p = (const T *) &P[i*NR+j];
                for (Index j1=0; j1<vlen; ++j1) {
                    C[i*incRowC+(j*vlen+j1)*incColC] = p[j1];
                }
            }
        }
    }
}
//-- Macro Kernel --------------------------------------------------------------
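// Walks over the MR x NR tiles of a packed MC x KC block of A and a packed
// KC x NC block of B. Full tiles are written straight into C by the micro
// kernel; fringe tiles are computed into a local buffer C_ first and then
// merged via gescal/geaxpy. The loop over column panels is the one that is
// parallelized with OpenMP.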
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc,
      T alpha,
      const T *A, const T *B,
      Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR  = BlockSize<T>::MR;
    const Index NR  = BlockSize<T>::NR;
    const Index mp  = (mc+MR-1) / MR;
    const Index np  = (nc+NR-1) / NR;
    const Index mr_ = mc % MR;
    const Index nr_ = nc % NR;

    #pragma omp parallel for
    for (Index j=0; j<np; ++j) {
        // fringe-tile buffer, declared inside the loop so that each OpenMP
        // thread gets its own copy (a single shared buffer would race)
        T C_[MR*NR];
        const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
        for (Index i=0; i<mp; ++i) {
            const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
            if (mr==MR && nr==NR) {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      beta,
                      &C[i*MR*incRowC+j*NR*incColC],
                      incRowC, incColC);
            } else {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
            }
        }
    }
}
//-- Packing blocks ------------------------------------------------------------
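// pack_A copies an mc x kc block of A into panels of MR rows each, stored
// column-by-column; pack_B copies a kc x nc block of B into panels of NR
// columns each, stored row-by-row. Both zero-pad the fringe, so the micro
// kernel can always stream through full panels with unit stride.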
template <typename Index, typename TA, typename T>
void
pack_A(Index mc, Index kc,
       const TA *A, Index incRowA, Index incColA,
       T *p)
{
    Index MR = BlockSize<T>::MR;
    Index mp = (mc+MR-1) / MR;

    for (Index j=0; j<kc; ++j) {
        for (Index l=0; l<mp; ++l) {
            for (Index i0=0; i0<MR; ++i0) {
                Index i  = l*MR + i0;
                Index nu = l*MR*kc + j*MR + i0;
                p[nu]    = (i<mc) ? A[i*incRowA+j*incColA]
                                  : T(0);
            }
        }
    }
}

template <typename Index, typename TB, typename T>
void
pack_B(Index kc, Index nc,
       const TB *B, Index incRowB, Index incColB,
       T *p)
{
    Index NR = BlockSize<T>::NR;
    Index np = (nc+NR-1) / NR;

    for (Index l=0; l<np; ++l) {
        for (Index j0=0; j0<NR; ++j0) {
            for (Index i=0; i<kc; ++i) {
                Index j  = l*NR+j0;
                Index nu = l*NR*kc + i*NR + j0;
                p[nu]    = (j<nc) ? B[i*incRowB+j*incColB]
                                  : T(0);
            }
        }
    }
}
//-- Frame routine -------------------------------------------------------------
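// Frame routine implementing the outer three loops of the BLIS layering:
// j over NC-wide column panels of B and C, l over KC-deep panels (B is
// packed here), i over MC-tall panels of A (A is packed here). The macro
// and micro kernels supply the remaining levels of blocking.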
template <typename Index, typename Alpha,
          typename TA, typename TB,
          typename Beta,
          typename TC>
void
gemm(Index m, Index n, Index k,
     Alpha alpha,
     const TA *A, Index incRowA, Index incColA,
     const TB *B, Index incRowB, Index incColB,
     Beta beta,
     TC *C, Index incRowC, Index incColC)
{
    typedef typename std::common_type<Alpha, TA, TB>::type  T;

    const Index MC = BlockSize<T>::MC;
    const Index NC = BlockSize<T>::NC;
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    const Index KC = BlockSize<T>::KC;
    const Index mb = (m+MC-1) / MC;
    const Index nb = (n+NC-1) / NC;
    const Index kb = (k+KC-1) / KC;
    const Index mc_ = m % MC;
    const Index nc_ = n % NC;
    const Index kc_ = k % KC;

    // quick return, checked before allocating so nothing leaks
    if (alpha==Alpha(0) || k==0) {
        gescal(m, n, beta, C, incRowC, incColC);
        return;
    }

    T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*KC+MR));
    T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(KC*NC+NR));

    for (Index j=0; j<nb; ++j) {
        Index nc = (j!=nb-1 || nc_==0) ? NC : nc_;
        for (Index l=0; l<kb; ++l) {
            Index kc    = (l!=kb-1 || kc_==0) ? KC : kc_;
            Beta  beta_ = (l==0) ? beta : Beta(1);   // C is scaled by beta only once
            pack_B(kc, nc,
                   &B[l*KC*incRowB+j*NC*incColB],
                   incRowB, incColB,
                   B_);
            for (Index i=0; i<mb; ++i) {
                Index mc = (i!=mb-1 || mc_==0) ? MC : mc_;
                pack_A(mc, kc,
                       &A[i*MC*incRowA+l*KC*incColA],
                       incRowA, incColA,
                       A_);
                mgemm(mc, nc, kc,
                      T(alpha), A_, B_, beta_,
                      &C[i*MC*incRowC+j*NC*incColC],
                      incRowC, incColC);
            }
        }
    }
    free_(A_);
    free_(B_);
}
//------------------------------------------------------------------------------
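// Eigen's MatrixXd is column-major by default: element (i,j) lives at
// data()[i + j*N], hence row stride 1 and column stride N below.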
void myprod(double *c, const double* a, const double* b, int N) {
    gemm(N, N, N, 1.0, a, 1, N, b, 1, N, 0.0, c, 1, N);
}

int main(int argc, char* argv[]) {
    int N = atoi(argv[1]);
    int tries = 4;
    int rep = std::max<int>(1, 10000000/N/N/N);

    Eigen::MatrixXd a_E = Eigen::MatrixXd::Random(N,N);
    Eigen::MatrixXd b_E = Eigen::MatrixXd::Random(N,N);
    Eigen::MatrixXd c_E(N,N);

    Eigen::BenchTimer t1, t2;
    BENCH(t1, tries, rep, c_E.noalias() = a_E*b_E );
    BENCH(t2, tries, rep, myprod(c_E.data(), a_E.data(), b_E.data(), N));

    std::cout << "Time taken by Eigen is: " << t1.best() << "\n";
    std::cout << "Time taken by for-loop is: " << t2.best() << "\n\n";
}