一维数组是否比本征动态向量快?

Are 1d arrays faster than Eigen dynamic vectors?

我正在使用大型矩阵(100x100 到 3000x3000)进行一些计算(大量求和和多达 120 次矩阵向量乘法),我正在使用 Eigen 库处理我的向量和矩阵。

我想知道如何加快我的程序。我应该继续使用 Eigen、使用一维数组、使用 std::vector 还是使用其他库?

假设您不想迁移到 GPU 并且想要信任 Eigen 的 benchkmark page, Eigen is pretty fast. You specifically mentioned matrix vector products for which in the range you specified, Eigen is on top. Make sure that OpenMP is enabled as Eigen will take advantage of multiple cores. Likewise with vectorization

我做了一次比较,将 Eigen 与 ViennaCL 进行了比较(都在调试中):

Processing Unit | Mat Size | Exec time |    Calc        | Sparse %
CPU                   10       0,000     dense_mat*mat
GPU                   10       0,010     dense_mat*mat
CPU                  100       0,103     dense_mat*mat
GPU                  100       0,001     dense_mat*mat
CPU                 1000      97,232     dense_mat*mat
GPU                 1000       0,072     dense_mat*mat
CPU                   10       0,000     sparse_mat*mat   0,25
GPU                   10       0,007     sparse_mat*mat   0,25
CPU                   10       0,000     sparse_mat*mat   0,5
GPU                   10       0,000     sparse_mat*mat   0,5
CPU                   10       0,000     sparse_mat*mat   1
GPU                   10       0,000     sparse_mat*mat   1
CPU                  100       0,010     sparse_mat*mat   0,25
GPU                  100       0,001     sparse_mat*mat   0,25
CPU                  100       0,030     sparse_mat*mat   0,5
GPU                  100       0,001     sparse_mat*mat   0,5
CPU                  100       0,106     sparse_mat*mat   1
GPU                  100       0,001     sparse_mat*mat   1
CPU                 1000       7,131     sparse_mat*mat   0,25
GPU                 1000       0,073     sparse_mat*mat   0,25
CPU                 1000      26,628     sparse_mat*mat   0,5
GPU                 1000       0,072     sparse_mat*mat   0,5
CPU                 1000     101,389     sparse_mat*mat   1
GPU                 1000       0,072     sparse_mat*mat   1

使用的代码是:

//disable debug mechanisms to have a fair comparison with ublas:
#ifndef NDEBUG
#define NDEBUG
#endif


//
// include necessary system headers
//
#include <iostream>
#include <fstream>

//
// ublas includes
//
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/io.hpp>

#define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
//
// Eigen includes
//
#include <Eigen/Core>
#include <Eigen/Dense>

// Must be set if you want to use ViennaCL algorithms on ublas objects
#define VIENNACL_WITH_UBLAS 1

//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms on Eigen objects
//
#define VIENNACL_WITH_EIGEN 1
//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms using OPENCL
//
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif

//
// ViennaCL includes
//
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
#include "viennacl/linalg/direct_solve.hpp"
#include "viennacl/linalg/bicgstab.hpp"

// Some helper functions for this tutorial:
#include "examples/tutorial/Random.hpp"
#include "examples/tutorial/vector-io.hpp"

#include "examples/benchmarks/benchmark-utils.hpp"

#define BLAS3_MATRIX_SIZE   700

using namespace boost::numeric;


#ifndef VIENNACL_WITH_OPENCL
struct dummy
{
    std::size_t size() const { return 1; }
};
#endif


int main()
{
    typedef float     ScalarType;

    Timer timer;
    double exec_time;

    //
    // Initialize OpenCL device in the context
    //
    std::cout << std::endl << "--- initialized OpenGL device using ViennaCL ---" << std::endl;
#ifdef VIENNACL_WITH_OPENCL
    std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
#else
    dummy devices;
#endif

#ifdef VIENNACL_WITH_OPENCL
    viennacl::ocl::current_context().switch_device(devices[0]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#endif

    //// Output results file
    std::ofstream resultsFile;
    resultsFile.open("resultsFile.txt");
    resultsFile << "Processing Unit,Mat Size,Exec time" << std::endl;
    std::cout << "Processing Unit,Mat Size,Exec time" << std::endl;


    // Start defining the dense matrices
    size_t points = 230000;
    size_t transRows=4;
    size_t transCols=3;
    // Other alternative: Use Eigen
    Eigen::MatrixXf eigen_A(points, transRows);
    Eigen::MatrixXf eigen_B(transRows, transCols);
    // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    std::vector<ScalarType> stl_A(points * transRows);
    std::vector<ScalarType> stl_B(transRows * transCols);
    // Set up the ViennaCL object
    viennacl::matrix<ScalarType> vcl_A(points, transRows);
    viennacl::matrix<ScalarType> vcl_B(transRows, transCols);
    // Fill dense matrix in normal memory
    for (unsigned int i = 0; i < points; ++i)
    {
        for (unsigned int j = 0; j < transRows; ++j)
        {
            stl_A[i*transRows + j] = random<ScalarType>();
            eigen_A(i,j) = stl_A[i*transRows + j];
        }
    }
    for (unsigned int i = 0; i < transRows; ++i)
    {
        for (unsigned int j = 0; j < transCols; ++j)
        {
            stl_B[i*transCols + j] = random<ScalarType>();
            eigen_B(i,j) = stl_A[i*transCols + j];
        }
    }


    // Perform the matrix*matrix product
    // On CPU
    Eigen::MatrixXf eigen_C(points, transCols);
    timer.start();
    eigen_C = eigen_A * eigen_B;
    exec_time = timer.get();
    resultsFile << "CPU," << points << "," << exec_time << std::endl;
    std::cout << "CPU," << points << "," << exec_time << std::endl;

    // on GPU
    timer.start();
    // Copy to gpu memory
    // Using fastcopy I can get ~500x speed improvement
    viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);
    viennacl::fast_copy(&(stl_B[0]),&(stl_B[0]) + stl_B.size(),vcl_B);

    viennacl::matrix<ScalarType> vcl_C(points, transCols);
    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
    viennacl::backend::finish();

    exec_time = timer.get();
    resultsFile << "GPU," << points << "," << exec_time << std::endl;
    std::cout << "GPU," << points << "," << exec_time << std::endl;

    //// Start defining the dense matrices
    //for(size_t denseSize=10; denseSize<=1000; denseSize=denseSize*10)
    //{
    //  // Other alternative: Use Eigen
    //  Eigen::MatrixXf eigen_A(denseSize, denseSize);
    //  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //  std::vector<ScalarType> stl_A(denseSize * denseSize);
    //  // Set up the ViennaCL object
    //  viennacl::matrix<ScalarType> vcl_A(denseSize, denseSize);
    //  // Fill dense matrix in normal memory
    //  for (unsigned int i = 0; i < denseSize; ++i)
    //  {
    //      for (unsigned int j = 0; j < denseSize; ++j)
    //      {
    //          stl_A[i*denseSize + j] = random<ScalarType>();
    //          eigen_A(i,j) = stl_A[i*denseSize + j];
    //      }
    //  }


    //  // Perform the matrix*matrix product
    //  // On CPU
    //  Eigen::MatrixXf eigen_C(denseSize, denseSize);
    //  timer.start();
    //  eigen_C = eigen_A * eigen_A;
    //  exec_time = timer.get();
    //  resultsFile << "CPU," << denseSize << "," << exec_time << std::endl;
    //  std::cout << "CPU," << denseSize << "," << exec_time << std::endl;

    //  // on GPU
    //  timer.start();
    //  // Copy to gpu memory
    //  // Using fastcopy I can get ~500x speed improvement
    //  viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);

    //  viennacl::matrix<ScalarType> vcl_C(denseSize, denseSize);
    //  vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
    //  viennacl::backend::finish();

    //  exec_time = timer.get();
    //  resultsFile << "GPU," << denseSize << "," << exec_time << std::endl;
    //  std::cout << "GPU," << denseSize << "," << exec_time << std::endl;
    //}

    //// Start defining the sparse matrices
    //for(size_t sparseSize=10; sparseSize<=1000; sparseSize=sparseSize*10)
    //{
    //  for(float sparsePerc=0.25; sparsePerc<=1.0; sparsePerc=2*sparsePerc)
    //  {
    //      // Other alternative: Use Eigen
    //      Eigen::SparseMatrix<float> eigen_A(sparseSize, sparseSize);
    //      // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //      std::vector<ScalarType> stl_A(sparseSize * sparseSize);
    //      // Set up the ViennaCL sparse matrix
    //      viennacl::matrix<ScalarType> vcl_A(sparseSize, sparseSize);
    //      // Fill dense matrix in normal memory
    //      for (size_t i=0; i<sparseSize; ++i)
    //      {
    //          for (size_t j=0; j<sparseSize; ++j)
    //          {
    //              if (((rand()%100)/100.0) <= sparsePerc)
    //              {
    //                  stl_A[i*sparseSize + j] = float(rand());
    //                  eigen_A.insert(i,j) = stl_A[i*sparseSize + j];
    //              }
    //          }
    //      }


    //      // Perform the matrix*matrix product
    //      // On CPU
    //      Eigen::SparseMatrix<float> eigen_C(sparseSize, sparseSize);
    //      timer.start();
    //      eigen_C = (eigen_A * eigen_A).pruned();
    //      exec_time = timer.get();
    //      resultsFile << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //      std::cout << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;

    //      // on GPU
    //      timer.start();
    //      // Copy to gpu memory
    //      // Using fastcopy I can get ~500x speed improvement
    //      viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);

    //      viennacl::matrix<ScalarType> vcl_C(sparseSize, sparseSize);
    //      vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
    //      viennacl::backend::finish();

    //      exec_time = timer.get();
    //      resultsFile << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //      std::cout << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //  }
    //}

    resultsFile.close();


    //
    //  That's it. 
    //

    std::cout << "Press [ENTER] to exit " << std::endl;
    std::cin.get();

    return EXIT_SUCCESS;
}