如何提高与 std::vector 共享内存的数据映射 (Eigen::Map) 矩阵的 GEMM 性能?
How to improve GEMM performance on data-mapped (Eigen::Map) matrices sharing memory with an std::vector?
两个data-mapped matrices (Eigen::Map
) I notice a significant performance difference depending on how the memory was allocated. When using memory coming from a custom allocation, it's almost twice as fast compared to using (also aligned) memory coming from an std::vector
with data allocated also by Eigen::aligned_allocator
相乘时。
最小基准:
#include <Eigen/Core>
#include <Eigen/StdVector>
#include <chrono>
#include <iostream>
using Matrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
using Mapped = Eigen::Map<Matrix, Eigen::Aligned16>;
using aligned_vector = std::vector<float, Eigen::aligned_allocator<float>>;
void measure(const std::string& name, const Mapped& a, const Mapped& b, Mapped& c)
{
using namespace std::chrono;
const auto start_time_ns = high_resolution_clock::now().time_since_epoch().count();
const std::size_t runs = 10;
for (size_t i = 0; i < runs; ++i)
{
c.noalias() = a * b;
}
const auto end_time_ns = high_resolution_clock::now().time_since_epoch().count();
const auto elapsed_ms = (end_time_ns - start_time_ns) / 1000000;
std::cout << name << ": " << elapsed_ms << " ms" << std::endl;
}
int main()
{
unsigned int size_1 = 1;
unsigned int size_2 = 8192;
unsigned int size_3 = 16384;
aligned_vector a_vec(size_1 * size_2);
aligned_vector b_vec(size_2 * size_3);
aligned_vector c_vec(size_1 * size_3);
Mapped a_mapped_vec(a_vec.data(), size_1, size_2);
Mapped b_mapped_vec(b_vec.data(), size_2, size_3);
Mapped c_mapped_vec(c_vec.data(), size_1, size_3);
measure("Mapped vector memory", a_mapped_vec, b_mapped_vec, c_mapped_vec);
Eigen::aligned_allocator<float> allocator;
float* a_mem = allocator.allocate(size_1 * size_2);
float* b_mem = allocator.allocate(size_2 * size_3);
float* c_mem = allocator.allocate(size_1 * size_3);
Mapped a_mapped_mem(a_mem, size_1, size_2);
Mapped b_mapped_mem(b_mem, size_2, size_3);
Mapped c_mapped_mem(c_mem, size_1, size_3);
measure("Mapped custom memory", a_mapped_mem, b_mapped_mem, c_mapped_mem);
allocator.deallocate(a_mem, size_1 * size_2);
allocator.deallocate(b_mem, size_2 * size_3);
allocator.deallocate(c_mem, size_1 * size_3);
}
我机器上的输出(Core i5-6600):
Mapped vector memory: 661 ms
Mapped custom memory: 370 ms
Dockerfile
快速重现效果:
FROM ubuntu:20.04
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update
RUN apt-get install -y build-essential cmake git wget
RUN git clone -b '3.3.7' --single-branch --depth 1 https://github.com/eigenteam/eigen-git-mirror && cd eigen-git-mirror && mkdir -p build && cd build && cmake .. && make && make install && ln -s /usr/local/include/eigen3/Eigen /usr/local/include/Eigen
RUN wget https://gist.githubusercontent.com/Dobiasd/4b80aa0d5d19f8112656794ab94a061b/raw/c9cca8abc16ab35e71070aed5e779c7a8ebb3a7e/main.cpp
RUN g++ -std=c++14 -O3 -march=native main.cpp -o main
ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
RUN ./main
为什么会有这样的差异? (我认为 Eigen 不会知道内存来自哪里。)
对我来说更重要的是,如何提高来自 std::vector
的内存的性能?
正如 and 在评论中指出的那样,手动分配的版本不会初始化内存(与 std::vector
相反),访问它是未定义的行为,因此 MMU 可能会这样做一些聪明的东西,即实际上没有访问内存。
在第二部分也初始化内存时,两个版本表现出相似的性能:
float* a_mem = allocator.allocate(size_1 * size_2);
memset(a_mem, 0, size_1 * size_2 * sizeof(float));
float* b_mem = allocator.allocate(size_2 * size_3);
memset(b_mem, 0, size_2 * size_3 * sizeof(float));
float* c_mem = allocator.allocate(size_1 * size_3);
memset(c_mem, 0, size_1 * size_3 * sizeof(float));
Mapped vector memory: 654 ms
Mapped custom memory: 655 ms
两个data-mapped matrices (Eigen::Map
) I notice a significant performance difference depending on how the memory was allocated. When using memory coming from a custom allocation, it's almost twice as fast compared to using (also aligned) memory coming from an std::vector
with data allocated also by Eigen::aligned_allocator
相乘时。
最小基准:
#include <Eigen/Core>
#include <Eigen/StdVector>
#include <chrono>
#include <iostream>
using Matrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
using Mapped = Eigen::Map<Matrix, Eigen::Aligned16>;
using aligned_vector = std::vector<float, Eigen::aligned_allocator<float>>;
void measure(const std::string& name, const Mapped& a, const Mapped& b, Mapped& c)
{
using namespace std::chrono;
const auto start_time_ns = high_resolution_clock::now().time_since_epoch().count();
const std::size_t runs = 10;
for (size_t i = 0; i < runs; ++i)
{
c.noalias() = a * b;
}
const auto end_time_ns = high_resolution_clock::now().time_since_epoch().count();
const auto elapsed_ms = (end_time_ns - start_time_ns) / 1000000;
std::cout << name << ": " << elapsed_ms << " ms" << std::endl;
}
int main()
{
unsigned int size_1 = 1;
unsigned int size_2 = 8192;
unsigned int size_3 = 16384;
aligned_vector a_vec(size_1 * size_2);
aligned_vector b_vec(size_2 * size_3);
aligned_vector c_vec(size_1 * size_3);
Mapped a_mapped_vec(a_vec.data(), size_1, size_2);
Mapped b_mapped_vec(b_vec.data(), size_2, size_3);
Mapped c_mapped_vec(c_vec.data(), size_1, size_3);
measure("Mapped vector memory", a_mapped_vec, b_mapped_vec, c_mapped_vec);
Eigen::aligned_allocator<float> allocator;
float* a_mem = allocator.allocate(size_1 * size_2);
float* b_mem = allocator.allocate(size_2 * size_3);
float* c_mem = allocator.allocate(size_1 * size_3);
Mapped a_mapped_mem(a_mem, size_1, size_2);
Mapped b_mapped_mem(b_mem, size_2, size_3);
Mapped c_mapped_mem(c_mem, size_1, size_3);
measure("Mapped custom memory", a_mapped_mem, b_mapped_mem, c_mapped_mem);
allocator.deallocate(a_mem, size_1 * size_2);
allocator.deallocate(b_mem, size_2 * size_3);
allocator.deallocate(c_mem, size_1 * size_3);
}
我机器上的输出(Core i5-6600):
Mapped vector memory: 661 ms
Mapped custom memory: 370 ms
Dockerfile
快速重现效果:
FROM ubuntu:20.04
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update
RUN apt-get install -y build-essential cmake git wget
RUN git clone -b '3.3.7' --single-branch --depth 1 https://github.com/eigenteam/eigen-git-mirror && cd eigen-git-mirror && mkdir -p build && cd build && cmake .. && make && make install && ln -s /usr/local/include/eigen3/Eigen /usr/local/include/Eigen
RUN wget https://gist.githubusercontent.com/Dobiasd/4b80aa0d5d19f8112656794ab94a061b/raw/c9cca8abc16ab35e71070aed5e779c7a8ebb3a7e/main.cpp
RUN g++ -std=c++14 -O3 -march=native main.cpp -o main
ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
RUN ./main
为什么会有这样的差异? (我认为 Eigen 不会知道内存来自哪里。)
对我来说更重要的是,如何提高来自 std::vector
的内存的性能?
正如 std::vector
相反),访问它是未定义的行为,因此 MMU 可能会这样做一些聪明的东西,即实际上没有访问内存。
在第二部分也初始化内存时,两个版本表现出相似的性能:
float* a_mem = allocator.allocate(size_1 * size_2);
memset(a_mem, 0, size_1 * size_2 * sizeof(float));
float* b_mem = allocator.allocate(size_2 * size_3);
memset(b_mem, 0, size_2 * size_3 * sizeof(float));
float* c_mem = allocator.allocate(size_1 * size_3);
memset(c_mem, 0, size_1 * size_3 * sizeof(float));
Mapped vector memory: 654 ms
Mapped custom memory: 655 ms