The optimal way to split Eigen MatrixXd into fixed-size batches with randomly shuffled rows
I have input and target data represented as a MatrixXd (N x M) and a VectorXd (N). The goal is to create mini-batches of size K containing subsets of the input and target data, shuffled in the same way. An ML model will then loop over these mini-batches. Could you recommend how to achieve this with as little copying as possible (ideally with a code example)?
This is my attempt at implementing such batching:
#include <algorithm>
#include <cmath>
#include <numeric>
#include <random>
#include <vector>
#include <Eigen/Dense>

using Eigen::MatrixXd;
using Eigen::Ref;
using Eigen::VectorXd;

struct Batch {
    const Ref<const MatrixXd> input;
    const Ref<const VectorXd> target;
};

std::vector<Batch> generate_batches(const Ref<const MatrixXd> input, const Ref<const VectorXd> target, unsigned batch_size)
{
    unsigned num_samples = input.rows();
    unsigned num_batches = std::ceil(num_samples / (float)batch_size);

    static std::default_random_engine engine;
    std::vector<unsigned> idxs(num_samples);
    std::iota(idxs.begin(), idxs.end(), 0);
    std::shuffle(idxs.begin(), idxs.end(), engine);

    std::vector<Batch> batches;
    batches.reserve(num_batches);
    for (unsigned idx = 0; idx < num_batches; ++idx) {
        unsigned start = idx * batch_size;
        unsigned end = std::min(start + batch_size, num_samples);
        std::vector<unsigned> batch_idxs(idxs.begin() + start, idxs.begin() + end);
        batches.push_back({ input(batch_idxs, Eigen::all), target(batch_idxs) });
    }
    return batches;
}
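For context, this is roughly how I plan to consume these batches; the training call is just a placeholder:

// Hypothetical driver for the code above; train_step stands in for the actual model update.
int main()
{
    MatrixXd input = MatrixXd::Random(1000, 20);  // N x M
    VectorXd target = VectorXd::Random(1000);     // N
    for (int epoch = 0; epoch < 10; ++epoch) {
        for (const Batch& batch : generate_batches(input, target, 32)) {
            // train_step(batch.input, batch.target);
        }
    }
}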
Eigen comes with a Transpositions type that can do exactly this. It works in-place by swapping rows or columns, so you can shuffle the same matrix over and over again.
#include <Eigen/Dense>

#include <algorithm>
// using std::min
#include <cassert>
#include <random>
// using std::default_random_engine, std::uniform_int_distribution

void shuffle_apply(Eigen::Ref<Eigen::MatrixXd> mat,
                   Eigen::Ref<Eigen::VectorXd> vec,
                   int generations, int batchsize)
{
    // colwise is faster than rowwise
    const Eigen::Index size = mat.cols();
    assert(vec.size() == size);
    using Transpositions = Eigen::Transpositions<
          Eigen::Dynamic, Eigen::Dynamic, Eigen::Index>;
    Transpositions transp(size);
    Eigen::Index* transp_indices = transp.indices().data();
    std::default_random_engine rng; // seed appropriately!

    for(int gen = 0; gen < generations; ++gen) {
        // Fisher-Yates style: entry i holds the index it is swapped with
        for(Eigen::Index i = 0; i < size; ++i) {
            std::uniform_int_distribution<Eigen::Index> distr(i, size - 1);
            transp_indices[i] = distr(rng);
        }
        mat = mat * transp; // operates in-place
        vec = transp * vec; // transp on left side to shuffle rows, not cols

        for(Eigen::Index start = 0; start < size; start += batchsize) {
            const Eigen::Index curbatch = std::min<Eigen::Index>(
                  batchsize, size - start);
            const auto mat_batch = mat.middleCols(start, curbatch);
            const auto vec_batch = vec.segment(start, curbatch);
            // ... feed mat_batch and vec_batch to the model here ...
        }
    }
}
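For completeness, a minimal sketch of how this might be called, assuming the data is laid out with one sample per column (the sizes here are made up):

// Hypothetical caller; assumes one sample per column, matching shuffle_apply above.
int main()
{
    const int features = 10, samples = 1000;
    Eigen::MatrixXd inputs = Eigen::MatrixXd::Random(features, samples);
    Eigen::VectorXd targets = Eigen::VectorXd::Random(samples);
    shuffle_apply(inputs, targets, /*generations=*/5, /*batchsize=*/100);
}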
See also this and similar questions.
Edit: An older version of this answer initialized the indices via std::shuffle, which I believe was wrong.
Here is a second version that may offer a more palatable interface. In particular, the matrix and vector can be restored to their original order without copying.
#include <Eigen/Dense>

#include <algorithm>
#include <cassert>
#include <random>
#include <utility>
// using std::min, std::move, std::uniform_int_distribution

class BatchShuffle
{
    using Transpositions = Eigen::Transpositions<
          Eigen::Dynamic, Eigen::Dynamic, Eigen::Index>;
    using Permutations = Eigen::PermutationMatrix<
          Eigen::Dynamic, Eigen::Dynamic, Eigen::Index>;

    Eigen::MatrixXd mat_;
    Eigen::VectorXd vec_;
    Transpositions cur_transp;
    Permutations aggregated_permut;
public:
    BatchShuffle(Eigen::MatrixXd mat, Eigen::VectorXd vec)
      : mat_(std::move(mat)),
        vec_(std::move(vec)),
        cur_transp(this->mat_.cols()),
        aggregated_permut(this->mat_.cols())
    {
        assert(this->vec_.size() == this->mat_.cols());
        aggregated_permut.setIdentity();
    }
    Eigen::Index totalsize() const noexcept
    { return mat_.cols(); }

    const Eigen::MatrixXd& mat() const noexcept
    { return mat_; }

    const Eigen::VectorXd& vec() const noexcept
    { return vec_; }

    template<class RandomNumberEngine>
    void shuffle(RandomNumberEngine& rng)
    {
        Eigen::Index* indices = cur_transp.indices().data();
        for(Eigen::Index i = 0, n = totalsize(); i < n; ++i) {
            std::uniform_int_distribution<Eigen::Index> distr(i, n - 1);
            indices[i] = distr(rng);
        }
        Permutations::IndicesType& aggregated = aggregated_permut.indices();
        aggregated = cur_transp * aggregated;
        mat_ = mat_ * cur_transp;
        vec_ = cur_transp * vec_;
    }
    void restore_original()
    {
        const auto& inverse = aggregated_permut.inverse().eval();
        mat_ = mat_ * inverse;
        vec_ = inverse * vec_;
        aggregated_permut.setIdentity();
    }
};
void apply(const Eigen::Ref<const Eigen::MatrixXd>& mat,
           const Eigen::Ref<const Eigen::VectorXd>& vec);

int main()
{
    int rows = 1000, cols = 10000, batchsize = 100;
    BatchShuffle batch(Eigen::MatrixXd::Random(rows, cols),
                       Eigen::VectorXd::Random(cols));
    std::default_random_engine rng;
    for(int i = 0; i < 100; ++i) {
        batch.shuffle(rng);
        for(Eigen::Index j = 0; j < batch.totalsize(); j += batchsize) {
            Eigen::Index cursize =
                  std::min<Eigen::Index>(batchsize, batch.totalsize() - j);
            apply(batch.mat().middleCols(j, cursize),
                  batch.vec().segment(j, cursize));
        }
    }
    batch.restore_original();
}
Again, I chose to work with the matrix column-wise, unlike your code, which tries to take rows. Eigen stores its matrices in column-major order (a.k.a. Fortran order). Slicing rows instead of columns will noticeably slow down pretty much everything you do with the data, so I really, strongly suggest that you transpose your input generation and matrix usage wherever possible.
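If your data is currently produced as N x M with one sample per row (as in your question), a single up-front transpose gives you the column-per-sample layout that the code above expects; you pay for one copy once rather than copying on every batch. A rough sketch, with illustrative names and sizes:

// One-time conversion from row-per-sample (N x M) to column-per-sample (M x N).
Eigen::MatrixXd inputs_by_row = Eigen::MatrixXd::Random(1000, 10); // N x M, as in the question
Eigen::VectorXd targets = Eigen::VectorXd::Random(1000);           // N
Eigen::MatrixXd inputs_by_col = inputs_by_row.transpose();         // M x N, one sample per column
BatchShuffle batch(std::move(inputs_by_col), std::move(targets));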