当向量中唯一元素的数量远小于向量大小时,有效地处理向量的每个唯一排列
Efficiently process each unique permutation of a vector when number of unique elements in vector is much smaller than vector size
在一个程序中,我需要对向量的每个唯一排列并行应用一个函数。向量的大小大约是 N=15
我已经有一个函数 void parallel_for_each_permutation
,我可以将它与 std::set 结合使用,以便只处理每个唯一排列一次。
这一切都适用于一般情况。但是,在我的用例中,每个向量的唯一元素 k 的数量非常有限,通常在 k=4 左右。这意味着我目前正在浪费时间一遍又一遍地构建相同的唯一排列,只是为了将其丢弃,因为它已经被处理过。
是否可以在这种特殊情况下处理所有唯一排列,而无需构造所有 N!排列?
示例用例:
#include <algorithm>
#include <thread>
#include <vector>
#include <mutex>
#include <numeric>
#include <set>
#include <iostream>
template<class Container1, class Container2>
struct Comp{
//compare element-wise less than
bool operator()(const Container1& l, const Container2& r) const{
auto pair = std::mismatch(l.begin(), l.end(), r.begin());
if(pair.first == l.end() && pair.second == r.end())
return false;
return *(pair.first) < *(pair.second);
}
};
template<class Container, class Func>
void parallel_for_each_permutation(const Container& container, int num_threads, Func func){
auto ithPermutation = [](int n, size_t i) -> std::vector<size_t>{
//
std::vector<size_t> fact(n);
std::vector<size_t> perm(n);
fact[0] = 1;
for(int k = 1; k < n; k++)
fact[k] = fact[k-1] * k;
for(int k = 0; k < n; k++){
perm[k] = i / fact[n-1-k];
i = i % fact[n-1-k];
}
for(int k = n-1; k > 0; k--){
for(int j = k-1; j >= 0; j--){
if(perm[j] <= perm[k])
perm[k]++;
}
}
return perm;
};
size_t totalNumPermutations = 1;
for(size_t i = 1; i <= container.size(); i++)
totalNumPermutations *= i;
std::vector<std::thread> threads;
for(int threadId = 0; threadId < num_threads; threadId++){
threads.emplace_back([&, threadId](){
const size_t firstPerm = size_t(float(threadId) * totalNumPermutations / num_threads);
const size_t last_excl = std::min(totalNumPermutations, size_t(float(threadId+1) * totalNumPermutations / num_threads));
Container permutation(container);
auto permIndices = ithPermutation(container.size(), firstPerm);
size_t count = firstPerm;
do{
for(int i = 0; i < int(permIndices.size()); i++){
permutation[i] = container[permIndices[i]];
}
func(threadId, permutation);
std::next_permutation(permIndices.begin(), permIndices.end());
++count;
}while(count < last_excl);
});
}
for(auto& thread : threads)
thread.join();
}
template<class Container, class Func>
void parallel_for_each_unique_permutation(const Container& container, Func func){
using Comparator = Comp<Container, Container>;
constexpr int numThreads = 4;
std::set<Container, Comparator> uniqueProcessedPermutations(Comparator{});
std::mutex m;
parallel_for_each_permutation(
container,
numThreads,
[&](int threadId, const auto& permutation){
{
std::lock_guard<std::mutex> lg(m);
if(uniqueProcessedPermutations.count(permutation) > 0){
return;
}else{
uniqueProcessedPermutations.insert(permutation);
}
}
func(permutation);
}
);
}
int main(){
std::vector<int> vector1{1,1,1,1,2,3,2,2,3,3,1};
auto func = [](const auto& vec){return;};
parallel_for_each_unique_permutation(vector1, func);
}
您必须使用的排列在组合学领域被称为多重排列.
它们被描述为例如on The Combinatorial Object Server
在 this paper by professor Tadao Takaoka.
中有更详细的解释
您有 some related Python code and some C++ code in the FXT open source library.
您可以考虑在您的问题中添加 "multiset" 和 "combinatorics" 标签。
一种可能性是从 FXT 库中借用 (header-only) algorithmic code,它为那些 多集排列 [=81] 提供了一个简单的生成器 class =].
性能等级:
在 15 objects,{1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}
的测试向量上使用 FXT 算法,可以在普通 vanilla Intel x86 上在不到 2 秒的时间内生成所有关联的 12,612,600 "permutations" -64机;这没有诊断文本 I/O 并且没有任何优化尝试。
该算法只生成所需的那些 "permutations",仅此而已。因此不再需要生成全部 15 个! "raw" 排列,也不使用互斥来更新共享数据结构以进行过滤。
用于生成排列的适配器 class:
我将在下面尝试为适配器 class 提供代码,它允许您的应用程序使用 FXT 算法,同时将依赖项包含在单个实现文件中。这样,代码将有望更好地适合您的应用程序。想想 FXT 的 ulong 类型和原始指针的使用,而不是代码中的 std::vector<std::size_t>
。此外,FXT 是一个非常广泛的库。
Header 文件 "adaptor" class:
// File: MSetPermGen.h
#ifndef MSET_PERM_GEN_H
#define MSET_PERM_GEN_H
#include <iostream>
#include <vector>
class MSetPermGenImpl; // from algorithmic backend
using IntVec = std::vector<int>;
using SizeVec = std::vector<std::size_t>;
// Generator class for multiset permutations:
class MSetPermGen {
public:
MSetPermGen(const IntVec& vec);
std::size_t getCycleLength() const;
bool forward(size_t incr);
bool next();
const SizeVec& getPermIndices() const;
const IntVec& getItems() const;
const IntVec& getItemValues() const;
private:
std::size_t cycleLength_;
MSetPermGenImpl* genImpl_; // implementation generator
IntVec itemValues_; // only once each
IntVec items_; // copy of ctor argument
SizeVec freqs_; // repetition counts
SizeVec state_; // array of indices in 0..n-1
};
#endif
class 构造函数完全采用主程序中提供的参数类型。当然,关键方法是next()
。您还可以使用 forward(incr)
方法一次将自动机移动几步。
示例客户端程序:
// File: test_main.cpp
#include <cassert>
#include "MSetPermGen.h"
using std::cout;
using std::cerr;
using std::endl;
// utility functions:
std::vector<int> getMSPermutation(const MSetPermGen& mspg)
{
std::vector<int> res;
auto indices = mspg.getPermIndices(); // always between 0 and n-1
auto values = mspg.getItemValues(); // whatever the user put in
std::size_t n = indices.size();
assert( n == items.size() );
res.reserve(n);
for (std::size_t i=0; i < n; i++) {
auto xi = indices[i];
res.push_back(values[xi]);
}
return res;
}
void printPermutation(const std::vector<int>& p, std::ostream& fh)
{
std::size_t n = p.size();
for (size_t i=0; i < n; i++)
fh << p[i] << " ";
fh << '\n';
}
int main(int argc, const char* argv[])
{
std::vector<int> vec0{1,1, 2,2,2}; // N=5
std::vector<int> vec1{1,1, 1,1, 2, 3, 2,2, 3,3, 1}; // N=11
std::vector<int> vec2{1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}; // N=15
MSetPermGen pg0{vec0};
MSetPermGen pg1{vec1};
MSetPermGen pg2{vec2};
auto pg = &pg0; // choice of 0, 1, 2 for sizing
auto cl = pg->getCycleLength();
auto permA = getMSPermutation(*pg);
printPermutation(permA, cout);
for (std::size_t pi=0; pi < (cl-1); pi++) {
pg->next();
auto permB = getMSPermutation(*pg);
printPermutation(permB, cout);
}
return EXIT_SUCCESS;
}
上述小程序的文本输出:
1 1 2 2 2
1 2 1 2 2
1 2 2 1 2
1 2 2 2 1
2 1 1 2 2
2 1 2 1 2
2 1 2 2 1
2 2 1 1 2
2 2 1 2 1
2 2 2 1 1
您只能从矢量 {1,1, 2,2,2} 中获得 10 个项目,因为 5! / (2! * 3!) = 120/(2*6) = 10.
适配器 class、MSetPermGen.cpp
的实现文件由两部分组成。第一部分是经过最少改编的 FXT 代码。第二部分是 MSetPermGen
class 正确的。
实现文件的第一部分:
// File: MSetPermGen.cpp - part 1 of 2 - FXT code
// -------------- Beginning of header-only FXT combinatorics code -----------
// This file is part of the FXT library.
// Copyright (C) 2010, 2012, 2014 Joerg Arndt
// License: GNU General Public License version 3 or later,
// see the file COPYING.txt in the main directory.
//-- https://www.jjj.de/fxt/
//-- https://fossies.org/dox/fxt-2018.07.03/mset-perm-lex_8h_source.html
#include <cstddef>
using ulong = std::size_t;
inline void swap2(ulong& xa, ulong& xb)
{
ulong save_xb = xb;
xb = xa;
xa = save_xb;
}
class mset_perm_lex
// Multiset permutations in lexicographic order, iterative algorithm.
{
public:
ulong k_; // number of different sorts of objects
ulong *r_; // number of elements '0' in r[0], '1' in r[1], ..., 'k-1' in r[k-1]
ulong n_; // number of objects
ulong *ms_; // multiset data in ms[0], ..., ms[n-1], sentinels at [-1] and [-2]
private: // have pointer data
mset_perm_lex(const mset_perm_lex&); // forbidden
mset_perm_lex & operator = (const mset_perm_lex&); // forbidden
public:
explicit mset_perm_lex(const ulong *r, ulong k)
{
k_ = k;
r_ = new ulong[k];
for (ulong j=0; j<k_; ++j) r_[j] = r[j]; // get buckets
n_ = 0;
for (ulong j=0; j<k_; ++j) n_ += r_[j];
ms_ = new ulong[n_+2];
ms_[0] = 0; ms_[1] = 1; // sentinels: ms[0] < ms[1]
ms_ += 2; // nota bene
first();
}
void first()
{
for (ulong j=0, i=0; j<k_; ++j)
for (ulong h=r_[j]; h!=0; --h, ++i)
ms_[i] = j;
}
~mset_perm_lex()
{
ms_ -= 2;
delete [] ms_;
delete [] r_;
}
const ulong * data() const { return ms_; }
ulong next()
// Return position of leftmost change,
// return n with last permutation.
{
// find rightmost pair with ms[i] < ms[i+1]:
const ulong n1 = n_ - 1;
ulong i = n1;
do { --i; } while ( ms_[i] >= ms_[i+1] ); // can read sentinel
if ( (long)i < 0 ) return n_; // last sequence is falling seq.
// find rightmost element ms[j] less than ms[i]:
ulong j = n1;
while ( ms_[i] >= ms_[j] ) { --j; }
swap2(ms_[i], ms_[j]);
// Here the elements ms[i+1], ..., ms[n-1] are a falling sequence.
// Reverse order to the right:
ulong r = n1;
ulong s = i + 1;
while ( r > s ) { swap2(ms_[r], ms_[s]); --r; ++s; }
return i;
}
};
// -------------- End of header-only FXT combinatorics code -----------
class 实现文件的第二部分:
// Second part of file MSetPermGen.cpp: non-FXT code
#include <cassert>
#include <tuple>
#include <map>
#include <iostream>
#include <cstdio>
#include "MSetPermGen.h"
using std::cout;
using std::cerr;
using std::endl;
class MSetPermGenImpl { // wrapper class
public:
MSetPermGenImpl(const SizeVec& freqs) : fg(freqs.data(), freqs.size())
{}
private:
mset_perm_lex fg;
friend class MSetPermGen;
};
static std::size_t fact(size_t n)
{
std::size_t f = 1;
for (std::size_t i = 1; i <= n; i++)
f = f*i;
return f;
}
MSetPermGen::MSetPermGen(const IntVec& vec) : items_(vec)
{
std::map<int,int> ma;
for (int i: vec) {
ma[i]++;
}
int item, freq;
for (const auto& p : ma) {
std::tie(item, freq) = p;
itemValues_.push_back(item);
freqs_.push_back(freq);
}
cycleLength_ = fact(items_.size());
for (auto i: freqs_)
cycleLength_ /= fact(i);
// create FXT-level generator:
genImpl_ = new MSetPermGenImpl(freqs_);
for (std::size_t i=0; i < items_.size(); i++)
state_.push_back(genImpl_->fg.ms_[i]);
}
std::size_t MSetPermGen::getCycleLength() const
{
return cycleLength_;
}
bool MSetPermGen::forward(size_t incr)
{
std::size_t n = items_.size();
std::size_t rc = 0;
// move forward state by brute force, could be improved:
for (std::size_t i=0; i < incr; i++)
rc = genImpl_->fg.next();
for (std::size_t j=0; j < n; j++)
state_[j] = genImpl_->fg.ms_[j];
return (rc != n);
}
bool MSetPermGen::next()
{
return forward(1);
}
const SizeVec& MSetPermGen::getPermIndices() const
{
return (this->state_);
}
const IntVec& MSetPermGen::getItems() const
{
return (this->items_);
}
const IntVec& MSetPermGen::getItemValues() const
{
return (this->itemValues_);
}
调整并行应用程序:
关于您的多线程应用程序,鉴于生成 "permutations" 的成本很低,您可以负担得起为每个线程创建一个生成器 object。
在启动实际计算之前,将每个生成器转发到其适当的初始位置,即步骤 thread_id * (cycleLength / num_threads)
。
我已尝试按照这些思路使您的代码适应此 MSetPermGen
class。请参阅下面的代码。
使用 3 个线程、大小为 15 的输入向量 {1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}
(提供 12,612,600 个排列)并启用所有诊断,修改后的并行程序运行时间不到 10 秒;关闭所有诊断后不到 2 秒。
修改后的并行程序:
#include <algorithm>
#include <thread>
#include <vector>
#include <atomic>
#include <mutex>
#include <numeric>
#include <set>
#include <iostream>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include "MSetPermGen.h"
using std::cout;
using std::endl;
// debug and instrumentation:
static std::atomic<size_t> permCounter;
static bool doManagePermCounter = true;
static bool doThreadLogfiles = true;
static bool doLogfileHeaders = true;
template<class Container, class Func>
void parallel_for_each_permutation(const Container& container, int numThreads, Func mfunc) {
MSetPermGen gen0(container);
std::size_t totalNumPermutations = gen0.getCycleLength();
std::size_t permShare = totalNumPermutations / numThreads;
if ((totalNumPermutations % numThreads) != 0)
permShare++;
std::cout << "totalNumPermutations: " << totalNumPermutations << std::endl;
std::vector<std::thread> threads;
for (int threadId = 0; threadId < numThreads; threadId++) {
threads.emplace_back([&, threadId]() {
// generate some per-thread logfile name
std::ostringstream fnss;
fnss << "thrlog_" << threadId << ".txt";
std::string fileName = fnss.str();
std::ofstream fh(fileName);
MSetPermGen thrGen(container);
const std::size_t firstPerm = permShare * threadId;
thrGen.forward(firstPerm);
const std::size_t last_excl = std::min(totalNumPermutations,
(threadId+1) * permShare);
if (doLogfileHeaders) {
fh << "MSG threadId: " << threadId << '\n';
fh << "MSG firstPerm: " << firstPerm << '\n';
fh << "MSG lastExcl : " << last_excl << '\n';
}
Container permutation(container);
auto values = thrGen.getItemValues();
auto permIndices = thrGen.getPermIndices();
auto nsz = permIndices.size();
std::size_t count = firstPerm;
do {
for (std::size_t i = 0; i < nsz; i++) {
permutation[i] = values[permIndices[i]];
}
mfunc(threadId, permutation);
if (doThreadLogfiles) {
for (std::size_t i = 0; i < nsz; i++)
fh << permutation[i] << ' ';
fh << '\n';
}
thrGen.next();
permIndices = thrGen.getPermIndices();
++count;
if (doManagePermCounter) {
permCounter++;
}
} while (count < last_excl);
fh.close();
});
}
for(auto& thread : threads)
thread.join();
}
template<class Container, class Func>
void parallel_for_each_unique_permutation(const Container& container, Func func) {
constexpr int numThreads = 3;
parallel_for_each_permutation(
container,
numThreads,
[&](int threadId, const auto& permutation){
// no longer need any mutual exclusion
func(permutation);
}
);
}
int main()
{
std::vector<int> vector1{1,1,1,1,2,3,2,2,3,3,1}; // N=11
std::vector<int> vector0{1,1, 2,2,2}; // N=5
std::vector<int> vector2{1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}; // N=15
auto func = [](const auto& vec) { return; };
permCounter.store(0);
parallel_for_each_unique_permutation(vector2, func);
auto finalPermCounter = permCounter.load();
cout << "FinalPermCounter = " << finalPermCounter << endl;
}
在一个程序中,我需要对向量的每个唯一排列并行应用一个函数。向量的大小大约是 N=15
我已经有一个函数 void parallel_for_each_permutation
,我可以将它与 std::set 结合使用,以便只处理每个唯一排列一次。
这一切都适用于一般情况。但是,在我的用例中,每个向量的唯一元素 k 的数量非常有限,通常在 k=4 左右。这意味着我目前正在浪费时间一遍又一遍地构建相同的唯一排列,只是为了将其丢弃,因为它已经被处理过。
是否可以在这种特殊情况下处理所有唯一排列,而无需构造所有 N!排列?
示例用例:
#include <algorithm>
#include <thread>
#include <vector>
#include <mutex>
#include <numeric>
#include <set>
#include <iostream>
template<class Container1, class Container2>
struct Comp{
//compare element-wise less than
bool operator()(const Container1& l, const Container2& r) const{
auto pair = std::mismatch(l.begin(), l.end(), r.begin());
if(pair.first == l.end() && pair.second == r.end())
return false;
return *(pair.first) < *(pair.second);
}
};
template<class Container, class Func>
void parallel_for_each_permutation(const Container& container, int num_threads, Func func){
auto ithPermutation = [](int n, size_t i) -> std::vector<size_t>{
//
std::vector<size_t> fact(n);
std::vector<size_t> perm(n);
fact[0] = 1;
for(int k = 1; k < n; k++)
fact[k] = fact[k-1] * k;
for(int k = 0; k < n; k++){
perm[k] = i / fact[n-1-k];
i = i % fact[n-1-k];
}
for(int k = n-1; k > 0; k--){
for(int j = k-1; j >= 0; j--){
if(perm[j] <= perm[k])
perm[k]++;
}
}
return perm;
};
size_t totalNumPermutations = 1;
for(size_t i = 1; i <= container.size(); i++)
totalNumPermutations *= i;
std::vector<std::thread> threads;
for(int threadId = 0; threadId < num_threads; threadId++){
threads.emplace_back([&, threadId](){
const size_t firstPerm = size_t(float(threadId) * totalNumPermutations / num_threads);
const size_t last_excl = std::min(totalNumPermutations, size_t(float(threadId+1) * totalNumPermutations / num_threads));
Container permutation(container);
auto permIndices = ithPermutation(container.size(), firstPerm);
size_t count = firstPerm;
do{
for(int i = 0; i < int(permIndices.size()); i++){
permutation[i] = container[permIndices[i]];
}
func(threadId, permutation);
std::next_permutation(permIndices.begin(), permIndices.end());
++count;
}while(count < last_excl);
});
}
for(auto& thread : threads)
thread.join();
}
template<class Container, class Func>
void parallel_for_each_unique_permutation(const Container& container, Func func){
using Comparator = Comp<Container, Container>;
constexpr int numThreads = 4;
std::set<Container, Comparator> uniqueProcessedPermutations(Comparator{});
std::mutex m;
parallel_for_each_permutation(
container,
numThreads,
[&](int threadId, const auto& permutation){
{
std::lock_guard<std::mutex> lg(m);
if(uniqueProcessedPermutations.count(permutation) > 0){
return;
}else{
uniqueProcessedPermutations.insert(permutation);
}
}
func(permutation);
}
);
}
int main(){
std::vector<int> vector1{1,1,1,1,2,3,2,2,3,3,1};
auto func = [](const auto& vec){return;};
parallel_for_each_unique_permutation(vector1, func);
}
您必须使用的排列在组合学领域被称为多重排列.
它们被描述为例如on The Combinatorial Object Server 在 this paper by professor Tadao Takaoka.
中有更详细的解释您有 some related Python code and some C++ code in the FXT open source library.
您可以考虑在您的问题中添加 "multiset" 和 "combinatorics" 标签。
一种可能性是从 FXT 库中借用 (header-only) algorithmic code,它为那些 多集排列 [=81] 提供了一个简单的生成器 class =].
性能等级:
在 15 objects,{1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}
的测试向量上使用 FXT 算法,可以在普通 vanilla Intel x86 上在不到 2 秒的时间内生成所有关联的 12,612,600 "permutations" -64机;这没有诊断文本 I/O 并且没有任何优化尝试。
该算法只生成所需的那些 "permutations",仅此而已。因此不再需要生成全部 15 个! "raw" 排列,也不使用互斥来更新共享数据结构以进行过滤。
用于生成排列的适配器 class:
我将在下面尝试为适配器 class 提供代码,它允许您的应用程序使用 FXT 算法,同时将依赖项包含在单个实现文件中。这样,代码将有望更好地适合您的应用程序。想想 FXT 的 ulong 类型和原始指针的使用,而不是代码中的 std::vector<std::size_t>
。此外,FXT 是一个非常广泛的库。
Header 文件 "adaptor" class:
// File: MSetPermGen.h
#ifndef MSET_PERM_GEN_H
#define MSET_PERM_GEN_H
#include <iostream>
#include <vector>
class MSetPermGenImpl; // from algorithmic backend
using IntVec = std::vector<int>;
using SizeVec = std::vector<std::size_t>;
// Generator class for multiset permutations:
class MSetPermGen {
public:
MSetPermGen(const IntVec& vec);
std::size_t getCycleLength() const;
bool forward(size_t incr);
bool next();
const SizeVec& getPermIndices() const;
const IntVec& getItems() const;
const IntVec& getItemValues() const;
private:
std::size_t cycleLength_;
MSetPermGenImpl* genImpl_; // implementation generator
IntVec itemValues_; // only once each
IntVec items_; // copy of ctor argument
SizeVec freqs_; // repetition counts
SizeVec state_; // array of indices in 0..n-1
};
#endif
class 构造函数完全采用主程序中提供的参数类型。当然,关键方法是next()
。您还可以使用 forward(incr)
方法一次将自动机移动几步。
示例客户端程序:
// File: test_main.cpp
#include <cassert>
#include "MSetPermGen.h"
using std::cout;
using std::cerr;
using std::endl;
// utility functions:
std::vector<int> getMSPermutation(const MSetPermGen& mspg)
{
std::vector<int> res;
auto indices = mspg.getPermIndices(); // always between 0 and n-1
auto values = mspg.getItemValues(); // whatever the user put in
std::size_t n = indices.size();
assert( n == items.size() );
res.reserve(n);
for (std::size_t i=0; i < n; i++) {
auto xi = indices[i];
res.push_back(values[xi]);
}
return res;
}
void printPermutation(const std::vector<int>& p, std::ostream& fh)
{
std::size_t n = p.size();
for (size_t i=0; i < n; i++)
fh << p[i] << " ";
fh << '\n';
}
int main(int argc, const char* argv[])
{
std::vector<int> vec0{1,1, 2,2,2}; // N=5
std::vector<int> vec1{1,1, 1,1, 2, 3, 2,2, 3,3, 1}; // N=11
std::vector<int> vec2{1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}; // N=15
MSetPermGen pg0{vec0};
MSetPermGen pg1{vec1};
MSetPermGen pg2{vec2};
auto pg = &pg0; // choice of 0, 1, 2 for sizing
auto cl = pg->getCycleLength();
auto permA = getMSPermutation(*pg);
printPermutation(permA, cout);
for (std::size_t pi=0; pi < (cl-1); pi++) {
pg->next();
auto permB = getMSPermutation(*pg);
printPermutation(permB, cout);
}
return EXIT_SUCCESS;
}
上述小程序的文本输出:
1 1 2 2 2
1 2 1 2 2
1 2 2 1 2
1 2 2 2 1
2 1 1 2 2
2 1 2 1 2
2 1 2 2 1
2 2 1 1 2
2 2 1 2 1
2 2 2 1 1
您只能从矢量 {1,1, 2,2,2} 中获得 10 个项目,因为 5! / (2! * 3!) = 120/(2*6) = 10.
适配器 class、MSetPermGen.cpp
的实现文件由两部分组成。第一部分是经过最少改编的 FXT 代码。第二部分是 MSetPermGen
class 正确的。
实现文件的第一部分:
// File: MSetPermGen.cpp - part 1 of 2 - FXT code
// -------------- Beginning of header-only FXT combinatorics code -----------
// This file is part of the FXT library.
// Copyright (C) 2010, 2012, 2014 Joerg Arndt
// License: GNU General Public License version 3 or later,
// see the file COPYING.txt in the main directory.
//-- https://www.jjj.de/fxt/
//-- https://fossies.org/dox/fxt-2018.07.03/mset-perm-lex_8h_source.html
#include <cstddef>
using ulong = std::size_t;
inline void swap2(ulong& xa, ulong& xb)
{
ulong save_xb = xb;
xb = xa;
xa = save_xb;
}
class mset_perm_lex
// Multiset permutations in lexicographic order, iterative algorithm.
{
public:
ulong k_; // number of different sorts of objects
ulong *r_; // number of elements '0' in r[0], '1' in r[1], ..., 'k-1' in r[k-1]
ulong n_; // number of objects
ulong *ms_; // multiset data in ms[0], ..., ms[n-1], sentinels at [-1] and [-2]
private: // have pointer data
mset_perm_lex(const mset_perm_lex&); // forbidden
mset_perm_lex & operator = (const mset_perm_lex&); // forbidden
public:
explicit mset_perm_lex(const ulong *r, ulong k)
{
k_ = k;
r_ = new ulong[k];
for (ulong j=0; j<k_; ++j) r_[j] = r[j]; // get buckets
n_ = 0;
for (ulong j=0; j<k_; ++j) n_ += r_[j];
ms_ = new ulong[n_+2];
ms_[0] = 0; ms_[1] = 1; // sentinels: ms[0] < ms[1]
ms_ += 2; // nota bene
first();
}
void first()
{
for (ulong j=0, i=0; j<k_; ++j)
for (ulong h=r_[j]; h!=0; --h, ++i)
ms_[i] = j;
}
~mset_perm_lex()
{
ms_ -= 2;
delete [] ms_;
delete [] r_;
}
const ulong * data() const { return ms_; }
ulong next()
// Return position of leftmost change,
// return n with last permutation.
{
// find rightmost pair with ms[i] < ms[i+1]:
const ulong n1 = n_ - 1;
ulong i = n1;
do { --i; } while ( ms_[i] >= ms_[i+1] ); // can read sentinel
if ( (long)i < 0 ) return n_; // last sequence is falling seq.
// find rightmost element ms[j] less than ms[i]:
ulong j = n1;
while ( ms_[i] >= ms_[j] ) { --j; }
swap2(ms_[i], ms_[j]);
// Here the elements ms[i+1], ..., ms[n-1] are a falling sequence.
// Reverse order to the right:
ulong r = n1;
ulong s = i + 1;
while ( r > s ) { swap2(ms_[r], ms_[s]); --r; ++s; }
return i;
}
};
// -------------- End of header-only FXT combinatorics code -----------
class 实现文件的第二部分:
// Second part of file MSetPermGen.cpp: non-FXT code
#include <cassert>
#include <tuple>
#include <map>
#include <iostream>
#include <cstdio>
#include "MSetPermGen.h"
using std::cout;
using std::cerr;
using std::endl;
class MSetPermGenImpl { // wrapper class
public:
MSetPermGenImpl(const SizeVec& freqs) : fg(freqs.data(), freqs.size())
{}
private:
mset_perm_lex fg;
friend class MSetPermGen;
};
static std::size_t fact(size_t n)
{
std::size_t f = 1;
for (std::size_t i = 1; i <= n; i++)
f = f*i;
return f;
}
MSetPermGen::MSetPermGen(const IntVec& vec) : items_(vec)
{
std::map<int,int> ma;
for (int i: vec) {
ma[i]++;
}
int item, freq;
for (const auto& p : ma) {
std::tie(item, freq) = p;
itemValues_.push_back(item);
freqs_.push_back(freq);
}
cycleLength_ = fact(items_.size());
for (auto i: freqs_)
cycleLength_ /= fact(i);
// create FXT-level generator:
genImpl_ = new MSetPermGenImpl(freqs_);
for (std::size_t i=0; i < items_.size(); i++)
state_.push_back(genImpl_->fg.ms_[i]);
}
std::size_t MSetPermGen::getCycleLength() const
{
return cycleLength_;
}
bool MSetPermGen::forward(size_t incr)
{
std::size_t n = items_.size();
std::size_t rc = 0;
// move forward state by brute force, could be improved:
for (std::size_t i=0; i < incr; i++)
rc = genImpl_->fg.next();
for (std::size_t j=0; j < n; j++)
state_[j] = genImpl_->fg.ms_[j];
return (rc != n);
}
bool MSetPermGen::next()
{
return forward(1);
}
const SizeVec& MSetPermGen::getPermIndices() const
{
return (this->state_);
}
const IntVec& MSetPermGen::getItems() const
{
return (this->items_);
}
const IntVec& MSetPermGen::getItemValues() const
{
return (this->itemValues_);
}
调整并行应用程序:
关于您的多线程应用程序,鉴于生成 "permutations" 的成本很低,您可以负担得起为每个线程创建一个生成器 object。
在启动实际计算之前,将每个生成器转发到其适当的初始位置,即步骤 thread_id * (cycleLength / num_threads)
。
我已尝试按照这些思路使您的代码适应此 MSetPermGen
class。请参阅下面的代码。
使用 3 个线程、大小为 15 的输入向量 {1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}
(提供 12,612,600 个排列)并启用所有诊断,修改后的并行程序运行时间不到 10 秒;关闭所有诊断后不到 2 秒。
修改后的并行程序:
#include <algorithm>
#include <thread>
#include <vector>
#include <atomic>
#include <mutex>
#include <numeric>
#include <set>
#include <iostream>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include "MSetPermGen.h"
using std::cout;
using std::endl;
// debug and instrumentation:
static std::atomic<size_t> permCounter;
static bool doManagePermCounter = true;
static bool doThreadLogfiles = true;
static bool doLogfileHeaders = true;
template<class Container, class Func>
void parallel_for_each_permutation(const Container& container, int numThreads, Func mfunc) {
MSetPermGen gen0(container);
std::size_t totalNumPermutations = gen0.getCycleLength();
std::size_t permShare = totalNumPermutations / numThreads;
if ((totalNumPermutations % numThreads) != 0)
permShare++;
std::cout << "totalNumPermutations: " << totalNumPermutations << std::endl;
std::vector<std::thread> threads;
for (int threadId = 0; threadId < numThreads; threadId++) {
threads.emplace_back([&, threadId]() {
// generate some per-thread logfile name
std::ostringstream fnss;
fnss << "thrlog_" << threadId << ".txt";
std::string fileName = fnss.str();
std::ofstream fh(fileName);
MSetPermGen thrGen(container);
const std::size_t firstPerm = permShare * threadId;
thrGen.forward(firstPerm);
const std::size_t last_excl = std::min(totalNumPermutations,
(threadId+1) * permShare);
if (doLogfileHeaders) {
fh << "MSG threadId: " << threadId << '\n';
fh << "MSG firstPerm: " << firstPerm << '\n';
fh << "MSG lastExcl : " << last_excl << '\n';
}
Container permutation(container);
auto values = thrGen.getItemValues();
auto permIndices = thrGen.getPermIndices();
auto nsz = permIndices.size();
std::size_t count = firstPerm;
do {
for (std::size_t i = 0; i < nsz; i++) {
permutation[i] = values[permIndices[i]];
}
mfunc(threadId, permutation);
if (doThreadLogfiles) {
for (std::size_t i = 0; i < nsz; i++)
fh << permutation[i] << ' ';
fh << '\n';
}
thrGen.next();
permIndices = thrGen.getPermIndices();
++count;
if (doManagePermCounter) {
permCounter++;
}
} while (count < last_excl);
fh.close();
});
}
for(auto& thread : threads)
thread.join();
}
template<class Container, class Func>
void parallel_for_each_unique_permutation(const Container& container, Func func) {
constexpr int numThreads = 3;
parallel_for_each_permutation(
container,
numThreads,
[&](int threadId, const auto& permutation){
// no longer need any mutual exclusion
func(permutation);
}
);
}
int main()
{
std::vector<int> vector1{1,1,1,1,2,3,2,2,3,3,1}; // N=11
std::vector<int> vector0{1,1, 2,2,2}; // N=5
std::vector<int> vector2{1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}; // N=15
auto func = [](const auto& vec) { return; };
permCounter.store(0);
parallel_for_each_unique_permutation(vector2, func);
auto finalPermCounter = permCounter.load();
cout << "FinalPermCounter = " << finalPermCounter << endl;
}