从 std::thread 换成 std::async 带来了巨大的性能提升。这怎么可能?
Switching from std::thread to std::async gives a HUGE performance gain. How can that be possible?
我写了一段测试代码来比较 std::thread 和 std::async。
#include <iostream>
#include <mutex>
#include <fstream>
#include <string>
#include <memory>
#include <thread>
#include <future>
#include <functional>
#include <boost/noncopyable.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/filesystem.hpp>
#include <boost/date_time/posix_time/posix_time.hpp>
#include <boost/asio.hpp>
namespace fs = boost::filesystem;
namespace pt = boost::posix_time;
namespace as = boost::asio;
/// Mutex-guarded line logger that appends "Hello, World! <i>" records to a
/// single file opened through boost::filesystem::ofstream.
///
/// Copying is disabled by inheriting from boost::noncopyable.
class Log : private boost::noncopyable
{
public:
    /// Selects the file that subsequent WriteLog() calls write to.
    ///
    /// If something already exists at @p filePath it is removed first, then a
    /// new output stream is opened on that path. The existence check uses the
    /// non-throwing error_code overload; fs::remove() uses the throwing
    /// overload, so a failed removal is reported as an exception.
    ///
    /// This function does not take logMutex_; call it before any thread
    /// starts calling WriteLog().
    void LogPath(const fs::path& filePath) {
        boost::system::error_code ec;
        if (fs::exists(filePath, ec)) {
            fs::remove(filePath);
        }
        this->ofStreamPtr_.reset(new fs::ofstream(filePath));
    }

    /// Writes one line, "Hello, World! <i>\n", to the log file.
    ///
    /// @param i value appended to the line.
    ///
    /// The mutex is held for the entire call, so concurrent callers are
    /// fully serialized.
    void WriteLog(std::size_t i) {
        // Take the lock BEFORE inspecting the stream: the state check reads
        // the same stream object that other threads write to, so performing
        // it outside the lock is an unsynchronized access.
        std::lock_guard<std::mutex> lock(this->logMutex_);
        // Verify that LogPath() was called (non-null pointer) before
        // dereferencing, and that the stream is in a good state.
        assert(this->ofStreamPtr_ && *this->ofStreamPtr_);
        *this->ofStreamPtr_ << "Hello, World! " << i << "\n";
    }

private:
    std::mutex logMutex_;                        ///< Serializes WriteLog() calls.
    std::unique_ptr<fs::ofstream> ofStreamPtr_;  ///< Null until LogPath() is called.
};
/// Benchmark driver: times how long it takes to issue `iter_count` calls to
/// Log::WriteLog through different task-launching mechanisms.
///
/// Usage: <program> <iter_count>
///
/// Prints "Wrong argument" and returns 1 unless exactly one argument is
/// given. The argument is parsed with boost::lexical_cast, which throws on
/// non-numeric input. Only Version 2 (std::async) is enabled; Versions 1 and
/// 3 are kept commented out for comparison.
int main(int argc, char *argv[]) {
    if (argc != 2) {
        std::cout << "Wrong argument" << std::endl;
        return 1;
    }
    std::size_t iter_count = boost::lexical_cast<std::size_t>(argv[1]);

    Log log;
    log.LogPath("log.txt");
    std::function<void(std::size_t)> func =
        std::bind(&Log::WriteLog, &log, std::placeholders::_1);

    auto start_time = pt::microsec_clock::local_time();
    // Declared once here so that every version block below can simply assign
    // to it, whichever combination of versions is enabled. (Previously the
    // only declaration lived inside the commented-out Version 1 block, which
    // left `duration` undeclared for Version 2.)
    pt::time_duration duration;

    ////// Version 1: use std::thread //////
    // {
    //     std::vector<std::shared_ptr<std::thread> > threadList;
    //     threadList.reserve(iter_count);
    //     for(std::size_t i = 0; i < iter_count; i++) {
    //         threadList.push_back(
    //             std::make_shared<std::thread>(func, i));
    //     }
    //
    //     for(auto it: threadList) {
    //         it->join();
    //     }
    // }
    // duration = pt::microsec_clock::local_time() - start_time;
    // std::cout << "Version 1: " << duration << std::endl;

    ////// Version 2: use std::async //////
    start_time = pt::microsec_clock::local_time();
    {
        for (std::size_t i = 0; i < iter_count; i++) {
            // No launch policy is passed, so the default
            // (std::launch::async | std::launch::deferred) applies. The
            // returned future is destroyed at the end of each iteration
            // without wait()/get() being called on it.
            auto result = std::async(func, i);
        }
    }
    duration = pt::microsec_clock::local_time() - start_time;
    std::cout << "Version 2: " << duration << std::endl;

    ////// Version 3: use boost::asio::io_service //////
    // start_time = pt::microsec_clock::local_time();
    // {
    //     as::io_service ioService;
    //     as::io_service::strand strand{ioService};
    //     {
    //         for(std::size_t i = 0; i < iter_count; i++) {
    //             strand.post(std::bind(func, i));
    //         }
    //     }
    //     ioService.run();
    // }
    // duration = pt::microsec_clock::local_time() - start_time;
    // std::cout << "Version 3: " << duration << std::endl;
}
在一台 4 核 CentOS 7 机器(gcc 4.8.5)上,版本 1(使用 std::thread)比其他实现慢约 100 倍。
Iteration Version1 Version2 Version3
100 0.0034s 0.000051s 0.000066s
1000 0.038s 0.00029s 0.00058s
10000 0.41s 0.0042s 0.0059s
100000 throw 0.026s 0.061s
为什么线程版本这么慢?我认为每个线程执行 Log::WriteLog 函数都不会花费很长时间。
函数可能根本没有被调用。您在版本 2 中没有传入 std::launch 策略,因此依赖的是 std::async 的默认行为(强调是我加的):
Behaves the same as async(std::launch::async | std::launch::deferred, f, args...). In other words, f may be executed in another thread or it may be run synchronously when the resulting std::future is queried for a value.
通过以下小改动尝试重新运行您的基准:
auto result = std::async(std::launch::async, func, i);
或者,您可以在第二个循环中对每个 std::future 调用 result.wait(),类似于您在版本 1 中对所有线程调用 join() 的方式。这会强制对 std::future 求值。
请注意,此基准测试还存在一个与上述问题无关的重大问题:func 一进入就获取锁,并在整个函数调用期间一直持有,这使得并行执行成为不可能。在这里使用线程没有任何优势——我猜它会比串行实现慢得多(由于线程创建和加锁的开销)。
我写了一段测试代码来比较 std::thread 和 std::async。
#include <iostream>
#include <mutex>
#include <fstream>
#include <string>
#include <memory>
#include <thread>
#include <future>
#include <functional>
#include <boost/noncopyable.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/filesystem.hpp>
#include <boost/date_time/posix_time/posix_time.hpp>
#include <boost/asio.hpp>
namespace fs = boost::filesystem;
namespace pt = boost::posix_time;
namespace as = boost::asio;
/// Mutex-guarded line logger that appends "Hello, World! <i>" records to a
/// single file opened through boost::filesystem::ofstream.
///
/// Copying is disabled by inheriting from boost::noncopyable.
class Log : private boost::noncopyable
{
public:
    /// Selects the file that subsequent WriteLog() calls write to.
    ///
    /// If something already exists at @p filePath it is removed first, then a
    /// new output stream is opened on that path. The existence check uses the
    /// non-throwing error_code overload; fs::remove() uses the throwing
    /// overload, so a failed removal is reported as an exception.
    ///
    /// This function does not take logMutex_; call it before any thread
    /// starts calling WriteLog().
    void LogPath(const fs::path& filePath) {
        boost::system::error_code ec;
        if (fs::exists(filePath, ec)) {
            fs::remove(filePath);
        }
        this->ofStreamPtr_.reset(new fs::ofstream(filePath));
    }

    /// Writes one line, "Hello, World! <i>\n", to the log file.
    ///
    /// @param i value appended to the line.
    ///
    /// The mutex is held for the entire call, so concurrent callers are
    /// fully serialized.
    void WriteLog(std::size_t i) {
        // Take the lock BEFORE inspecting the stream: the state check reads
        // the same stream object that other threads write to, so performing
        // it outside the lock is an unsynchronized access.
        std::lock_guard<std::mutex> lock(this->logMutex_);
        // Verify that LogPath() was called (non-null pointer) before
        // dereferencing, and that the stream is in a good state.
        assert(this->ofStreamPtr_ && *this->ofStreamPtr_);
        *this->ofStreamPtr_ << "Hello, World! " << i << "\n";
    }

private:
    std::mutex logMutex_;                        ///< Serializes WriteLog() calls.
    std::unique_ptr<fs::ofstream> ofStreamPtr_;  ///< Null until LogPath() is called.
};
/// Benchmark driver: times how long it takes to issue `iter_count` calls to
/// Log::WriteLog through different task-launching mechanisms.
///
/// Usage: <program> <iter_count>
///
/// Prints "Wrong argument" and returns 1 unless exactly one argument is
/// given. The argument is parsed with boost::lexical_cast, which throws on
/// non-numeric input. Only Version 2 (std::async) is enabled; Versions 1 and
/// 3 are kept commented out for comparison.
int main(int argc, char *argv[]) {
    if (argc != 2) {
        std::cout << "Wrong argument" << std::endl;
        return 1;
    }
    std::size_t iter_count = boost::lexical_cast<std::size_t>(argv[1]);

    Log log;
    log.LogPath("log.txt");
    std::function<void(std::size_t)> func =
        std::bind(&Log::WriteLog, &log, std::placeholders::_1);

    auto start_time = pt::microsec_clock::local_time();
    // Declared once here so that every version block below can simply assign
    // to it, whichever combination of versions is enabled. (Previously the
    // only declaration lived inside the commented-out Version 1 block, which
    // left `duration` undeclared for Version 2.)
    pt::time_duration duration;

    ////// Version 1: use std::thread //////
    // {
    //     std::vector<std::shared_ptr<std::thread> > threadList;
    //     threadList.reserve(iter_count);
    //     for(std::size_t i = 0; i < iter_count; i++) {
    //         threadList.push_back(
    //             std::make_shared<std::thread>(func, i));
    //     }
    //
    //     for(auto it: threadList) {
    //         it->join();
    //     }
    // }
    // duration = pt::microsec_clock::local_time() - start_time;
    // std::cout << "Version 1: " << duration << std::endl;

    ////// Version 2: use std::async //////
    start_time = pt::microsec_clock::local_time();
    {
        for (std::size_t i = 0; i < iter_count; i++) {
            // No launch policy is passed, so the default
            // (std::launch::async | std::launch::deferred) applies. The
            // returned future is destroyed at the end of each iteration
            // without wait()/get() being called on it.
            auto result = std::async(func, i);
        }
    }
    duration = pt::microsec_clock::local_time() - start_time;
    std::cout << "Version 2: " << duration << std::endl;

    ////// Version 3: use boost::asio::io_service //////
    // start_time = pt::microsec_clock::local_time();
    // {
    //     as::io_service ioService;
    //     as::io_service::strand strand{ioService};
    //     {
    //         for(std::size_t i = 0; i < iter_count; i++) {
    //             strand.post(std::bind(func, i));
    //         }
    //     }
    //     ioService.run();
    // }
    // duration = pt::microsec_clock::local_time() - start_time;
    // std::cout << "Version 3: " << duration << std::endl;
}
在一台 4 核 CentOS 7 机器(gcc 4.8.5)上,版本 1(使用 std::thread)比其他实现慢约 100 倍。
Iteration Version1 Version2 Version3
100 0.0034s 0.000051s 0.000066s
1000 0.038s 0.00029s 0.00058s
10000 0.41s 0.0042s 0.0059s
100000 throw 0.026s 0.061s
为什么线程版本这么慢?我认为每个线程执行 Log::WriteLog 函数都不会花费很长时间。
函数可能根本没有被调用。您在版本 2 中没有传入 std::launch 策略,因此依赖的是 std::async 的默认行为(强调是我加的):
Behaves the same as async(std::launch::async | std::launch::deferred, f, args...). In other words, f may be executed in another thread or it may be run synchronously when the resulting std::future is queried for a value.
通过以下小改动尝试重新运行您的基准:
auto result = std::async(std::launch::async, func, i);
或者,您可以在第二个循环中对每个 std::future 调用 result.wait(),类似于您在版本 1 中对所有线程调用 join() 的方式。这会强制对 std::future 求值。
请注意,此基准测试还存在一个与上述问题无关的重大问题:func 一进入就获取锁,并在整个函数调用期间一直持有,这使得并行执行成为不可能。在这里使用线程没有任何优势——我猜它会比串行实现慢得多(由于线程创建和加锁的开销)。