无符号数据类型的 MSVC ifstream 性能问题
MSVC ifstream performance issue with unsigned datatype
我在 MSVC 上用 std::ifstream
读取二进制文件时做了一些测试,发现 char
和 unsigned char
这两种数据类型之间存在很大的性能差异。
读取 512 MB 二进制文件时的结果:
Duration read as signed: 322 ms
Duration read as unsigned: 10552 ms
下面是我用来测试的代码:
#include <vector>
#include <iostream>
#include <fstream>
#include <chrono>
#include <limits>
#include <filesystem>
/// Benchmark entry point.
///
/// Reads the whole of "test.data" twice -- once through
/// std::basic_ifstream<char> and once through
/// std::basic_ifstream<unsigned char> -- and prints how long each read()
/// call took, in milliseconds.
///
/// @return 0 when both reads complete, 1 when a read fails or comes up short.
int main()
{
    const std::filesystem::path filePath{ "test.data" }; // 512 MB binary file
    // file_size() yields std::uintmax_t; convert explicitly so this brace
    // initialisation is not a narrowing conversion where size_t is narrower.
    const size_t fileSize{ static_cast<size_t>(std::filesystem::file_size(filePath)) };
    // read() takes a signed std::streamsize count.
    const auto byteCount{ static_cast<std::streamsize>(fileSize) };
    {
        std::basic_ifstream<char> fileStream{ filePath, std::fstream::binary };
        std::vector<char> data;
        data.resize(fileSize);
        // steady_clock is monotonic, so the measured interval cannot be skewed
        // by a wall-clock adjustment the way a system_clock interval can.
        const auto start{ std::chrono::steady_clock::now() };
        fileStream.read(data.data(), byteCount);
        const auto end{ std::chrono::steady_clock::now() };
        // A failed or short read would make the timing below meaningless.
        if (!fileStream) {
            std::cerr << "Read as signed failed after " << fileStream.gcount() << " of " << fileSize << " bytes" << std::endl;
            return 1;
        }
        std::cout << "Duration read as signed: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms" << std::endl;
    }
    {
        std::basic_ifstream<unsigned char> fileStream{ filePath, std::fstream::binary };
        std::vector<unsigned char> data;
        data.resize(fileSize);
        const auto start{ std::chrono::steady_clock::now() };
        fileStream.read(data.data(), byteCount);
        const auto end{ std::chrono::steady_clock::now() };
        if (!fileStream) {
            std::cerr << "Read as unsigned failed after " << fileStream.gcount() << " of " << fileSize << " bytes" << std::endl;
            return 1;
        }
        std::cout << "Duration read as unsigned: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms" << std::endl;
    }
    return 0;
}
我不明白为什么在读取二进制文件时使用 basic_ifstream<unsigned char>
比 basic_ifstream<char>
慢 30 倍。
我已经追踪到 C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\include\fstream
文件,第 549 行:
virtual streamsize __CLR_OR_THIS_CALL xsgetn(_Elem* _Ptr, streamsize _Count) override {
// get _Count characters from stream
if constexpr (sizeof(_Elem) == 1) {
if (_Count <= 0) {
return 0;
}
if (_Pcvt) { // if we need a nontrivial codecvt transform, do the default expensive thing
return _Mysb::xsgetn(_Ptr, _Count);
}
对于 unsigned char,
它会走进注释里所说的 “default expensive thing” 分支。
再往下看,我看到了这个:
// Generic xsgetn: copies up to _Count characters from the stream into _Ptr
// and returns how many were actually stored (_Start_count - _Count).
virtual streamsize __CLR_OR_THIS_CALL xsgetn(_Elem* _Ptr, streamsize _Count) { // get _Count characters from stream
const streamsize _Start_count = _Count;
while (0 < _Count) {
// NOTE(review): _Gnavail() is presumably the number of characters currently
// available in the get area -- confirm against the rest of the class.
streamsize _Size = _Gnavail();
if (0 < _Size) { // copy from read buffer
// Never copy more than the caller still wants.
if (_Count < _Size) {
_Size = _Count;
}
// Bulk path: one _Traits::copy for the whole chunk, then advance both the
// destination pointer and the get pointer by the same amount.
_Traits::copy(_Ptr, gptr(), static_cast<size_t>(_Size));
_Ptr += _Size;
_Count -= _Size;
gbump(static_cast<int>(_Size));
} else {
// Nothing reported as available: fetch exactly one character through uflow().
// Whenever this branch is taken, the loop moves a single character per
// iteration.
const int_type _Meta = uflow();
if (_Traits::eq_int_type(_Traits::eof(), _Meta)) {
break; // end of file, quit
}
// get a single character
*_Ptr++ = _Traits::to_char_type(_Meta);
--_Count;
}
}
return _Start_count - _Count;
}
注意,这里是逐个字符处理的!而且那个函数几乎什么都没做:
// Converts an int_type value to the character type _Elem; the whole
// conversion is a single static_cast.
_NODISCARD static constexpr _Elem to_char_type(const int_type& _Meta) noexcept {
return static_cast<_Elem>(_Meta);
}
当您像这样设置读取缓冲区时,性能问题消失了:
{
    // Same unsigned-char read as before, but the file buffer is given an
    // explicit buffer via pubsetbuf() before the timed read() call.
    // Relies on filePath and fileSize from the enclosing scope.
    std::basic_ifstream<unsigned char> fileStream{ filePath, std::fstream::binary };
    std::vector<unsigned char> data;
    data.resize(fileSize);
    // Single source of truth for the buffer size (8192 elements).
    constexpr std::streamsize bufSize{ 8192 };
    unsigned char buf[bufSize];
    fileStream.rdbuf()->pubsetbuf(buf, bufSize);
    // steady_clock is monotonic, so the measured interval cannot be skewed
    // by a wall-clock adjustment the way a system_clock interval can.
    const auto start{ std::chrono::steady_clock::now() };
    fileStream.read(data.data(), static_cast<std::streamsize>(fileSize));
    const auto end{ std::chrono::steady_clock::now() };
    // Only report a duration for a read that actually delivered every byte.
    if (!fileStream) {
        std::cerr << "Read unsigned with buffer failed after " << fileStream.gcount() << " of " << fileSize << " bytes" << std::endl;
    } else {
        std::cout << "Duration read unsigned with buffer: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms" << std::endl;
    }
}
结果:
Duration read signed: 331 ms
Duration read unsigned: 10505 ms
Duration read unsigned with buffer: 223 ms
我在 MSVC 上用 std::ifstream
读取二进制文件时做了一些测试,发现 char
和 unsigned char
这两种数据类型之间存在很大的性能差异。
读取 512 MB 二进制文件时的结果:
Duration read as signed: 322 ms
Duration read as unsigned: 10552 ms
下面是我用来测试的代码:
#include <vector>
#include <iostream>
#include <fstream>
#include <chrono>
#include <limits>
#include <filesystem>
/// Benchmark entry point.
///
/// Reads the whole of "test.data" twice -- once through
/// std::basic_ifstream<char> and once through
/// std::basic_ifstream<unsigned char> -- and prints how long each read()
/// call took, in milliseconds.
///
/// @return 0 when both reads complete, 1 when a read fails or comes up short.
int main()
{
    const std::filesystem::path filePath{ "test.data" }; // 512 MB binary file
    // file_size() yields std::uintmax_t; convert explicitly so this brace
    // initialisation is not a narrowing conversion where size_t is narrower.
    const size_t fileSize{ static_cast<size_t>(std::filesystem::file_size(filePath)) };
    // read() takes a signed std::streamsize count.
    const auto byteCount{ static_cast<std::streamsize>(fileSize) };
    {
        std::basic_ifstream<char> fileStream{ filePath, std::fstream::binary };
        std::vector<char> data;
        data.resize(fileSize);
        // steady_clock is monotonic, so the measured interval cannot be skewed
        // by a wall-clock adjustment the way a system_clock interval can.
        const auto start{ std::chrono::steady_clock::now() };
        fileStream.read(data.data(), byteCount);
        const auto end{ std::chrono::steady_clock::now() };
        // A failed or short read would make the timing below meaningless.
        if (!fileStream) {
            std::cerr << "Read as signed failed after " << fileStream.gcount() << " of " << fileSize << " bytes" << std::endl;
            return 1;
        }
        std::cout << "Duration read as signed: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms" << std::endl;
    }
    {
        std::basic_ifstream<unsigned char> fileStream{ filePath, std::fstream::binary };
        std::vector<unsigned char> data;
        data.resize(fileSize);
        const auto start{ std::chrono::steady_clock::now() };
        fileStream.read(data.data(), byteCount);
        const auto end{ std::chrono::steady_clock::now() };
        if (!fileStream) {
            std::cerr << "Read as unsigned failed after " << fileStream.gcount() << " of " << fileSize << " bytes" << std::endl;
            return 1;
        }
        std::cout << "Duration read as unsigned: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms" << std::endl;
    }
    return 0;
}
我不明白为什么在读取二进制文件时使用 basic_ifstream<unsigned char>
比 basic_ifstream<char>
慢 30 倍。
我已经追踪到 C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30037\include\fstream
文件,第 549 行:
virtual streamsize __CLR_OR_THIS_CALL xsgetn(_Elem* _Ptr, streamsize _Count) override {
// get _Count characters from stream
if constexpr (sizeof(_Elem) == 1) {
if (_Count <= 0) {
return 0;
}
if (_Pcvt) { // if we need a nontrivial codecvt transform, do the default expensive thing
return _Mysb::xsgetn(_Ptr, _Count);
}
对于 unsigned char,
它会走进注释里所说的 “default expensive thing” 分支。
再往下看,我看到了这个:
// Generic xsgetn: copies up to _Count characters from the stream into _Ptr
// and returns how many were actually stored (_Start_count - _Count).
virtual streamsize __CLR_OR_THIS_CALL xsgetn(_Elem* _Ptr, streamsize _Count) { // get _Count characters from stream
const streamsize _Start_count = _Count;
while (0 < _Count) {
// NOTE(review): _Gnavail() is presumably the number of characters currently
// available in the get area -- confirm against the rest of the class.
streamsize _Size = _Gnavail();
if (0 < _Size) { // copy from read buffer
// Never copy more than the caller still wants.
if (_Count < _Size) {
_Size = _Count;
}
// Bulk path: one _Traits::copy for the whole chunk, then advance both the
// destination pointer and the get pointer by the same amount.
_Traits::copy(_Ptr, gptr(), static_cast<size_t>(_Size));
_Ptr += _Size;
_Count -= _Size;
gbump(static_cast<int>(_Size));
} else {
// Nothing reported as available: fetch exactly one character through uflow().
// Whenever this branch is taken, the loop moves a single character per
// iteration.
const int_type _Meta = uflow();
if (_Traits::eq_int_type(_Traits::eof(), _Meta)) {
break; // end of file, quit
}
// get a single character
*_Ptr++ = _Traits::to_char_type(_Meta);
--_Count;
}
}
return _Start_count - _Count;
}
注意,这里是逐个字符处理的!而且那个函数几乎什么都没做:
// Converts an int_type value to the character type _Elem; the whole
// conversion is a single static_cast.
_NODISCARD static constexpr _Elem to_char_type(const int_type& _Meta) noexcept {
return static_cast<_Elem>(_Meta);
}
当您像这样设置读取缓冲区时,性能问题消失了:
{
    // Same unsigned-char read as before, but the file buffer is given an
    // explicit buffer via pubsetbuf() before the timed read() call.
    // Relies on filePath and fileSize from the enclosing scope.
    std::basic_ifstream<unsigned char> fileStream{ filePath, std::fstream::binary };
    std::vector<unsigned char> data;
    data.resize(fileSize);
    // Single source of truth for the buffer size (8192 elements).
    constexpr std::streamsize bufSize{ 8192 };
    unsigned char buf[bufSize];
    fileStream.rdbuf()->pubsetbuf(buf, bufSize);
    // steady_clock is monotonic, so the measured interval cannot be skewed
    // by a wall-clock adjustment the way a system_clock interval can.
    const auto start{ std::chrono::steady_clock::now() };
    fileStream.read(data.data(), static_cast<std::streamsize>(fileSize));
    const auto end{ std::chrono::steady_clock::now() };
    // Only report a duration for a read that actually delivered every byte.
    if (!fileStream) {
        std::cerr << "Read unsigned with buffer failed after " << fileStream.gcount() << " of " << fileSize << " bytes" << std::endl;
    } else {
        std::cout << "Duration read unsigned with buffer: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms" << std::endl;
    }
}
结果:
Duration read signed: 331 ms
Duration read unsigned: 10505 ms
Duration read unsigned with buffer: 223 ms