UTF8 字符串转 int
UTF8 string to int
我一直在努力尝试从 UTF8 文件中提取一个整数:
#include <iostream>
#include <fstream>
#include <sstream>
using namespace std;
// Reads the first line of "UTF8.txt" as an integer, then prints that integer
// followed by the second line of the file on standard output.
int main()
{
    std::ifstream input("UTF8.txt");
    if (!input.is_open())
        return 0;

    // First line: expected to hold the number.
    std::string text;
    std::getline(input, text);
    std::istringstream parser(text);
    int value;
    parser >> value;
    if (parser.fail())
    {
        std::cout << "Error parsing" << std::endl;
        parser.clear();
    }

    // Second line: echoed unchanged after the number.
    std::getline(input, text);
    std::cout << value << std::endl << text << std::endl;
    input.close();
}
该文件包含两行:“42”和“è_é”,并在记事本中保存为 UTF8。当文件是 ANSI 时,上面的方法有效,但当它是 Unicode 时失败。我尝试了很多方法,最有希望的是设置语言环境,但我希望程序独立于计算机的语言环境(即即使 PC 是美国的也能读取汉字)。
老实说,我现在没主意了。如果可能,我想避免使用 Qt 中的 CString。
更新
下面的代码会显示“0”和 "Error parsing",因为文件开头有一个奇怪的字符。如果在数字之前加一个空行(读取时将其丢弃),程序就能正常工作,但我无法在最终程序中更改文件。控制台中带重音符号的字符显示不正确,但当我把输出写入文件时一切正常,而这正是我所需要的。所以问题只出在文件开头!
#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>
// Opens "UTF8.srt" as a narrow stream, imbues a locale that carries a
// codecvt_utf8<wchar_t> facet with std::consume_header, then parses the first
// line as an int and prints it followed by the second line.
int main()
{
    std::ifstream input("UTF8.srt");
    // NOTE(review): the facet converts to wchar_t while this stream's character
    // type is char, so it presumably never takes part in the read -- confirm.
    const std::locale utf8_locale(
        input.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    input.imbue(utf8_locale);
    if (!input.is_open())
        return 0;

    std::string text;
    std::getline(input, text);
    std::istringstream parser{text};
    int value;
    parser >> value;
    if (parser.fail()) {
        std::cout << "Error parsing" << std::endl;
        parser.clear();
    }

    std::getline(input, text);
    std::cout << value << std::endl << text << std::endl;
    input.close();
}
解决方案
以下代码可以正常工作,输入文件内容如下:
5
bla bla é_è
6
truc è_é
代码:
#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>
// Do not get used to it:
// using namespace std;
// Returns a pointer just past a leading UTF-8 byte order mark (EF BB BF) when
// the first `size` bytes of `s` begin with one; otherwise returns `s` as is.
inline const char* skip_utf8_bom(const char* s, std::size_t size)
{
    // Too short to hold the three BOM bytes.
    if(size < 3)
        return s;
    const bool has_bom = s[0] == char(0xEF)
                      && s[1] == char(0xBB)
                      && s[2] == char(0xBF);
    return has_bom ? s + 3 : s;
}
// Reads the next line of `in` into `line`. Prints "Error reading line" and
// returns false when no line could be read, so callers never reuse the stale
// contents of `line`.
static bool read_line(std::istream& in, std::string& line)
{
    if(std::getline(in, line))
        return true;
    std::cout << "Error reading line" << std::endl;
    return false;
}

// Parses `line` as an int after skipping a leading UTF-8 BOM, prints it to
// std::cout prefixed with `label`, and writes it on its own line to `out`.
// Prints "Error parsing" first when the extraction fails.
static void copy_number_line(const std::string& line, const char* label, std::ostream& out)
{
    const char* linePtr = skip_utf8_bom(line.c_str(), line.size());
    std::istringstream input(linePtr);
    int a = -1;
    input >> a;
    if( ! input) {
        std::cout << "Error parsing" << std::endl;
    }
    std::cout << label << a << std::endl;
    out << a << std::endl;
}

// Copies "UTF8.txt" to "UTF8_copy.txt" following the fixed layout
// number / text / empty line / number / text, echoing both numbers to
// std::cout. Returns 1 if a file cannot be opened or a line is missing.
int main()
{
    std::ifstream file("UTF8.txt");
    std::ofstream fileO("UTF8_copy.txt");
    if(!file || !fileO) {
        std::cout << "Error opening files" << std::endl;
        return 1;
    }

    std::string line;

    //Parse the first number
    if(!read_line(file, line))
        return 1;
    copy_number_line(line, "Number 1: ", fileO);

    //Copy the following line as is
    if(!read_line(file, line))
        return 1;
    fileO << line << std::endl;

    //Discard empty line, copy it in the output file
    if(!read_line(file, line))
        return 1;
    fileO << std::endl;

    //Parse the second number
    if(!read_line(file, line))
        return 1;
    copy_number_line(line, "Number 2: ", fileO);

    //Copy the following line as is
    if(!read_line(file, line))
        return 1;
    fileO << line << std::endl;

    file.close();
    fileO.close();
    return 0;
}
上面链接中的示例:
#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
// Writes a BOM-prefixed UTF-8 sample to "text.txt", reads it back through a
// wide stream whose codecvt_utf8 facet uses std::consume_header, and prints
// every character read to std::cout using std::hex and std::showbase.
int main()
{
    // UTF-8 data with BOM; the scope closes the file before it is re-read
    {
        std::ofstream sample("text.txt");
        sample << u8"\ufeffz\u6c34\U0001d10b";
    }

    // read the UTF8 file, skipping the BOM
    std::wifstream reader("text.txt");
    const std::locale utf8_locale(
        reader.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    reader.imbue(utf8_locale);

    wchar_t ch;
    while (reader.get(ch)) {
        std::cout << std::hex << std::showbase << ch << '\n';
    }
}
注意 std::consume_header 设置。
针对你的问题,可以改写为:
#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>
// Reads "UTF8.txt" as raw bytes, removes a leading UTF-8 byte order mark from
// the first line, parses that line as an int, and prints it followed by the
// second line.
int main()
{
    std::ifstream file("UTF8.txt");
    if (file.is_open()) {
        std::string line;
        std::getline(file,line);
        // std::codecvt_utf8 is only specified for wchar_t, char16_t and
        // char32_t, so a char stream cannot rely on std::consume_header.
        // Drop the three BOM bytes (EF BB BF) by hand instead.
        if (line.compare(0, 3, "\xEF\xBB\xBF") == 0) {
            line.erase(0, 3);
        }
        std::istringstream ss{line};
        int a = 0;
        ss >> a;
        if (ss.fail()) {
            std::cout << "Error parsing" << std::endl;
            ss.clear();
        }
        std::getline(file,line);
        std::cout << a << std::endl << line << std::endl;
        file.close();
    }
}
或者使用 wchar_t:
#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>
// Opens "UTF8.txt" as a wide stream imbued with a codecvt_utf8<wchar_t> facet
// using std::consume_header, parses the first line as an int, and prints it
// followed by the second line on std::wcout.
int main()
{
    std::wifstream input("UTF8.txt");
    const std::locale utf8_locale(
        input.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    input.imbue(utf8_locale);
    if (!input.is_open())
        return 0;

    std::wstring text;
    std::getline(input, text);
    std::wistringstream parser{text};
    int value;
    parser >> value;
    if (parser.fail()) {
        std::wcout << L"Error parsing" << std::endl;
        parser.clear();
    }

    std::getline(input, text);
    std::wcout << value << std::endl << text << std::endl;
    input.close();
}
只需跳过前导 BOM(字节顺序标记):
#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>
// Do not get used to it:
// using namespace std;
// Returns a pointer just past a leading UTF-8 byte order mark (EF BB BF) when
// the first `size` bytes of `s` begin with one; otherwise returns `s` as is.
inline const char* skip_utf8_bom(const char* s, std::size_t size)
{
    // Too short to hold the three BOM bytes.
    if(size < 3)
        return s;
    const bool has_bom = s[0] == char(0xEF)
                      && s[1] == char(0xBB)
                      && s[2] == char(0xBF);
    return has_bom ? s + 3 : s;
}
// Runs the BOM-skipping parse on an in-memory sample (BOM, "42", then an
// accented line): parses the first line as an int and prints the number
// followed by the second line.
int main()
{
    std::istringstream source(u8"\xEF\xBB\xBF""42\n\u00E8_\u00E9\n");

    std::string text;
    std::getline(source, text);

    // Parse starting at the first byte after any BOM.
    std::istringstream parser(skip_utf8_bom(text.c_str(), text.size()));
    int value = -1;
    parser >> value;
    if(parser.fail()) {
        std::cout << "Error parsing" << std::endl;
    }

    std::getline(source, text);
    std::cout << value << std::endl << text << std::endl;
}
我一直在努力尝试从 UTF8 文件中提取一个整数:
#include <iostream>
#include <fstream>
#include <sstream>
using namespace std;
// Reads the first line of "UTF8.txt" as an integer, then prints that integer
// followed by the second line of the file on standard output.
int main()
{
    std::ifstream input("UTF8.txt");
    if (!input.is_open())
        return 0;

    // First line: expected to hold the number.
    std::string text;
    std::getline(input, text);
    std::istringstream parser(text);
    int value;
    parser >> value;
    if (parser.fail())
    {
        std::cout << "Error parsing" << std::endl;
        parser.clear();
    }

    // Second line: echoed unchanged after the number.
    std::getline(input, text);
    std::cout << value << std::endl << text << std::endl;
    input.close();
}
该文件包含两行:“42”和“è_é”,并在记事本中保存为 UTF8。当文件是 ANSI 时,上面的方法有效,但当它是 Unicode 时失败。我尝试了很多方法,最有希望的是设置语言环境,但我希望程序独立于计算机的语言环境(即即使 PC 是美国的也能读取汉字)。 老实说,我现在没主意了。如果可能,我想避免使用 Qt 中的 CString。
更新
下面的代码会显示“0”和 "Error parsing",因为文件开头有一个奇怪的字符。如果在数字之前加一个空行(读取时将其丢弃),程序就能正常工作,但我无法在最终程序中更改文件。控制台中带重音符号的字符显示不正确,但当我把输出写入文件时一切正常,而这正是我所需要的。所以问题只出在文件开头!
#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>
// Opens "UTF8.srt" as a narrow stream, imbues a locale that carries a
// codecvt_utf8<wchar_t> facet with std::consume_header, then parses the first
// line as an int and prints it followed by the second line.
int main()
{
    std::ifstream input("UTF8.srt");
    // NOTE(review): the facet converts to wchar_t while this stream's character
    // type is char, so it presumably never takes part in the read -- confirm.
    const std::locale utf8_locale(
        input.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    input.imbue(utf8_locale);
    if (!input.is_open())
        return 0;

    std::string text;
    std::getline(input, text);
    std::istringstream parser{text};
    int value;
    parser >> value;
    if (parser.fail()) {
        std::cout << "Error parsing" << std::endl;
        parser.clear();
    }

    std::getline(input, text);
    std::cout << value << std::endl << text << std::endl;
    input.close();
}
解决方案
以下代码可以正常工作,输入文件内容如下:
5
bla bla é_è
6
truc è_é
代码:
#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>
// Do not get used to it:
// using namespace std;
// Returns a pointer just past a leading UTF-8 byte order mark (EF BB BF) when
// the first `size` bytes of `s` begin with one; otherwise returns `s` as is.
inline const char* skip_utf8_bom(const char* s, std::size_t size)
{
    // Too short to hold the three BOM bytes.
    if(size < 3)
        return s;
    const bool has_bom = s[0] == char(0xEF)
                      && s[1] == char(0xBB)
                      && s[2] == char(0xBF);
    return has_bom ? s + 3 : s;
}
// Reads the next line of `in` into `line`. Prints "Error reading line" and
// returns false when no line could be read, so callers never reuse the stale
// contents of `line`.
static bool read_line(std::istream& in, std::string& line)
{
    if(std::getline(in, line))
        return true;
    std::cout << "Error reading line" << std::endl;
    return false;
}

// Parses `line` as an int after skipping a leading UTF-8 BOM, prints it to
// std::cout prefixed with `label`, and writes it on its own line to `out`.
// Prints "Error parsing" first when the extraction fails.
static void copy_number_line(const std::string& line, const char* label, std::ostream& out)
{
    const char* linePtr = skip_utf8_bom(line.c_str(), line.size());
    std::istringstream input(linePtr);
    int a = -1;
    input >> a;
    if( ! input) {
        std::cout << "Error parsing" << std::endl;
    }
    std::cout << label << a << std::endl;
    out << a << std::endl;
}

// Copies "UTF8.txt" to "UTF8_copy.txt" following the fixed layout
// number / text / empty line / number / text, echoing both numbers to
// std::cout. Returns 1 if a file cannot be opened or a line is missing.
int main()
{
    std::ifstream file("UTF8.txt");
    std::ofstream fileO("UTF8_copy.txt");
    if(!file || !fileO) {
        std::cout << "Error opening files" << std::endl;
        return 1;
    }

    std::string line;

    //Parse the first number
    if(!read_line(file, line))
        return 1;
    copy_number_line(line, "Number 1: ", fileO);

    //Copy the following line as is
    if(!read_line(file, line))
        return 1;
    fileO << line << std::endl;

    //Discard empty line, copy it in the output file
    if(!read_line(file, line))
        return 1;
    fileO << std::endl;

    //Parse the second number
    if(!read_line(file, line))
        return 1;
    copy_number_line(line, "Number 2: ", fileO);

    //Copy the following line as is
    if(!read_line(file, line))
        return 1;
    fileO << line << std::endl;

    file.close();
    fileO.close();
    return 0;
}
上面链接中的示例:
#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
// Writes a BOM-prefixed UTF-8 sample to "text.txt", reads it back through a
// wide stream whose codecvt_utf8 facet uses std::consume_header, and prints
// every character read to std::cout using std::hex and std::showbase.
int main()
{
    // UTF-8 data with BOM; the scope closes the file before it is re-read
    {
        std::ofstream sample("text.txt");
        sample << u8"\ufeffz\u6c34\U0001d10b";
    }

    // read the UTF8 file, skipping the BOM
    std::wifstream reader("text.txt");
    const std::locale utf8_locale(
        reader.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    reader.imbue(utf8_locale);

    wchar_t ch;
    while (reader.get(ch)) {
        std::cout << std::hex << std::showbase << ch << '\n';
    }
}
注意 std::consume_header 设置。
针对你的问题,可以改写为:
#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>
// Reads "UTF8.txt" as raw bytes, removes a leading UTF-8 byte order mark from
// the first line, parses that line as an int, and prints it followed by the
// second line.
int main()
{
    std::ifstream file("UTF8.txt");
    if (file.is_open()) {
        std::string line;
        std::getline(file,line);
        // std::codecvt_utf8 is only specified for wchar_t, char16_t and
        // char32_t, so a char stream cannot rely on std::consume_header.
        // Drop the three BOM bytes (EF BB BF) by hand instead.
        if (line.compare(0, 3, "\xEF\xBB\xBF") == 0) {
            line.erase(0, 3);
        }
        std::istringstream ss{line};
        int a = 0;
        ss >> a;
        if (ss.fail()) {
            std::cout << "Error parsing" << std::endl;
            ss.clear();
        }
        std::getline(file,line);
        std::cout << a << std::endl << line << std::endl;
        file.close();
    }
}
或者使用 wchar_t:
#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>
// Opens "UTF8.txt" as a wide stream imbued with a codecvt_utf8<wchar_t> facet
// using std::consume_header, parses the first line as an int, and prints it
// followed by the second line on std::wcout.
int main()
{
    std::wifstream input("UTF8.txt");
    const std::locale utf8_locale(
        input.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    input.imbue(utf8_locale);
    if (!input.is_open())
        return 0;

    std::wstring text;
    std::getline(input, text);
    std::wistringstream parser{text};
    int value;
    parser >> value;
    if (parser.fail()) {
        std::wcout << L"Error parsing" << std::endl;
        parser.clear();
    }

    std::getline(input, text);
    std::wcout << value << std::endl << text << std::endl;
    input.close();
}
只需跳过前导 BOM(字节顺序标记):
#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>
// Do not get used to it:
// using namespace std;
// Returns a pointer just past a leading UTF-8 byte order mark (EF BB BF) when
// the first `size` bytes of `s` begin with one; otherwise returns `s` as is.
inline const char* skip_utf8_bom(const char* s, std::size_t size)
{
    // Too short to hold the three BOM bytes.
    if(size < 3)
        return s;
    const bool has_bom = s[0] == char(0xEF)
                      && s[1] == char(0xBB)
                      && s[2] == char(0xBF);
    return has_bom ? s + 3 : s;
}
// Runs the BOM-skipping parse on an in-memory sample (BOM, "42", then an
// accented line): parses the first line as an int and prints the number
// followed by the second line.
int main()
{
    std::istringstream source(u8"\xEF\xBB\xBF""42\n\u00E8_\u00E9\n");

    std::string text;
    std::getline(source, text);

    // Parse starting at the first byte after any BOM.
    std::istringstream parser(skip_utf8_bom(text.c_str(), text.size()));
    int value = -1;
    parser >> value;
    if(parser.fail()) {
        std::cout << "Error parsing" << std::endl;
    }

    std::getline(source, text);
    std::cout << value << std::endl << text << std::endl;
}