C++11 中的 UTF 转换函数
UTF conversion functions in C++11
我正在寻找用于在 C++11 中执行 UTF 字符转换的函数集合。它应该包括与 utf8、utf16 和 utf32 之间的任何转换。识别字节顺序标记的功能也会有所帮助。
更新:此处列出的函数在 GitHub 存储库中维护(包括 .hpp、.cpp 和测试文件)。部分 UTF-16 函数因无法正常工作已被禁用,utf.test.cpp 文件中的 "banana" 测试可以重现该问题。
还包含一个用于识别字节顺序标记的 "read_with_bom" 函数。
// UTF-8 <-> UTF-16 / UTF-32 conversions built on std::wstring_convert.
//
// The UTF-16 functions use std::codecvt_utf8_utf16. std::codecvt_utf8<char16_t>
// only converts between UTF-8 and UCS-2, so it cannot produce or consume
// surrogate pairs and fails for every code point above U+FFFF.
// The UTF-32 functions use std::codecvt_utf8<char32_t>, which converts between
// UTF-8 and UTF-32.
//
// All four functions report malformed input by letting std::wstring_convert
// throw std::range_error (no error string is configured).
#if _MSC_VER == 1900 //work around for bug in MS Visual C++ 2015 https://social.msdn.microsoft.com/Forums/en-US/8f40dcd8-c67f-4eba-9134-a19b9178e481/vs-2015-rc-linker-stdcodecvt-error?forum=vcgeneral
// In this branch the facets are instantiated with int16_t / int32_t instead of
// char16_t / char32_t and the string data is reinterpreted accordingly.

/// Converts a UTF-16 string (surrogate pairs included) to UTF-8.
std::string to_utf8(const std::u16string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<int16_t>, int16_t> convert;
    auto p = reinterpret_cast<const int16_t *>(s.data());
    return convert.to_bytes(p, p + s.size());
}

/// Converts a UTF-32 string to UTF-8.
std::string to_utf8(const std::u32string &s)
{
    std::wstring_convert<std::codecvt_utf8<int32_t>, int32_t> convert;
    auto p = reinterpret_cast<const int32_t *>(s.data());
    return convert.to_bytes(p, p + s.size());
}

/// Converts a UTF-8 string to UTF-16, emitting surrogate pairs above U+FFFF.
std::u16string to_utf16(const std::string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<int16_t>, int16_t> convert;
    auto asInt = convert.from_bytes(s);
    return std::u16string(reinterpret_cast<char16_t const *>(asInt.data()), asInt.length());
}

/// Converts a UTF-8 string to UTF-32.
std::u32string to_utf32(const std::string &s)
{
    std::wstring_convert<std::codecvt_utf8<int32_t>, int32_t> convert;
    auto asInt = convert.from_bytes(s);
    return std::u32string(reinterpret_cast<char32_t const *>(asInt.data()), asInt.length());
}
#else
/// Converts a UTF-16 string (surrogate pairs included) to UTF-8.
std::string to_utf8(const std::u16string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> conv;
    return conv.to_bytes(s);
}

/// Converts a UTF-32 string to UTF-8.
std::string to_utf8(const std::u32string &s)
{
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
    return conv.to_bytes(s);
}

/// Converts a UTF-8 string to UTF-16, emitting surrogate pairs above U+FFFF.
std::u16string to_utf16(const std::string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert;
    return convert.from_bytes(s);
}

/// Converts a UTF-8 string to UTF-32.
std::u32string to_utf32(const std::string &s)
{
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
    return conv.from_bytes(s);
}
#endif
std::u16string to_utf16(const std::u32string &s)
{
return to_utf16(to_utf8(s));
}
std::u32string to_utf32(const std::u16string &s) {
return to_utf32(to_utf8(s));
}
/**
 * Reads the entire stream, detects a leading byte-order mark (BOM) and returns
 * the remaining content decoded to UTF-32.
 *
 * Recognised BOMs: UTF-32 BE/LE, UTF-16 BE/LE and UTF-8. Input without a BOM is
 * passed to to_utf32() as UTF-8 (which also covers plain ASCII). The BOM itself
 * is not part of the returned string.
 *
 * @param src  stream to read until end-of-file.
 * @return     the decoded text as UTF-32.
 * @throws std::logic_error if the payload of a UTF-16 / UTF-32 input is not a
 *         whole number of code units.
 */
std::u32string read_with_bom(std::istream & src)
{
    enum encoding {
        encoding_utf32be = 0,
        encoding_utf32le,
        encoding_utf16be,
        encoding_utf16le,
        encoding_utf8,
        encoding_ascii,
    };
    // Indexed by `encoding`. The 4-byte UTF-32 marks must be tested before the
    // 2-byte UTF-16 marks because FF FE 00 00 starts with FF FE.
    const std::string boms[] = {
        std::string("\x00\x00\xFE\xFF", 4),
        std::string("\xFF\xFE\x00\x00", 4),
        std::string("\xFE\xFF", 2),
        std::string("\xFF\xFE", 2),
        std::string("\xEF\xBB\xBF", 3)
    };
    typedef std::string::size_type size_type;

    std::string buffer((std::istreambuf_iterator<char>(src)), std::istreambuf_iterator<char>());
    encoding enc = encoding_ascii;
    for (size_type i = 0; i < sizeof(boms) / sizeof(boms[0]); ++i) {
        const std::string &bom = boms[i];
        if (buffer.compare(0, bom.length(), bom) == 0) {
            enc = static_cast<encoding>(i);
            buffer.erase(0, bom.length());
            break;
        }
    }

    // Plain char may be signed: a byte >= 0x80 would sign-extend when promoted
    // and set all the high bits of the combined value. Always go through
    // unsigned char before shifting.
    const auto byte_at = [&buffer](size_type pos) -> unsigned int {
        return static_cast<unsigned char>(buffer[pos]);
    };

    const auto decode_utf32 = [&](bool big_endian) -> std::u32string {
        if (buffer.length() % 4 != 0) {
            throw std::logic_error("size in bytes must be a multiple of 4");
        }
        std::u32string out(buffer.length() / 4, 0);
        for (size_type i = 0; i < out.size(); ++i) {
            const size_type base = i * 4;
            const unsigned int unit = big_endian
                ? (byte_at(base) << 24) | (byte_at(base + 1) << 16) | (byte_at(base + 2) << 8) | byte_at(base + 3)
                : (byte_at(base + 3) << 24) | (byte_at(base + 2) << 16) | (byte_at(base + 1) << 8) | byte_at(base);
            out[i] = static_cast<char32_t>(unit);
        }
        return out;
    };

    const auto decode_utf16 = [&](bool big_endian) -> std::u16string {
        if (buffer.length() % 2 != 0) {
            throw std::logic_error("size in bytes must be a multiple of 2");
        }
        std::u16string out(buffer.length() / 2, 0);
        for (size_type i = 0; i < out.size(); ++i) {
            const size_type base = i * 2;
            const unsigned int unit = big_endian
                ? (byte_at(base) << 8) | byte_at(base + 1)
                : (byte_at(base + 1) << 8) | byte_at(base);
            out[i] = static_cast<char16_t>(unit);
        }
        return out;
    };

    switch (enc) {
    case encoding_utf32be:
        return decode_utf32(true);
    case encoding_utf32le:
        return decode_utf32(false);
    case encoding_utf16be:
        return to_utf32(decode_utf16(true));
    case encoding_utf16le:
        return to_utf32(decode_utf16(false));
    default:
        // UTF-8 with its BOM removed, or BOM-less input treated as UTF-8.
        return to_utf32(buffer);
    }
}
我已经编写了一个 utf_ranges 库来完成这项工作。它使用 Range-V3 和 C++14。
它提供了在三种主要 UTF 编码之间相互转换的视图(view)和操作(action)(如果您熟悉 Range-V3 的术语),可以消费和生成字节顺序标记(BOM),并根据 BOM 执行字节序转换。例如,将字节序未知的 UTF-16 文件读入 UTF-8 编码的 std::string,并把七种 Unicode 行结束符中的任意一种转换为 \n,如下所示:
// Usage example for the utf_ranges library (Range-V3 based, not part of the
// standard library): reads a UTF-16 file of unknown endianness into a UTF-8
// std::string and normalises line endings to \n.
// NOTE(review): the utf:: names come from that library and are not defined here.
std::ifstream source{path, std::ios::binary};
std::string str = utf::istreambuf<char16_t>(source)
| utf::view::consume_bom
| utf::view::utf8
| utf::view::line_end_convert;
这是我来自 Baby X 的 UTF-8 代码
(https://github.com/MalcolmMcLean/babyx)
/*
 * offsetsFromUTF8[n] is subtracted by bbx_utf8_getch after it has summed the
 * n + 1 raw bytes of a sequence with 6-bit shifts. Each entry equals the
 * lead-byte and continuation-byte marker bits accumulated that way, e.g.
 * entry 1 is (0xC0 << 6) + 0x80 and entry 2 is (0xE0 << 12) + (0x80 << 6) + 0x80
 * (taken modulo 2^32 for the last entry).
 */
static const unsigned int offsetsFromUTF8[6] =
{
0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL
};
/*
 * Number of bytes that follow a given first byte, indexed by that byte's value:
 * 0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2, 0xF0-0xF7 -> 3,
 * 0xF8-0xFB -> 4, 0xFC-0xFF -> 5.
 * Continuation bytes (0x80-0xBF) also map to 0.
 */
static const unsigned char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
/*
  Test whether a NUL-terminated string is well-formed UTF-8.
  Params: str - the NUL-terminated string to check.
  Returns: 1 if every sequence has a legal length (1 to 4 bytes), correct
           continuation bytes, the shortest possible encoding and a value
           below 0x110000; otherwise 0.
  Notes: encoded surrogates (U+D800..U+DFFF) are not rejected.
 */
int bbx_isutf8z(const char *str)
{
    int len = 0;
    int pos = 0;
    int nb;
    int i;
    int ch;

    while(str[len])
        len++;
    while(pos < len)
    {
        nb = bbx_utf8_skip(str);
        if(nb < 1 || nb > 4)
            return 0;
        /* the sequence must not run past the terminating NUL */
        if(pos + nb > len)
            return 0;
        for(i=1;i<nb;i++)
            if( (str[i] & 0xC0) != 0x80 )
                return 0;
        ch = bbx_utf8_getch(str);
        /* the sequence length must be the shortest one able to hold ch;
           this rejects overlong forms and stray continuation bytes */
        if(ch < 0x80)
        {
            if(nb != 1)
                return 0;
        }
        else if(ch < 0x800) /* two bytes carry 11 bits: up to U+07FF */
        {
            if(nb != 2)
                return 0;
        }
        else if(ch < 0x10000)
        {
            if(nb != 3)
                return 0;
        }
        else if(ch < 0x110000)
        {
            if(nb != 4)
                return 0;
        }
        else
        {
            /* beyond U+10FFFF: not a Unicode code point */
            return 0;
        }
        pos += nb;
        str += nb;
    }
    return 1;
}
/*
  Get the length in bytes of the sequence starting at utf8.
  Params: utf8 - pointer to the first byte of a sequence.
  Returns: 1 plus the trailing-byte count that the lookup table gives for
           the first byte.
 */
int bbx_utf8_skip(const char *utf8)
{
    const unsigned char lead = (unsigned char) utf8[0];

    return 1 + trailingBytesForUTF8[lead];
}
/*
  Decode the character at the start of a UTF-8 string.
  Params: utf8 - pointer to the first byte of a sequence.
  Returns: the decoded value.
  Notes: no validation is done. When the first byte announces more than
         three trailing bytes, no input bytes are combined and only the
         table offset is subtracted.
 */
int bbx_utf8_getch(const char *utf8)
{
    int code = 0;
    int k;
    const int trailing = trailingBytesForUTF8[(unsigned char)utf8[0]];

    if (trailing <= 3)
    {
        /* sum the raw bytes, making room for 6 more bits before each
           byte after the first */
        for (k = 0; k <= trailing; k++)
        {
            if (k > 0)
                code <<= 6;
            code += (unsigned char)utf8[k];
        }
    }
    /* strip the marker bits that were summed together with the payload */
    code -= offsetsFromUTF8[trailing];
    return code;
}
/*
  Write a character to a buffer as UTF-8.
  Params: out - destination buffer, needs room for up to 4 bytes.
          ch - the value to encode.
  Returns: number of bytes written, 0 if ch is 0x110000 or above
           (nothing is written in that case).
  Notes: no terminating NUL is appended.
 */
int bbx_utf8_putch(char *out, int ch)
{
    if (ch < 0x80)
    {
        out[0] = (char)ch;
        return 1;
    }
    if (ch < 0x800)
    {
        out[0] = (char)(0xC0 | (ch >> 6));
        out[1] = (char)(0x80 | (ch & 0x3F));
        return 2;
    }
    if (ch < 0x10000)
    {
        out[0] = (char)(0xE0 | (ch >> 12));
        out[1] = (char)(0x80 | ((ch >> 6) & 0x3F));
        out[2] = (char)(0x80 | (ch & 0x3F));
        return 3;
    }
    if (ch < 0x110000)
    {
        out[0] = (char)(0xF0 | (ch >> 18));
        out[1] = (char)(0x80 | ((ch >> 12) & 0x3F));
        out[2] = (char)(0x80 | ((ch >> 6) & 0x3F));
        out[3] = (char)(0x80 | (ch & 0x3F));
        return 4;
    }
    return 0;
}
/*
  Get the number of bytes needed to encode a character as UTF-8.
  Params: ch - the value to measure.
  Returns: 1 to 4, or 0 if ch is 0x110000 or above.
 */
int bbx_utf8_charwidth(int ch)
{
    if (ch >= 0x110000)
        return 0;
    if (ch >= 0x10000)
        return 4;
    if (ch >= 0x800)
        return 3;
    if (ch >= 0x80)
        return 2;
    return 1;
}
/*
  Count the characters in a NUL-terminated UTF-8 string.
  Params: utf8 - the NUL-terminated string.
  Returns: number of sequences found, stepping by the length that
           bbx_utf8_skip reports for each first byte.
  Notes: a sequence cut short by the end of the string counts as one
         character.
 */
int bbx_utf8_Nchars(const char *utf8)
{
    int answer = 0;
    int nb;

    while(*utf8)
    {
        nb = bbx_utf8_skip(utf8);
        /* advance byte by byte and stop at the terminating NUL, so that a
           truncated sequence cannot move the pointer past the end of the
           string */
        while(nb > 0 && *utf8)
        {
            utf8++;
            nb--;
        }
        answer++;
    }
    return answer;
}
我正在寻找用于在 C++11 中执行 UTF 字符转换的函数集合。它应该包括与 utf8、utf16 和 utf32 之间的任何转换。识别字节顺序标记的功能也会有所帮助。
更新:此处列出的函数在 GitHub 存储库中维护(包括 .hpp、.cpp 和测试文件)。部分 UTF-16 函数因无法正常工作已被禁用,utf.test.cpp 文件中的 "banana" 测试可以重现该问题。
还包含一个用于识别字节顺序标记的 "read_with_bom" 函数。
// UTF-8 <-> UTF-16 / UTF-32 conversions built on std::wstring_convert.
//
// The UTF-16 functions use std::codecvt_utf8_utf16. std::codecvt_utf8<char16_t>
// only converts between UTF-8 and UCS-2, so it cannot produce or consume
// surrogate pairs and fails for every code point above U+FFFF.
// The UTF-32 functions use std::codecvt_utf8<char32_t>, which converts between
// UTF-8 and UTF-32.
//
// All four functions report malformed input by letting std::wstring_convert
// throw std::range_error (no error string is configured).
#if _MSC_VER == 1900 //work around for bug in MS Visual C++ 2015 https://social.msdn.microsoft.com/Forums/en-US/8f40dcd8-c67f-4eba-9134-a19b9178e481/vs-2015-rc-linker-stdcodecvt-error?forum=vcgeneral
// In this branch the facets are instantiated with int16_t / int32_t instead of
// char16_t / char32_t and the string data is reinterpreted accordingly.

/// Converts a UTF-16 string (surrogate pairs included) to UTF-8.
std::string to_utf8(const std::u16string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<int16_t>, int16_t> convert;
    auto p = reinterpret_cast<const int16_t *>(s.data());
    return convert.to_bytes(p, p + s.size());
}

/// Converts a UTF-32 string to UTF-8.
std::string to_utf8(const std::u32string &s)
{
    std::wstring_convert<std::codecvt_utf8<int32_t>, int32_t> convert;
    auto p = reinterpret_cast<const int32_t *>(s.data());
    return convert.to_bytes(p, p + s.size());
}

/// Converts a UTF-8 string to UTF-16, emitting surrogate pairs above U+FFFF.
std::u16string to_utf16(const std::string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<int16_t>, int16_t> convert;
    auto asInt = convert.from_bytes(s);
    return std::u16string(reinterpret_cast<char16_t const *>(asInt.data()), asInt.length());
}

/// Converts a UTF-8 string to UTF-32.
std::u32string to_utf32(const std::string &s)
{
    std::wstring_convert<std::codecvt_utf8<int32_t>, int32_t> convert;
    auto asInt = convert.from_bytes(s);
    return std::u32string(reinterpret_cast<char32_t const *>(asInt.data()), asInt.length());
}
#else
/// Converts a UTF-16 string (surrogate pairs included) to UTF-8.
std::string to_utf8(const std::u16string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> conv;
    return conv.to_bytes(s);
}

/// Converts a UTF-32 string to UTF-8.
std::string to_utf8(const std::u32string &s)
{
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
    return conv.to_bytes(s);
}

/// Converts a UTF-8 string to UTF-16, emitting surrogate pairs above U+FFFF.
std::u16string to_utf16(const std::string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert;
    return convert.from_bytes(s);
}

/// Converts a UTF-8 string to UTF-32.
std::u32string to_utf32(const std::string &s)
{
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
    return conv.from_bytes(s);
}
#endif
std::u16string to_utf16(const std::u32string &s)
{
return to_utf16(to_utf8(s));
}
std::u32string to_utf32(const std::u16string &s) {
return to_utf32(to_utf8(s));
}
/**
 * Reads the entire stream, detects a leading byte-order mark (BOM) and returns
 * the remaining content decoded to UTF-32.
 *
 * Recognised BOMs: UTF-32 BE/LE, UTF-16 BE/LE and UTF-8. Input without a BOM is
 * passed to to_utf32() as UTF-8 (which also covers plain ASCII). The BOM itself
 * is not part of the returned string.
 *
 * @param src  stream to read until end-of-file.
 * @return     the decoded text as UTF-32.
 * @throws std::logic_error if the payload of a UTF-16 / UTF-32 input is not a
 *         whole number of code units.
 */
std::u32string read_with_bom(std::istream & src)
{
    enum encoding {
        encoding_utf32be = 0,
        encoding_utf32le,
        encoding_utf16be,
        encoding_utf16le,
        encoding_utf8,
        encoding_ascii,
    };
    // Indexed by `encoding`. The 4-byte UTF-32 marks must be tested before the
    // 2-byte UTF-16 marks because FF FE 00 00 starts with FF FE.
    const std::string boms[] = {
        std::string("\x00\x00\xFE\xFF", 4),
        std::string("\xFF\xFE\x00\x00", 4),
        std::string("\xFE\xFF", 2),
        std::string("\xFF\xFE", 2),
        std::string("\xEF\xBB\xBF", 3)
    };
    typedef std::string::size_type size_type;

    std::string buffer((std::istreambuf_iterator<char>(src)), std::istreambuf_iterator<char>());
    encoding enc = encoding_ascii;
    for (size_type i = 0; i < sizeof(boms) / sizeof(boms[0]); ++i) {
        const std::string &bom = boms[i];
        if (buffer.compare(0, bom.length(), bom) == 0) {
            enc = static_cast<encoding>(i);
            buffer.erase(0, bom.length());
            break;
        }
    }

    // Plain char may be signed: a byte >= 0x80 would sign-extend when promoted
    // and set all the high bits of the combined value. Always go through
    // unsigned char before shifting.
    const auto byte_at = [&buffer](size_type pos) -> unsigned int {
        return static_cast<unsigned char>(buffer[pos]);
    };

    const auto decode_utf32 = [&](bool big_endian) -> std::u32string {
        if (buffer.length() % 4 != 0) {
            throw std::logic_error("size in bytes must be a multiple of 4");
        }
        std::u32string out(buffer.length() / 4, 0);
        for (size_type i = 0; i < out.size(); ++i) {
            const size_type base = i * 4;
            const unsigned int unit = big_endian
                ? (byte_at(base) << 24) | (byte_at(base + 1) << 16) | (byte_at(base + 2) << 8) | byte_at(base + 3)
                : (byte_at(base + 3) << 24) | (byte_at(base + 2) << 16) | (byte_at(base + 1) << 8) | byte_at(base);
            out[i] = static_cast<char32_t>(unit);
        }
        return out;
    };

    const auto decode_utf16 = [&](bool big_endian) -> std::u16string {
        if (buffer.length() % 2 != 0) {
            throw std::logic_error("size in bytes must be a multiple of 2");
        }
        std::u16string out(buffer.length() / 2, 0);
        for (size_type i = 0; i < out.size(); ++i) {
            const size_type base = i * 2;
            const unsigned int unit = big_endian
                ? (byte_at(base) << 8) | byte_at(base + 1)
                : (byte_at(base + 1) << 8) | byte_at(base);
            out[i] = static_cast<char16_t>(unit);
        }
        return out;
    };

    switch (enc) {
    case encoding_utf32be:
        return decode_utf32(true);
    case encoding_utf32le:
        return decode_utf32(false);
    case encoding_utf16be:
        return to_utf32(decode_utf16(true));
    case encoding_utf16le:
        return to_utf32(decode_utf16(false));
    default:
        // UTF-8 with its BOM removed, or BOM-less input treated as UTF-8.
        return to_utf32(buffer);
    }
}
我已经编写了一个 utf_ranges 库来完成这项工作。它使用 Range-V3 和 C++14。
它提供了在三种主要 UTF 编码之间相互转换的视图(view)和操作(action)(如果您熟悉 Range-V3 的术语),可以消费和生成字节顺序标记(BOM),并根据 BOM 执行字节序转换。例如,将字节序未知的 UTF-16 文件读入 UTF-8 编码的 std::string,并把七种 Unicode 行结束符中的任意一种转换为 \n,如下所示:
// Usage example for the utf_ranges library (Range-V3 based, not part of the
// standard library): reads a UTF-16 file of unknown endianness into a UTF-8
// std::string and normalises line endings to \n.
// NOTE(review): the utf:: names come from that library and are not defined here.
std::ifstream source{path, std::ios::binary};
std::string str = utf::istreambuf<char16_t>(source)
| utf::view::consume_bom
| utf::view::utf8
| utf::view::line_end_convert;
这是我来自 Baby X 的 UTF-8 代码 (https://github.com/MalcolmMcLean/babyx)
/*
 * offsetsFromUTF8[n] is subtracted by bbx_utf8_getch after it has summed the
 * n + 1 raw bytes of a sequence with 6-bit shifts. Each entry equals the
 * lead-byte and continuation-byte marker bits accumulated that way, e.g.
 * entry 1 is (0xC0 << 6) + 0x80 and entry 2 is (0xE0 << 12) + (0x80 << 6) + 0x80
 * (taken modulo 2^32 for the last entry).
 */
static const unsigned int offsetsFromUTF8[6] =
{
0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL
};
/*
 * Number of bytes that follow a given first byte, indexed by that byte's value:
 * 0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2, 0xF0-0xF7 -> 3,
 * 0xF8-0xFB -> 4, 0xFC-0xFF -> 5.
 * Continuation bytes (0x80-0xBF) also map to 0.
 */
static const unsigned char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
/*
  Test whether a NUL-terminated string is well-formed UTF-8.
  Params: str - the NUL-terminated string to check.
  Returns: 1 if every sequence has a legal length (1 to 4 bytes), correct
           continuation bytes, the shortest possible encoding and a value
           below 0x110000; otherwise 0.
  Notes: encoded surrogates (U+D800..U+DFFF) are not rejected.
 */
int bbx_isutf8z(const char *str)
{
    int len = 0;
    int pos = 0;
    int nb;
    int i;
    int ch;

    while(str[len])
        len++;
    while(pos < len)
    {
        nb = bbx_utf8_skip(str);
        if(nb < 1 || nb > 4)
            return 0;
        /* the sequence must not run past the terminating NUL */
        if(pos + nb > len)
            return 0;
        for(i=1;i<nb;i++)
            if( (str[i] & 0xC0) != 0x80 )
                return 0;
        ch = bbx_utf8_getch(str);
        /* the sequence length must be the shortest one able to hold ch;
           this rejects overlong forms and stray continuation bytes */
        if(ch < 0x80)
        {
            if(nb != 1)
                return 0;
        }
        else if(ch < 0x800) /* two bytes carry 11 bits: up to U+07FF */
        {
            if(nb != 2)
                return 0;
        }
        else if(ch < 0x10000)
        {
            if(nb != 3)
                return 0;
        }
        else if(ch < 0x110000)
        {
            if(nb != 4)
                return 0;
        }
        else
        {
            /* beyond U+10FFFF: not a Unicode code point */
            return 0;
        }
        pos += nb;
        str += nb;
    }
    return 1;
}
/*
  Get the length in bytes of the sequence starting at utf8.
  Params: utf8 - pointer to the first byte of a sequence.
  Returns: 1 plus the trailing-byte count that the lookup table gives for
           the first byte.
 */
int bbx_utf8_skip(const char *utf8)
{
    const unsigned char lead = (unsigned char) utf8[0];

    return 1 + trailingBytesForUTF8[lead];
}
/*
  Decode the character at the start of a UTF-8 string.
  Params: utf8 - pointer to the first byte of a sequence.
  Returns: the decoded value.
  Notes: no validation is done. When the first byte announces more than
         three trailing bytes, no input bytes are combined and only the
         table offset is subtracted.
 */
int bbx_utf8_getch(const char *utf8)
{
    int code = 0;
    int k;
    const int trailing = trailingBytesForUTF8[(unsigned char)utf8[0]];

    if (trailing <= 3)
    {
        /* sum the raw bytes, making room for 6 more bits before each
           byte after the first */
        for (k = 0; k <= trailing; k++)
        {
            if (k > 0)
                code <<= 6;
            code += (unsigned char)utf8[k];
        }
    }
    /* strip the marker bits that were summed together with the payload */
    code -= offsetsFromUTF8[trailing];
    return code;
}
/*
  Write a character to a buffer as UTF-8.
  Params: out - destination buffer, needs room for up to 4 bytes.
          ch - the value to encode.
  Returns: number of bytes written, 0 if ch is 0x110000 or above
           (nothing is written in that case).
  Notes: no terminating NUL is appended.
 */
int bbx_utf8_putch(char *out, int ch)
{
    if (ch < 0x80)
    {
        out[0] = (char)ch;
        return 1;
    }
    if (ch < 0x800)
    {
        out[0] = (char)(0xC0 | (ch >> 6));
        out[1] = (char)(0x80 | (ch & 0x3F));
        return 2;
    }
    if (ch < 0x10000)
    {
        out[0] = (char)(0xE0 | (ch >> 12));
        out[1] = (char)(0x80 | ((ch >> 6) & 0x3F));
        out[2] = (char)(0x80 | (ch & 0x3F));
        return 3;
    }
    if (ch < 0x110000)
    {
        out[0] = (char)(0xF0 | (ch >> 18));
        out[1] = (char)(0x80 | ((ch >> 12) & 0x3F));
        out[2] = (char)(0x80 | ((ch >> 6) & 0x3F));
        out[3] = (char)(0x80 | (ch & 0x3F));
        return 4;
    }
    return 0;
}
/*
  Get the number of bytes needed to encode a character as UTF-8.
  Params: ch - the value to measure.
  Returns: 1 to 4, or 0 if ch is 0x110000 or above.
 */
int bbx_utf8_charwidth(int ch)
{
    if (ch >= 0x110000)
        return 0;
    if (ch >= 0x10000)
        return 4;
    if (ch >= 0x800)
        return 3;
    if (ch >= 0x80)
        return 2;
    return 1;
}
/*
  Count the characters in a NUL-terminated UTF-8 string.
  Params: utf8 - the NUL-terminated string.
  Returns: number of sequences found, stepping by the length that
           bbx_utf8_skip reports for each first byte.
  Notes: a sequence cut short by the end of the string counts as one
         character.
 */
int bbx_utf8_Nchars(const char *utf8)
{
    int answer = 0;
    int nb;

    while(*utf8)
    {
        nb = bbx_utf8_skip(utf8);
        /* advance byte by byte and stop at the terminating NUL, so that a
           truncated sequence cannot move the pointer past the end of the
           string */
        while(nb > 0 && *utf8)
        {
            utf8++;
            nb--;
        }
        answer++;
    }
    return answer;
}