将字节数组从 utf-16 转换为 utf-8
Convert byte array from utf-16 to utf-8
我有一个字节数组
// "root" as UTF-16 code units in big-endian byte order, with no byte order mark.
uint8_t array[] = {0x00, 0x72, 0x00, 0x6f, 0x00, 0x6f, 0x00, 0x74};
我知道它表示的文本是 "root"。
我有一个函数可以将 utf-16 转换为 utf-8。这是代码:
/// Converts a UTF-16 byte buffer to UTF-8 by delegating to ucnvConvert().
///
/// @param from raw bytes to convert.
/// @param to   destination string, passed through to ucnvConvert().
/// @return whatever ucnvConvert() returns.
///
/// NOTE(review): the source encoding is named "UTF-16" with no BE/LE suffix, so
/// the byte order of BOM-less input is presumably chosen by the converter
/// implementation — confirm against callers.
inline bool convertUcs2ToUtf8(const std::vector<char> &from, std::string* const to) {
    const char *const sourceEncoding = "UTF-16";
    const char *const targetEncoding = "UTF-8";
    return ucnvConvert(sourceEncoding, targetEncoding, from, to);
}
/// Converts the bytes in `from` from encoding `enc_from` to encoding `enc_to`
/// using iconv and stores the result in `*to`.
///
/// @param enc_from iconv name of the source encoding.
/// @param enc_to   iconv name of the target encoding.
/// @param from     input bytes; an empty buffer clears `*to` and succeeds.
/// @param to       receives the converted bytes; only written on success.
/// @return true on success; false if the converter could not be opened or
///         iconv() reported an error.
static inline bool ucnvConvert(const char *enc_from,
                               const char *enc_to,
                               const std::vector<char> &from,
                               std::string* const to)
{
    if (from.empty()) {
        to->clear();
        return true;
    }

    // Output buffer is sized at three bytes per input byte plus one spare.
    // size_t avoids truncating the product for very large inputs.
    const size_t maxOutSize = from.size() * 3 + 1;
    std::vector<char> outBuf(maxOutSize);

    // iconv_open() reports failure with (iconv_t)-1, not NULL.
    iconv_t c = iconv_open(enc_to, enc_from);
    ASSERT_MSG(c != (iconv_t)-1, "convert: illegal encodings");
    if (c == (iconv_t)-1) {
        // Never hand an invalid descriptor to iconv()/iconv_close(), even if
        // the assertion above is compiled out.
        return false;
    }

    // The iconv() prototype takes a non-const char**, hence the const_cast.
    char *from_ptr = const_cast<char*>(from.data());
    char *to_ptr = &outBuf[0];
    size_t inleft = from.size();
    size_t outleft = maxOutSize;

    const size_t n = iconv(c, &from_ptr, &inleft, &to_ptr, &outleft);

    bool success = true;
    if (n == (size_t)-1) {
        success = false;
        if (errno == E2BIG) {
            ELOG("convert: insufficient space from");
        } else if (errno == EILSEQ) {
            ELOG("convert: invalid input sequence");
        } else if (errno == EINVAL) {
            ELOG("convert: incomplete input sequence");
        }
    }

    if (success) {
        // Bytes produced = buffer capacity minus the space iconv() left unused.
        to->assign(&outBuf[0], maxOutSize - outleft);
    }

    iconv_close(c);
    return success;
}
它对西里尔文(这类数据以 0x04 开头)可以正常工作,但当我把上面的数组传进去时,得到的却是类似这样的结果:
爀漀漀琀开㌀㜀
等等...
这里有什么问题吗?
必须为 UTF-16 输入指定字节顺序。由于您传入的是 UTF-16BE(大端)编码的缓冲区,
因此应该在其前面加上相应的字节顺序标记(BOM):
// The same "root" bytes with the big-endian byte order mark (0xFE 0xFF) prepended.
uint8_t array[] = { 0xfe, 0xff, 0x00, 0x72, 0x00, 0x6f, 0x00, 0x6f, 0x00, 0x74 };
但这样生成的 UTF-8 输出会带有字节顺序标记,而您可能并不需要它。最好的做法是在编码名称中直接指定字节顺序:
// Name the byte order in the source encoding so the input needs no byte order mark.
ucnvConvert("UTF-16BE", "UTF-8", from, to);
我有一个字节数组
// "root" as UTF-16 code units in big-endian byte order, with no byte order mark.
uint8_t array[] = {0x00, 0x72, 0x00, 0x6f, 0x00, 0x6f, 0x00, 0x74};
我知道它表示的文本是 "root"。我有一个把 UTF-16 转换为 UTF-8 的函数,代码如下:
/// Converts a UTF-16 byte buffer to UTF-8 by delegating to ucnvConvert().
///
/// @param from raw bytes to convert.
/// @param to   destination string, passed through to ucnvConvert().
/// @return whatever ucnvConvert() returns.
///
/// NOTE(review): the source encoding is named "UTF-16" with no BE/LE suffix, so
/// the byte order of BOM-less input is presumably chosen by the converter
/// implementation — confirm against callers.
inline bool convertUcs2ToUtf8(const std::vector<char> &from, std::string* const to) {
    const char *const sourceEncoding = "UTF-16";
    const char *const targetEncoding = "UTF-8";
    return ucnvConvert(sourceEncoding, targetEncoding, from, to);
}
/// Converts the bytes in `from` from encoding `enc_from` to encoding `enc_to`
/// using iconv and stores the result in `*to`.
///
/// @param enc_from iconv name of the source encoding.
/// @param enc_to   iconv name of the target encoding.
/// @param from     input bytes; an empty buffer clears `*to` and succeeds.
/// @param to       receives the converted bytes; only written on success.
/// @return true on success; false if the converter could not be opened or
///         iconv() reported an error.
static inline bool ucnvConvert(const char *enc_from,
                               const char *enc_to,
                               const std::vector<char> &from,
                               std::string* const to)
{
    if (from.empty()) {
        to->clear();
        return true;
    }

    // Output buffer is sized at three bytes per input byte plus one spare.
    // size_t avoids truncating the product for very large inputs.
    const size_t maxOutSize = from.size() * 3 + 1;
    std::vector<char> outBuf(maxOutSize);

    // iconv_open() reports failure with (iconv_t)-1, not NULL.
    iconv_t c = iconv_open(enc_to, enc_from);
    ASSERT_MSG(c != (iconv_t)-1, "convert: illegal encodings");
    if (c == (iconv_t)-1) {
        // Never hand an invalid descriptor to iconv()/iconv_close(), even if
        // the assertion above is compiled out.
        return false;
    }

    // The iconv() prototype takes a non-const char**, hence the const_cast.
    char *from_ptr = const_cast<char*>(from.data());
    char *to_ptr = &outBuf[0];
    size_t inleft = from.size();
    size_t outleft = maxOutSize;

    const size_t n = iconv(c, &from_ptr, &inleft, &to_ptr, &outleft);

    bool success = true;
    if (n == (size_t)-1) {
        success = false;
        if (errno == E2BIG) {
            ELOG("convert: insufficient space from");
        } else if (errno == EILSEQ) {
            ELOG("convert: invalid input sequence");
        } else if (errno == EINVAL) {
            ELOG("convert: incomplete input sequence");
        }
    }

    if (success) {
        // Bytes produced = buffer capacity minus the space iconv() left unused.
        to->assign(&outBuf[0], maxOutSize - outleft);
    }

    iconv_close(c);
    return success;
}
它对西里尔文(这类数据以 0x04 开头)可以正常工作,但当我把上面的数组传进去时,得到的却是类似这样的结果:
爀漀漀琀开㌀㜀
等等... 这里有什么问题吗?
必须为 UTF-16 输入指定字节顺序。由于您传入的是 UTF-16BE(大端)编码的缓冲区,
因此应该在其前面加上相应的字节顺序标记(BOM):
// The same "root" bytes with the big-endian byte order mark (0xFE 0xFF) prepended.
uint8_t array[] = { 0xfe, 0xff, 0x00, 0x72, 0x00, 0x6f, 0x00, 0x6f, 0x00, 0x74 };
但这样生成的 UTF-8 输出会带有字节顺序标记,而您可能并不需要它。最好的做法是在编码名称中直接指定字节顺序:
// Name the byte order in the source encoding so the input needs no byte order mark.
ucnvConvert("UTF-16BE", "UTF-8", from, to);