为什么多字节字符到 char32_t 的转换使用 UTF-8 作为多字节编码而不是特定于语言环境的编码?
Why does multibyte character to char32_t conversion use UTF-8 as the multibyte encoding instead of the locale-specific one?
我一直在尝试通过首先将接收到的输入转换为 UTF-32 编码的 char32_t
,然后将其转换为UTF-8。我一直在从 <uchar.h>
调用函数 mbtoc32
来完成这项工作,但是它一直在发送 "Encoding error".
以下是我遇到的情况:
- 通过
mbstowcs
将序列 (Big5) 转换为 wchar_t
表示成功。
mbrtoc32
将多字节序列作为 UTF-8,但语言环境不是。 (在我的机器上设置为""
、returns "Chinese (Traditional)_Hong Kong SAR.950")
下面是我一直在编写的代码,试图调试我的问题,但没有成功。它尝试将“香”汉字(U+9999)转换为多字节表示,然后尝试将“香”(0xADBB)的Big5编码转换为wchar_t
和 char32_t
。但是,从多字节 (Big5) 转换为 char32_t
returns 编码错误。 (矛盾的是,将“香”的UTF-8序列输入mbrtoc32
就成功return0x9999)
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
mbstate_t state;
int main(void){
setlocale(LC_CTYPE, "");
printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
char32_t chi_c = 0x9999;
printf("Character U+9999 is 香\n");
char *mbc = (char *)calloc(32, sizeof(char));
size_t mb_len;
mb_len = c32rtomb(mbc, chi_c, &state);
int i;
printf("The multibyte representation of U+9999 is:\n");
// 0xE9A699, UTF-8
for (i = 0; i < mb_len; i++){
printf("%#2x\t", *(mbc + i));
}
char *src_mbs = (char *)calloc(32, sizeof(char));
// "香" in Big5 encoding
*(src_mbs + 0) = 0xad;
*(src_mbs + 1) = 0xbb;
wchar_t res_wc;
mbtowc(&res_wc, src_mbs, 32); // Success, res_wc == 0x9999
char32_t res_c32;
mb_len = mbrtoc32(&res_c32, src_mbs, (size_t)3, &state);
// Returns (size_t)-1, encoding error
if (mb_len == (size_t)-1){
perror("Encoding error");
return errno;
}
else {
printf("\nThe 32-bit character representation of U+9999 is:\n%#x", res_wc);
}
return 0;
}
我还阅读了 cppreference.com 的文档,它说,
In any case, the multibyte character encoding used by this function is specified by the currently active C locale.
我希望 mbrtoc32
的行为类似于 mbtowc
,它将字符从特定于语言环境的编码转换为 UTF-32(在本例中Big5 到 UTF-32)。
是否有任何解决方案可以使用 mbrtoc32
将多字节字符转换为 char32_t
而无需 "Encoding error"?
P.S.: 我在 Windows 10 上使用 Mingw-64,用 gcc 编译。
我找到问题了。我正在使用的 Mingw-w64 期望传递给 mbrtoc32
和 c32rtomb
的所有多字节字符串都在 UTF-8 编码.
mbrtoc32
的代码:
size_t mbrtoc32 (char32_t *__restrict__ pc32,
const char *__restrict__ s,
size_t n,
mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
if (*s == 0)
{
*pc32 = 0;
return 0;
}
/* ASCII character - high bit unset */
if ((*s & 0x80) == 0)
{
*pc32 = *s;
return 1;
}
/* Multibyte chars */
if ((*s & 0xE0) == 0xC0) /* 110xxxxx needs 2 bytes */
{
if (n < 2)
return (size_t)-2;
*pc32 = ((s[0] & 31) << 6) | (s[1] & 63);
return 2;
}
else if ((*s & 0xf0) == 0xE0) /* 1110xxxx needs 3 bytes */
{
if (n < 3)
return (size_t)-2;
*pc32 = ((s[0] & 15) << 12) | ((s[1] & 63) << 6) | (s[2] & 63);
return 3;
}
else if ((*s & 0xF8) == 0xF0) /* 11110xxx needs 4 bytes */
{
if (n < 4)
return (size_t)-2;
*pc32 = ((s[0] & 7) << 18) | ((s[1] & 63) << 12) | ((s[2] & 63) << 6) | (s[4] & 63);
return 4;
}
errno = EILSEQ;
return (size_t)-1;
}
和 c32rtomb
:
size_t c32rtomb (char *__restrict__ s,
char32_t c32,
mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
if (c32 <= 0x7F) /* 7 bits needs 1 byte */
{
*s = (char)c32 & 0x7F;
return 1;
}
else if (c32 <= 0x7FF) /* 11 bits needs 2 bytes */
{
s[1] = 0x80 | (char)(c32 & 0x3F);
s[0] = 0xC0 | (char)(c32 >> 6);
return 2;
}
else if (c32 <= 0xFFFF) /* 16 bits needs 3 bytes */
{
s[2] = 0x80 | (char)(c32 & 0x3F);
s[1] = 0x80 | (char)((c32 >> 6) & 0x3F);
s[0] = 0xE0 | (char)(c32 >> 12);
return 3;
}
else if (c32 <= 0x1FFFFF) /* 21 bits needs 4 bytes */
{
s[3] = 0x80 | (char)(c32 & 0x3F);
s[2] = 0x80 | (char)((c32 >> 6) & 0x3F);
s[1] = 0x80 | (char)((c32 >> 12) & 0x3F);
s[0] = 0xF0 | (char)(c32 >> 18);
return 4;
}
errno = EILSEQ;
return (size_t)-1;
}
这两个函数都希望给定的多字节字符串为 UTF-8,而不考虑语言环境设置。函数 mbrtoc32
and c32rtomb
on glibc 只需调用对应的宽字符函数来转换字符。作为
宽字符转换在 Mingw-w64 上正常工作,我使用 mbrtowc
和 wcrtomb
分别替换 mbrtoc32
和 c32rtomb
继续 glibc:
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
mbstate_t state;
int main(void){
setlocale(LC_CTYPE, "");
printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
char *src_mbs = "\xad\xbb"; // "香" in Big5 encoding
char32_t src_c32 = 0x9999; // "香" code point
unsigned char *r_mbc = (char *)calloc(32, sizeof(char));
if (r_mbc == NULL){
perror("Failed to allocate memory");
return errno;
}
size_t mb_len = wcrtomb(r_mbc, (wchar_t)src_c32, &state); // Returns 0xADBB, Big5 of "香", OK
printf("Character U+9999 is %s, ( ", r_mbc);
for (int i = 0; i < mb_len; i++){
printf("%#hhx ", *(r_mbc + i));
}
printf(")\n");
// mb_len = c32rtomb(r_mbc, src_c32, &state); // Returns 0xE9A699, UTF-8 representation of "香", expected Big5
// printf("\nThe multibyte representation of U+9999 is:\n");
// for (i = 0; i < mb_len; i++){
// printf("%#hhX\t", *(r_mbc + i));
// }
char32_t r_c32 = 0;
// mb_len = mbrtoc32(&r_c32, src_mbs, (size_t)3, &state);
// Returns (size_t)-1, encoding error
mb_len = mbrtowc((wchar_t *)&r_c32, src_mbs, (size_t)3, &state); // Returns 0x9999, OK
if (mb_len == (size_t)-1){
perror("Encoding error");
return errno;
}
else {
printf("\nThe 32-bit character representation of U+9999 is:\n%#x", r_c32);
}
return 0;
}
我一直在尝试通过首先将接收到的输入转换为 UTF-32 编码的 char32_t
,然后将其转换为UTF-8。我一直在从 <uchar.h>
调用函数 mbtoc32
来完成这项工作,但是它一直在发送 "Encoding error".
以下是我遇到的情况:
- 通过
mbstowcs
将序列 (Big5) 转换为wchar_t
表示成功。 mbrtoc32
将多字节序列作为 UTF-8,但语言环境不是。 (在我的机器上设置为""
、returns "Chinese (Traditional)_Hong Kong SAR.950")
下面是我一直在编写的代码,试图调试我的问题,但没有成功。它尝试将“香”汉字(U+9999)转换为多字节表示,然后尝试将“香”(0xADBB)的Big5编码转换为wchar_t
和 char32_t
。但是,从多字节 (Big5) 转换为 char32_t
returns 编码错误。 (矛盾的是,将“香”的UTF-8序列输入mbrtoc32
就成功return0x9999)
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
mbstate_t state;
int main(void){
setlocale(LC_CTYPE, "");
printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
char32_t chi_c = 0x9999;
printf("Character U+9999 is 香\n");
char *mbc = (char *)calloc(32, sizeof(char));
size_t mb_len;
mb_len = c32rtomb(mbc, chi_c, &state);
int i;
printf("The multibyte representation of U+9999 is:\n");
// 0xE9A699, UTF-8
for (i = 0; i < mb_len; i++){
printf("%#2x\t", *(mbc + i));
}
char *src_mbs = (char *)calloc(32, sizeof(char));
// "香" in Big5 encoding
*(src_mbs + 0) = 0xad;
*(src_mbs + 1) = 0xbb;
wchar_t res_wc;
mbtowc(&res_wc, src_mbs, 32); // Success, res_wc == 0x9999
char32_t res_c32;
mb_len = mbrtoc32(&res_c32, src_mbs, (size_t)3, &state);
// Returns (size_t)-1, encoding error
if (mb_len == (size_t)-1){
perror("Encoding error");
return errno;
}
else {
printf("\nThe 32-bit character representation of U+9999 is:\n%#x", res_wc);
}
return 0;
}
我还阅读了 cppreference.com 的文档,它说,
In any case, the multibyte character encoding used by this function is specified by the currently active C locale.
我希望 mbrtoc32
的行为类似于 mbtowc
,它将字符从特定于语言环境的编码转换为 UTF-32(在本例中Big5 到 UTF-32)。
是否有任何解决方案可以使用 mbrtoc32
将多字节字符转换为 char32_t
而无需 "Encoding error"?
P.S.: 我在 Windows 10 上使用 Mingw-64,用 gcc 编译。
我找到问题了。我正在使用的 Mingw-w64 期望传递给 mbrtoc32
和 c32rtomb
的所有多字节字符串都在 UTF-8 编码.
mbrtoc32
的代码:
size_t mbrtoc32 (char32_t *__restrict__ pc32,
const char *__restrict__ s,
size_t n,
mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
if (*s == 0)
{
*pc32 = 0;
return 0;
}
/* ASCII character - high bit unset */
if ((*s & 0x80) == 0)
{
*pc32 = *s;
return 1;
}
/* Multibyte chars */
if ((*s & 0xE0) == 0xC0) /* 110xxxxx needs 2 bytes */
{
if (n < 2)
return (size_t)-2;
*pc32 = ((s[0] & 31) << 6) | (s[1] & 63);
return 2;
}
else if ((*s & 0xf0) == 0xE0) /* 1110xxxx needs 3 bytes */
{
if (n < 3)
return (size_t)-2;
*pc32 = ((s[0] & 15) << 12) | ((s[1] & 63) << 6) | (s[2] & 63);
return 3;
}
else if ((*s & 0xF8) == 0xF0) /* 11110xxx needs 4 bytes */
{
if (n < 4)
return (size_t)-2;
*pc32 = ((s[0] & 7) << 18) | ((s[1] & 63) << 12) | ((s[2] & 63) << 6) | (s[4] & 63);
return 4;
}
errno = EILSEQ;
return (size_t)-1;
}
和 c32rtomb
:
size_t c32rtomb (char *__restrict__ s,
char32_t c32,
mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
if (c32 <= 0x7F) /* 7 bits needs 1 byte */
{
*s = (char)c32 & 0x7F;
return 1;
}
else if (c32 <= 0x7FF) /* 11 bits needs 2 bytes */
{
s[1] = 0x80 | (char)(c32 & 0x3F);
s[0] = 0xC0 | (char)(c32 >> 6);
return 2;
}
else if (c32 <= 0xFFFF) /* 16 bits needs 3 bytes */
{
s[2] = 0x80 | (char)(c32 & 0x3F);
s[1] = 0x80 | (char)((c32 >> 6) & 0x3F);
s[0] = 0xE0 | (char)(c32 >> 12);
return 3;
}
else if (c32 <= 0x1FFFFF) /* 21 bits needs 4 bytes */
{
s[3] = 0x80 | (char)(c32 & 0x3F);
s[2] = 0x80 | (char)((c32 >> 6) & 0x3F);
s[1] = 0x80 | (char)((c32 >> 12) & 0x3F);
s[0] = 0xF0 | (char)(c32 >> 18);
return 4;
}
errno = EILSEQ;
return (size_t)-1;
}
这两个函数都希望给定的多字节字符串为 UTF-8,而不考虑语言环境设置。函数 mbrtoc32
and c32rtomb
on glibc 只需调用对应的宽字符函数来转换字符。作为
宽字符转换在 Mingw-w64 上正常工作,我使用 mbrtowc
和 wcrtomb
分别替换 mbrtoc32
和 c32rtomb
继续 glibc:
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
mbstate_t state;
int main(void){
setlocale(LC_CTYPE, "");
printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
char *src_mbs = "\xad\xbb"; // "香" in Big5 encoding
char32_t src_c32 = 0x9999; // "香" code point
unsigned char *r_mbc = (char *)calloc(32, sizeof(char));
if (r_mbc == NULL){
perror("Failed to allocate memory");
return errno;
}
size_t mb_len = wcrtomb(r_mbc, (wchar_t)src_c32, &state); // Returns 0xADBB, Big5 of "香", OK
printf("Character U+9999 is %s, ( ", r_mbc);
for (int i = 0; i < mb_len; i++){
printf("%#hhx ", *(r_mbc + i));
}
printf(")\n");
// mb_len = c32rtomb(r_mbc, src_c32, &state); // Returns 0xE9A699, UTF-8 representation of "香", expected Big5
// printf("\nThe multibyte representation of U+9999 is:\n");
// for (i = 0; i < mb_len; i++){
// printf("%#hhX\t", *(r_mbc + i));
// }
char32_t r_c32 = 0;
// mb_len = mbrtoc32(&r_c32, src_mbs, (size_t)3, &state);
// Returns (size_t)-1, encoding error
mb_len = mbrtowc((wchar_t *)&r_c32, src_mbs, (size_t)3, &state); // Returns 0x9999, OK
if (mb_len == (size_t)-1){
perror("Encoding error");
return errno;
}
else {
printf("\nThe 32-bit character representation of U+9999 is:\n%#x", r_c32);
}
return 0;
}