为什么多字节字符到 char32_t 的转换使用 UTF-8 作为多字节编码而不是特定于语言环境的编码？

Question

我一直在尝试把接收到的输入先转换为 UTF-32 编码的 char32_t，然后再将其转换为 UTF-8。为此我调用了 <uchar.h> 中的函数 mbrtoc32，但它一直报告 "Encoding error"。

以下是我遇到的情况：

通过 mbstowcs 将序列 (Big5) 转换为 wchar_t 表示成功。
mbrtoc32 把多字节序列当作 UTF-8 处理，但当前语言环境并不是 UTF-8。（在我的机器上把语言环境设置为 "" 后，setlocale 返回 "Chinese (Traditional)_Hong Kong SAR.950"）

下面是我为调试这个问题而编写的代码，但没有成功。它先尝试将汉字“香”（U+9999）转换为多字节表示，然后尝试将“香”的 Big5 编码（0xADBB）分别转换为 wchar_t 和 char32_t。但是，从多字节（Big5）转换为 char32_t 时返回了编码错误。（矛盾的是，把“香”的 UTF-8 序列输入 mbrtoc32 却能成功返回 0x9999）

#include <errno.h>
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>

/* Conversion state shared by the restartable conversions below; as a
 * file-scope object it starts out zero-initialized. */
mbstate_t state;

/*
 * Demonstration program for the reported problem.
 *
 * 1. Encodes U+9999 with c32rtomb and dumps the resulting bytes.
 * 2. Decodes the Big5 bytes AD BB with mbtowc (locale-aware) and then with
 *    mbrtoc32, which is the call that reports an encoding error on the
 *    asker's toolchain.
 *
 * Returns 0 on success, EXIT_FAILURE if memory cannot be allocated, or the
 * errno value of the failed conversion.
 */
int main(void){
    int rc = 0;

    setlocale(LC_CTYPE, "");
    printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
    char32_t chi_c = 0x9999;
    printf("Character U+9999 is 香\n");

    char *mbc = calloc(32, sizeof *mbc);
    char *src_mbs = calloc(32, sizeof *src_mbs);
    if (mbc == NULL || src_mbs == NULL){
        perror("Failed to allocate memory");
        rc = EXIT_FAILURE;
        goto done;
    }

    size_t mb_len = c32rtomb(mbc, chi_c, &state);
    if (mb_len == (size_t)-1){
        /* Save errno first: perror itself may modify it. */
        rc = errno;
        perror("Encoding error");
        goto done;
    }
    printf("The multibyte representation of U+9999 is:\n");
    // 0xE9A699, UTF-8
    for (size_t i = 0; i < mb_len; i++){
        /* Go through unsigned char so bytes >= 0x80 are not sign-extended. */
        printf("%#2x\t", (unsigned char)mbc[i]);
    }

    // "香" in Big5 encoding
    src_mbs[0] = (char)0xad;
    src_mbs[1] = (char)0xbb;
    wchar_t res_wc;
    mbtowc(&res_wc, src_mbs, 32); // Success, res_wc == 0x9999
    char32_t res_c32;
    mb_len = mbrtoc32(&res_c32, src_mbs, (size_t)3, &state);
    // Returns (size_t)-1, encoding error
    if (mb_len == (size_t)-1){
        rc = errno;
        perror("Encoding error");
        goto done;
    }
    /* Report the value produced by mbrtoc32, not the earlier mbtowc result. */
    printf("\nThe 32-bit character representation of U+9999 is:\n%#x", (unsigned int)res_c32);

done:
    free(src_mbs);
    free(mbc);
    return rc;
}

我还阅读了 cppreference.com 的文档，它说，

In any case, the multibyte character encoding used by this function is specified by the currently active C locale.

我希望 mbrtoc32 的行为类似于 mbtowc，它将字符从特定于语言环境的编码转换为 UTF-32（在本例中Big5 到 UTF-32）。

是否有任何解决方案可以使用 mbrtoc32 将多字节字符转换为 char32_t 而无需 "Encoding error"？

P.S.: 我在 Windows 10 上使用 Mingw-64，用 gcc 编译。

Answer 1

我找到问题了。我正在使用的 Mingw-w64 要求传递给 mbrtoc32 和 c32rtomb 的所有多字节字符串都采用 UTF-8 编码。

mbrtoc32 的代码：

/*
 * Decode one UTF-8 encoded character from s.
 *
 * The input is always interpreted as UTF-8: neither the current locale nor
 * the conversion state ps is consulted.
 *
 * Parameters:
 *   pc32 - receives the decoded code point; may be NULL, in which case the
 *          sequence is only measured.
 *   s    - bytes to decode; a NULL pointer is treated like an empty string.
 *   n    - maximum number of bytes of s that may be examined.
 *   ps   - unused.
 *
 * Returns:
 *   0           if the decoded character is the null character,
 *   1..4        the number of bytes consumed otherwise,
 *   (size_t)-2  if the n available bytes form an incomplete sequence,
 *   (size_t)-1  with errno set to EILSEQ if the first byte is not a valid
 *               lead byte or a continuation byte is malformed.
 */
size_t mbrtoc32 (char32_t *__restrict__ pc32,
         const char *__restrict__ s,
         size_t n,
         mbstate_t *__restrict__ ps)
{
    /* Work on unsigned bytes so values >= 0x80 compare and shift cleanly. */
    const unsigned char *p = (const unsigned char *)s;
    char32_t cp;
    size_t len;
    size_t i;

    (void)ps;

    /* A NULL s behaves like decoding "" without storing a result. */
    if (s == NULL)
        return 0;

    /* Nothing may be read, so the character is necessarily incomplete. */
    if (n == 0)
        return (size_t)-2;

    /* ASCII character - high bit unset (this includes the terminating NUL) */
    if (p[0] < 0x80)
    {
        if (pc32 != NULL)
            *pc32 = p[0];
        return p[0] == 0 ? 0 : 1;
    }

    /* Multibyte chars: the lead byte determines the sequence length */
    if ((p[0] & 0xE0) == 0xC0)        /* 110xxxxx needs 2 bytes */
    {
        len = 2;
        cp = p[0] & 0x1F;
    }
    else if ((p[0] & 0xF0) == 0xE0)   /* 1110xxxx needs 3 bytes */
    {
        len = 3;
        cp = p[0] & 0x0F;
    }
    else if ((p[0] & 0xF8) == 0xF0)   /* 11110xxx needs 4 bytes */
    {
        len = 4;
        cp = p[0] & 0x07;
    }
    else
    {
        errno = EILSEQ;
        return (size_t)-1;
    }

    if (n < len)
        return (size_t)-2;

    /* Fold in the continuation bytes p[1] .. p[len - 1]; each must look
       like 10xxxxxx. */
    for (i = 1; i < len; i++)
    {
        if ((p[i] & 0xC0) != 0x80)
        {
            errno = EILSEQ;
            return (size_t)-1;
        }
        cp = (cp << 6) | (char32_t)(p[i] & 0x3F);
    }

    if (pc32 != NULL)
        *pc32 = cp;
    return len;
}

和 c32rtomb:

/*
 * Encode one code point as UTF-8.
 *
 * The output is always UTF-8: neither the current locale nor the
 * conversion state ps is consulted.
 *
 * Parameters:
 *   s   - destination buffer with room for up to 4 bytes; if NULL, the call
 *         behaves as if the null character were encoded into a private
 *         buffer and returns 1.
 *   c32 - code point to encode.
 *   ps  - unused.
 *
 * Returns the number of bytes stored (1..4), or (size_t)-1 with errno set
 * to EILSEQ when c32 is a surrogate (U+D800..U+DFFF) or lies above
 * U+10FFFF, neither of which has a valid UTF-8 encoding.
 */
size_t c32rtomb (char *__restrict__ s,
         char32_t c32,
         mbstate_t *__restrict__ ps)
{
    /* Store through unsigned char so byte values >= 0x80 are well defined. */
    unsigned char *out = (unsigned char *)s;

    (void)ps;

    if (s == NULL)
        return 1;

    if (c32 <= 0x7F) /* 7 bits needs 1 byte */
    {
        out[0] = (unsigned char)c32;
        return 1;
    }

    if (c32 <= 0x7FF) /* 11 bits needs 2 bytes */
    {
        out[0] = (unsigned char)(0xC0 | (c32 >> 6));
        out[1] = (unsigned char)(0x80 | (c32 & 0x3F));
        return 2;
    }

    /* Surrogates are reserved for UTF-16 and must not be encoded. */
    if (c32 >= 0xD800 && c32 <= 0xDFFF)
    {
        errno = EILSEQ;
        return (size_t)-1;
    }

    if (c32 <= 0xFFFF) /* 16 bits needs 3 bytes */
    {
        out[0] = (unsigned char)(0xE0 | (c32 >> 12));
        out[1] = (unsigned char)(0x80 | ((c32 >> 6) & 0x3F));
        out[2] = (unsigned char)(0x80 | (c32 & 0x3F));
        return 3;
    }

    if (c32 <= 0x10FFFF) /* up to the last Unicode code point: 4 bytes */
    {
        out[0] = (unsigned char)(0xF0 | (c32 >> 18));
        out[1] = (unsigned char)(0x80 | ((c32 >> 12) & 0x3F));
        out[2] = (unsigned char)(0x80 | ((c32 >> 6) & 0x3F));
        out[3] = (unsigned char)(0x80 | (c32 & 0x3F));
        return 4;
    }

    errno = EILSEQ;
    return (size_t)-1;
}

这两个函数都假定给定的多字节字符串是 UTF-8，而不考虑语言环境设置。glibc 中的 mbrtoc32 和 c32rtomb 只是调用对应的宽字符函数来转换字符。由于宽字符转换在 Mingw-w64 上工作正常，我仿照 glibc 的做法，分别用 mbrtowc 和 wcrtomb 替换了 mbrtoc32 和 c32rtomb：

#include <errno.h>
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
#include <wchar.h>

/* Conversion state shared by the restartable conversions below; as a
 * file-scope object it starts out zero-initialized. */
mbstate_t state;

/*
 * Workaround demonstration: convert between the locale's multibyte
 * encoding and a code point with the wide-character functions wcrtomb and
 * mbrtowc instead of c32rtomb and mbrtoc32.
 *
 * For comparison, on the author's toolchain
 *   c32rtomb(r_mbc, src_c32, &state) produced the UTF-8 bytes E9 A6 99
 *   where Big5 was expected, and
 *   mbrtoc32(&r_c32, src_mbs, 3, &state) returned (size_t)-1.
 *
 * Returns 0 on success or the errno value of the failing call.
 */
int main(void){
    setlocale(LC_CTYPE, "");
    printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
    const char *src_mbs = "\xad\xbb"; // "香" in Big5 encoding
    char32_t src_c32 = 0x9999; // "香" code point

    /* calloc zero-fills the buffer, so the bytes written by wcrtomb are
     * followed by a terminating NUL and can be printed with %s. */
    char *r_mbc = calloc(32, sizeof *r_mbc);
    if (r_mbc == NULL){
        /* Save errno first: perror itself may modify it. */
        int err = errno;
        perror("Failed to allocate memory");
        return err;
    }

    /* NOTE: where wchar_t is only 16 bits wide (e.g. Windows), this cast is
     * lossless only for code points up to U+FFFF. */
    size_t mb_len = wcrtomb(r_mbc, (wchar_t)src_c32, &state); // Yields 0xADBB, Big5 of "香", OK
    if (mb_len == (size_t)-1){
        int err = errno;
        perror("Encoding error");
        free(r_mbc);
        return err;
    }
    printf("Character U+9999 is %s, ( ", r_mbc);
    for (size_t i = 0; i < mb_len; i++){
        printf("%#hhx ", (unsigned char)r_mbc[i]);
    }
    printf(")\n");
    free(r_mbc);

    /* Decode into a real wchar_t and widen afterwards; writing through a
     * (wchar_t *) cast of a char32_t would store the wrong number of bytes
     * where the two types differ in size. */
    wchar_t r_wc = 0;
    mb_len = mbrtowc(&r_wc, src_mbs, (size_t)3, &state); // Yields 0x9999, OK
    if (mb_len == (size_t)-1){
        int err = errno;
        perror("Encoding error");
        return err;
    }
    char32_t r_c32 = (char32_t)r_wc;
    printf("\nThe 32-bit character representation of U+9999 is:\n%#x", (unsigned int)r_c32);
    return 0;
}

为什么多字节字符到 char32_t 的转换使用 UTF-8 作为多字节编码而不是特定于语言环境的编码？

Why does multibyte character to char32_t conversion use UTF-8 as the multibyte encoding instead of the locale-specific one?

c

unicode

console

encoding

mingw-w64

以下是我遇到的情况：