如何为使用含重音字符字符串的旧 DOS 函数找到变通方案（从扩展 ASCII 到 UTF-8）？

Question

我正在编写一个 SW，我想在其中使用 80 年代早期编写的旧 C 代码。这段代码对字符串做了一些转换。它还使用了当时 (DOS) 以 ASCII table（大于 127 的代码）编码的重音字符。

现在新系统使用的是UTF-8编码，旧的编码效果很差。我正在使用 Linux (Ubuntu 17 / gcc gcc (Ubuntu 7.2.0-8ubuntu3) 7.2.0).

我正在寻找一种解决方法，使我能够进行尽可能少的更改。我已经开始做一些测试来分析出现的问题。我制作了两个 main：一个使用 char * 字符串和 char 元素，另一个使用 wchar_t * 字符串和 wchar_t 元素。两者都无法正常工作。

第一个（使用 char * 和 char）需要做一些调整：例如，当 strchr 匹配到多字节编码时，printf 无法以 %c 正确打印该多字节字符，尽管以 char * 字符串形式打印时是正确的。此外，编译时还会产生大量与多字节字符相关的警告。

第二个（使用 wchar_t * 和 wchar_t）可以运行，但没有正确打印多字节字符：无论是作为 wchar_t 还是作为 wchar_t *（字符串）打印，它们都显示为“?”。

main 1：

#include <stdio.h>
#include <string.h>
#include <inttypes.h>

/* Byte-wise character search, equivalent to the standard strchr().
 * Reference implementation: http://clc-wiki.net/wiki/strchr
 *
 * s: NUL-terminated string to scan.
 * c: value to look for; it is converted to char before comparing, so the
 *    terminating NUL itself can be searched for.
 * Returns a pointer to the first matching char in s, or a null pointer
 * if the terminator is reached first.
 */
char *_strchr(const char *s, int c);

char *_strchr(const char *s, int c)
{
    const char wanted = (char)c;

    for (;; s++) {
        /* Test for a match first so that wanted == '\0' finds the terminator. */
        if (*s == wanted)
            return (char *)s;
        if (*s == '\0')
            return 0;
    }
}


/* Demonstration program for narrow (char) strings: prints a string that
 * contains accented characters, dumps it byte by byte, then searches it
 * for 'ò' and 'è' with both the library strchr() and the local _strchr().
 * Always returns 0.
 */
int main()
{
    char          * p1 = NULL;
    const char    * t1 = "Sergio è un Italiano e andò via!";

    printf("Text --> %s\n\n",t1);

    /* Dump every byte of t1 as two hex digits followed by the same byte
       printed with %c. */
    for(size_t i=0;i<strlen(t1);i++) {
        printf("%02X %c|",(uint8_t)t1[i],t1[i]);
    }
    puts("\n");

    puts("Searching ò");
    /*warning: multi-character character constant [-Wmultichar]
                      p1 = strchr(t1,'ò');
                                     ^~~~
    */
    p1 = strchr(t1,'ò');
    /* NOTE(review): p1 is not checked for NULL before p1-1 is formed. */
    printf("%s\n",p1-1); // -1 needs to correct the position

    /*warning: multi-character character constant [-Wmultichar]
                      p1 = _strchr(t1,'ò');
                                     ^~~~
    */
    p1 = _strchr(t1,'ò');
    printf("%s\n",p1-1);    // -1 needs to correct the position
    puts("");

    puts("Searching è");
    /*warning: multi-character character constant [-Wmultichar]
                      p1 = strchr(t1,'è');
                                     ^~~~
    */
    p1 = strchr(t1,'è');
    printf("%s\n",p1-1);    // -1 needs to correct the position

    /*warning: multi-character character constant [-Wmultichar]
                      p1 = _strchr(t1,'è');
                                     ^~~~
    */
    p1 = _strchr(t1,'è');
    printf("%s\n",p1-1);    // -1 needs to correct the position
    puts("");

    /* Print the two multi-character constants both as %c and as hex. */
    /*warning: multi-character character constant [-Wmultichar]
         printf("%c %c %08X %08X\n",'è','ò','è','ò');
                                    ^~~~
         printf("%c %c %08X %08X\n",'è','ò','è','ò');
                                        ^~~~
         printf("%c %c %08X %08X\n",'è','ò','è','ò');
                                            ^~~~
         printf("%c %c %08X %08X\n",'è','ò','è','ò');
                                                ^~~~
    */
    printf("%c %c %08X %08X\n",'è','ò','è','ò');

    /* Same as above, but the hex arguments are first narrowed to uint8_t. */
    /*multi-character character constant [-Wmultichar]
     printf("%c %c %08X %08X\n",'è','ò',(uint8_t)'è',(uint8_t)'ò');
                                ^~~~
     printf("%c %c %08X %08X\n",'è','ò',(uint8_t)'è',(uint8_t)'ò');
                                    ^~~~
     printf("%c %c %08X %08X\n",'è','ò',(uint8_t)'è',(uint8_t)'ò');
                                                 ^~~~
     printf("%c %c %08X %08X\n",'è','ò',(uint8_t)'è',(uint8_t)'ò');
                                                              ^~~~
    */
    printf("%c %c %08X %08X\n",'è','ò',(uint8_t)'è',(uint8_t)'ò');

    puts("");
    return 0;
}

输出：

main 2：

#include <stdio.h>
#include <string.h>
#include <wchar.h>
#include <inttypes.h>

/* Prints the wide string literal s followed by a newline via wprintf().
 * s is pasted directly in front of "\n" and the result is used as the
 * format string, so s must be a string literal, and any '%' in it is
 * interpreted as a conversion specification.
 */
#define wputs(s) wprintf(s"\n")

/* Wide-character search, equivalent to the standard wcschr().
 * Based on the FreeBSD C implementation:
 * https://opensource.apple.com/source/Libc/Libc-498.1.1/string/wcschr-fbsd.c
 *
 * s: L'\0'-terminated wide string to scan.
 * c: wide character to look for; may be L'\0' to locate the terminator.
 * Returns a pointer to the first occurrence of c in s, or NULL if the
 * terminator is reached without a match.
 */
wchar_t * _wcschr(const wchar_t *s, wchar_t c);

wchar_t * _wcschr(const wchar_t *s, wchar_t c)
{
    /* Advance until either the wanted character or the terminator is hit. */
    while (*s != c && *s != L'\0')
        s++;
    /* Tested after the loop so that searching for L'\0' finds the terminator. */
    if (*s == c)
        return ((wchar_t *)s);
    return (NULL);
}

/* Demonstration program for wide (wchar_t) strings: prints a wide string
 * that contains accented characters, dumps the bytes of its storage, then
 * searches it for L'ò' and L'è' with both the library wcschr() and the
 * local _wcschr(). All output goes through wprintf(). Always returns 0.
 *
 * NOTE(review): no setlocale() call is made before the wide output;
 * presumably this is why non-ASCII characters are reported as '?' --
 * confirm against the C library documentation.
 */
int main()
{
    wchar_t       * p1 = NULL;
    const wchar_t * t1 = L"Sergio è un Italiano e andò via!";
    const wchar_t * f0 = L"%02X %c|";
    const wchar_t * f1 = L"Text --> %ls\n\n";
    const wchar_t * f2 = L"%ls\n";

    /* Byte view of the wide string's storage, used for the hex dump below. */
    uint8_t * p = (uint8_t *)t1;

    wprintf(f1,t1);

    /* Dump storage bytes until one equal to '!' has been printed.
       Bytes below ' ' are shown as '.', bytes above 127 as '*'. */
    for(size_t i=0;;i++) {
        uint8_t c=*(p+i);

        wprintf(f0,c,(c<' ')?'.':(c>127)?'*':c);
        if ( c=='!' )
            break;
    }
    wputs(L"\n");

    wputs(L"Searching ò");

    /* NOTE(review): the search results are printed without a NULL check. */
    p1 = wcschr(t1,L'ò');
    wprintf(f2,p1);

    p1 = _wcschr(t1,L'ò');
    wprintf(f2,p1);
    wputs(L"---");

    wputs(L"Searching è");

    p1 = wcschr(t1,L'è');
    wprintf(f2,p1);

    p1 = _wcschr(t1,L'è');
    wprintf(f2,p1);
    wputs(L"");

    /* Print the two wide constants as characters and as hex; the second
       line narrows the hex arguments to uint8_t first. */
    wprintf(L"%lc %lc %08X %08X\n",L'è',L'ò',L'è',L'ò');
    wprintf(L"%lc %lc %08X %08X\n",L'è',L'ò',(uint8_t)L'è',(uint8_t)L'ò');

    wputs(L"");

    return 0;
}

输出：

Answer 1

您需要与预期的字符编码相互转换。假设旧系统需要一些 Windows 代码页，而新代码需要 UTF-8。然后从你需要的新东西中调用旧函数：

检查您是否可以安全地执行转换（输入可能包含无法以所需 Windows 代码页形式表示的字符）...
从 UTF-8 转换为所需的 Windows 代码页表示形式。这应该在兼容表示（副本）中产生一个新的 buffer/string。
使用原始参数的新转换表示调用旧代码
在某个缓冲区中接收输出，它将以 Windows 代码页表示。
因此将该输出转换为 UTF-8 副本。
清理输入的临时副本，旧代码的原始输出缓冲区。
将转换后的 UTF-8 输出副本返回给新代码。

如果你想从旧的东西调用新的 UTF-8 代码，你需要做反向舞蹈。

编辑：请注意，您的旧系统期望的不可能是纯 ASCII，因为 ASCII 是 7 位编码，而 UTF-8 明确向后兼容 ASCII。因此，您的首要任务是弄清楚旧代码实际使用的是哪种编码。

Answer 2

如果您想使用宽字符I/O，您需要本地化您的程序。这并不困难，只是一个 setlocale() 调用，加上可选的 fwide() 来查看用户区域设置是否支持所需流上的宽 I/O。

在您的 main() 中，在进行任何输入/输出之前，运行

    if (!setlocale(LC_ALL, "")) {
        /* Current locale is not supported
           by the C library; abort. */
    }

正如评论所说，这告诉你的 C 库，这个程序是区域设置感知的，它应该按照用户设置的区域设置规则进行设置和准备。有关详细信息，请参阅 man 7 locale。本质上，C 库不会自动选取用户设置的当前语言环境，而是使用默认的 C/POSIX 语言环境。此命令告诉 C 库尝试并符合当前设置的语言环境。

在POSIX C中，每个FILE句柄都有一个方向，可以查询和设置（但只能在读取或写入之前）使用 fwide()。请注意，它是文件句柄的属性，而不是文件本身；它只确定 C 库是使用面向字节的 (normal/narrow) 还是宽字符函数来读取和写入流。如果您不调用它，C 库会尝试根据您用来访问流的第一个 read/write 函数自动调用它，如果已设置语言环境。但是，例如使用

    if (fwide(stdout, 1) <= 0) {
        /* The C library does not support wide-character
           orientation for standard output in this locale.
           Abort.
        */
    }

在语言环境设置之后，意味着您可以检测 C 库是否不支持用户语言环境，或者用户语言环境是否根本不支持特定流的宽字符；并中止程序。（最好告诉用户结果会是垃圾，总比默默地尽力而为，并可能混淆用户数据要好。毕竟，用户总是可以使用不同的工具；但默默地混淆用户数据意味着这个特定的工具根本不值得信赖：毫无价值。）

您不能在同一个流上混用 wprintf() 和 printf()，也不能混用 fwprintf() 和 fprintf()。这样做要么失败（不打印任何内容），要么混淆 C 库，要么产生乱码结果。同样，您不得在同一流中混用 fgetc() 和 fgetwc()。简单地说，你不能在同一个流上混用面向字节和面向宽字符的函数。

这并不意味着您不能将面向字节（或多字节）的字符串打印到面向宽字符的流，反之亦然；恰恰相反。它的工作原理非常合乎逻辑，%s 和 %c 总是指一个面向字节的字符串或字符，而 %ls 和 %lc 一个宽字符串或字符。例如，如果您有

const wchar_t *ws = L"Hello";
const char     *s = "world!";

您可以使用以下语句将它们都打印到面向字节的标准输出：

printf("%ls, %s\n", ws, s);

或使用以下语句将它们都打印到面向宽字符的标准输出：

wprintf(L"%ls, %s\n", ws, s);

这基本上是 POSIX C 库中的一个限制：您必须对面向字节的流使用面向字节的函数，对面向宽字符的流使用面向宽字符的函数。一开始可能会觉得奇怪，但如果你仔细想想，这是非常简单明了的规则。

让我们看一个与您大致相似的示例程序；扩展为使用任何换行符约定（CR、LF、CRLF、LFCR）从标准输入逐行读取（无限长度）字符串：

#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <locale.h>
#include <wchar.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

/* Function to read a wide-character line,
   using any newline convention (LF, CR, CRLF, LFCR),
   skipping embedded NUL characters (L'\0'),
   and dynamically reallocating the buffer as needed.

   lineptr, sizeptr: address of the caller's buffer pointer and of its
       capacity in wide characters. If *sizeptr == 0, any value in
       *lineptr is ignored and a new buffer is allocated. Both are
       updated whenever the buffer is grown; the caller frees *lineptr.
   in: stream to read from, using wide-character input functions.

   The newline sequence, if any, is stored in the buffer, and the result
   is always terminated with L'\0'.

   Returns the number of wide characters read.
   If an error occurs, returns zero, with errno set.
   At end of input, returns zero, with errno zero.
*/
size_t wide_line(wchar_t **lineptr, size_t *sizeptr, FILE *in)
{
    wchar_t *line;
    size_t   size, used = 0;
    wint_t   wc;

    if (!lineptr || !sizeptr || !in) {
        errno = EINVAL;
        return 0;
    }
    if (ferror(in)) {
        errno = EIO;
        return 0;
    }

    if (*sizeptr) {
        line = *lineptr;
        size = *sizeptr;
    } else {
        *lineptr = line = NULL;
        *sizeptr = size = 0;
    }

    while (1) {

        /* Each iteration stores at most two characters plus the
           terminator, so keep at least three free slots available. */
        if (used + 3 >= size) {
            wchar_t *grown;

            /* Conservative dynamic memory reallocation policy. */
            if (used < 126)
                size = 128;
            else
            if (used < 2097152)
                size = (used * 3) / 2;
            else
                size = (used | 1048575) + 1048579;

            /* Check for size overflow, both in elements and in bytes. */
            if (used + 2 >= size || size > (size_t)-1 / sizeof line[0]) {
                errno = ENOMEM;
                return 0;
            }

            /* On failure the old buffer stays valid and is still
               reachable by the caller through *lineptr. */
            grown = realloc(line, size * sizeof line[0]);
            if (!grown) {
                errno = ENOMEM;
                return 0;
            }
            line = grown;

            *lineptr = line;
            *sizeptr = size;
        }

        wc = fgetwc(in);
        if (wc == WEOF) {
            line[used] = L'\0';
            errno = 0;
            return used;

        } else
        if (wc == L'\n') {
            line[used++] = L'\n';

            /* Peek at the next character to detect the LFCR convention. */
            wc = fgetwc(in);
            if (wc == L'\r')
                line[used++] = L'\r';
            else
            if (wc != WEOF)
                ungetwc(wc, in);

            line[used] = L'\0';
            errno = 0;
            return used;

        } else
        if (wc == L'\r') {
            line[used++] = L'\r';

            /* Peek at the next character to detect the CRLF convention. */
            wc = fgetwc(in);
            if (wc == L'\n')
                line[used++] = L'\n';
            else
            if (wc != WEOF)
                ungetwc(wc, in);

            line[used] = L'\0';
            errno = 0;
            return used;
        } else
        if (wc != L'\0')
            line[used++] = wc;   /* Embedded NULs are silently dropped. */
    }
}

/* Returns a dynamically allocated wide string,
   with contents from a multibyte string.

   src: multibyte string, interpreted by mbstowcs(); may be NULL or
        empty, in which case an empty wide string is returned.

   Returns a newly allocated, L'\0'-terminated wide string that the
   caller must free(). On failure returns NULL with errno set to
   EILSEQ (src cannot be converted) or ENOMEM (allocation failed). */
wchar_t *dup_mbstowcs(const char *src)
{
    if (src && *src) {
        wchar_t *dst;
        size_t   len, check;

        /* First pass: count the wide characters needed. */
        len = mbstowcs(NULL, src, 0);
        if (len == (size_t)-1) {
            errno = EILSEQ;
            return NULL;
        }

        /* Guard the byte count of the allocation against overflow. */
        if (len + 1 > (size_t)-1 / sizeof *dst) {
            errno = ENOMEM;
            return NULL;
        }

        dst = malloc((len + 1) * sizeof *dst);
        if (!dst) {
            errno = ENOMEM;
            return NULL;
        }

        /* Second pass: convert, and verify the length matches the count. */
        check = mbstowcs(dst, src, len + 1);
        if (check != len) {
            free(dst);
            errno = EILSEQ;
            return NULL;
        }

        /* Be paranoid, and ensure the string is terminated. */
        dst[len] = L'\0';
        return dst;

    } else {
        wchar_t *empty;

        empty = malloc(sizeof *empty);
        if (!empty) {
            errno = ENOMEM;
            return NULL;
        }

        *empty = L'\0';
        return empty;
    }
}

/* Reads standard input line by line as wide characters and, for every
   command-line argument, looks for the first occurrence of that
   argument's first wide character in the line. On a match, the line is
   printed, followed by a line with the character aligned under the
   position where it was found.

   Returns EXIT_SUCCESS at end of input (or after printing usage when no
   arguments are given), EXIT_FAILURE on locale, stream-orientation,
   conversion, allocation, or read errors. */
int main(int argc, char *argv[])
{
    wchar_t **argw;
    wchar_t  *line = NULL;
    size_t    size = 0;
    size_t    len;
    int       arg;

    /* Adopt the user's locale; without this the C library stays in the
       default C/POSIX locale. */
    if (!setlocale(LC_ALL, "")) {
        fprintf(stderr, "Current locale is unsupported.\n");
        return EXIT_FAILURE;
    }

    /* Make stdin and stdout wide-oriented before any I/O is done on them.
       stderr is left alone, so the narrow fprintf() calls remain valid. */
    if (fwide(stdin, 1) <= 0) {
        fprintf(stderr, "Standard input does not support wide characters.\n");
        return EXIT_FAILURE;
    }

    if (fwide(stdout, 1) <= 0) {
        fprintf(stderr, "Standard output does not support wide characters.\n");
        return EXIT_FAILURE;
    }

    if (argc < 2) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s WIDE-CHARACTER [ WIDE-CHARACTER ... ]\n", argv[0]);
        fprintf(stderr, "\n");
        fprintf(stderr, "This program will look for the first instance of each wide character\n");
        fprintf(stderr, "in each line of input.\n");
        return EXIT_SUCCESS;
    }

    /* Convert command-line arguments to wide character strings. */
    argw = malloc((size_t)(argc + 1) * sizeof *argw);
    if (!argw) {
        fprintf(stderr, "Out of memory.\n");
        return EXIT_FAILURE;
    }
    for (arg = 0; arg < argc; arg++) {
        argw[arg] = dup_mbstowcs(argv[arg]);
        if (!argw[arg]) {
            fprintf(stderr, "Error converting argv[%d]: %s.\n", arg, strerror(errno));
            return EXIT_FAILURE;
        }
    }
    argw[argc] = NULL;

    while (1) {

        len = wide_line(&line, &size, stdin);
        if (!len) {
            if (errno) {
                fprintf(stderr, "Error reading standard input: %s.\n", strerror(errno));
                return EXIT_FAILURE;
            } else
            if (ferror(stdin)) {
                fprintf(stderr, "Error reading standard input.\n");
                return EXIT_FAILURE;
            }
            /* It was just an end of file, no error. */
            break;
        }

        for (arg = 1; arg < argc; arg++) {
            /* Skip empty arguments: there is no character to look for. */
            if (argw[arg][0] != L'\0') {
                wchar_t  *pos = wcschr(line, argw[arg][0]);
                if (pos) {
                    size_t  i = (size_t)(pos - line);

                    fputws(line, stdout);
                    /* A field width of i + 1 right-aligns the character
                       in column i, directly below its match. */
                    wprintf(L"%*lc\n", (int)(i + 1), argw[arg][0]);
                }
            }
        }

    }

    /* Because we are exiting the program,
       we don't *need* to free the buffers we used.
       However, this is completely safe,
       and this is the way you should free them. */
    free(line);
    line = NULL;
    size = 0;

    for (arg = 0; arg < argc; arg++)
        free(argw[arg]);
    free(argw);
    argw = NULL;

    return EXIT_SUCCESS;
}

因为 POSIX 没有标准化 getline() 的宽字符版本，我们将自己的变体实现为 wide_line()。它支持所有四种换行约定，并返回一个 size_t；如果发生错误，则返回 0（并设置 errno）。

由于通用换行支持，wide_line 不太适合交互式输入，因为它往往是一个字符 "late"。（对于行缓冲输入，终端往往如此，这意味着一整行延迟。）

我包含了 wide_line() 实现，因为它或非常类似的东西解决了读取在各种系统上编写的宽输入文件时的大部分问题。

当需要命令行参数为宽字符串时，dup_mbstowcs()函数最有用。它只是简单地转换为动态分配的缓冲区。本质上，argw[] 是 argv[] 数组的宽字符副本。

除了这两个函数和创建 argw[] 数组的代码外，根本没有太多代码。（随意挖走函数或整个代码，稍后在您自己的项目中使用；我认为代码在 Public Domain 中。）

如果你把上面的代码保存为 example.c，你可以使用例如以下命令编译它：

gcc -Wall -O2 example.c -o example

如果你运行例如

printf 'Sergio è un Italiano e andò via!\n' | ./example 'o' 'ò' 'è'

输出将是

Sergio è un Italiano e andò via!
     o
Sergio è un Italiano e andò via!
                          ò
Sergio è un Italiano e andò via!
       è

缩进"trick"是，如果i是您要打印宽字符的位置，那么(i+1)就是该逻辑字段的宽度。当我们使用 * 作为打印规范中的宽度字段时，宽度是从正在打印的实际参数之前的 int 参数读取的。

如何为使用含重音字符字符串的旧 DOS 函数找到变通方案（从扩展 ASCII 到 UTF-8）？

How am I allowed to workaround DOS functions that used strings containing accented characters (ASCII to UTF-8)?

c

string

wchar-t

char