在 cURL 中使用 UTF-8 数据流，数据流何时从 UTF-8 转换为 ASCII？

Question

使用 cURL 从 Internet 获取字符流时，数据流何时从多字节数据类型转换为单字节字符数组？

我写了一个程序，它似乎可以在回调函数中使用 ASCII。

但是，我编写了另一个使用 UTF-8 和 wchar_t 数据类型的程序，它似乎也可以工作。数据流似乎没有区分这两种数据类型，即使 wchar_t 类型在我的机器上是 4 个字节而 char 是 1 个字节。

我猜想存在某种对程序透明的类型转换，但我并不确定（我认为在 UTF-8 中，ASCII 字符仍然只占用 1 个字节的内存；而当程序使用 wchar_t 数据类型时，系统会用零填充常规 ASCII 字符，把它们扩展为 4 个字节，但这一步并不是由程序员实现的……）。

#include "multicurl.h"

#define MAX_WAIT_MSECS 5*1000 /* Wait max. 5 seconds */

/*
 * libcurl write callback: appends one chunk of the response body to the
 * growing buffer held in the MemType passed through CURLOPT_WRITEDATA.
 *
 * ptr      - chunk handed over by libcurl; it is not zero-terminated.  The
 *            parameter is declared wchar_t* in this program, but the data is
 *            only ever copied as raw bytes, never read as wide characters.
 * size     - size of one item in the chunk.
 * nmemb    - number of items in the chunk.
 * userdata - the MemType registered for this transfer.
 *
 * Returns size * nmemb, i.e. the number of bytes consumed.  If the buffer
 * cannot be grown the program prints a message and exits.
 *
 * mem->size counts bytes.  All offsets are therefore computed in bytes:
 * dividing by sizeof(wchar_t) truncates whenever a chunk length is not a
 * multiple of sizeof(wchar_t), which made the next chunk overwrite bytes
 * that had already been received.
 */
static size_t write_callback(wchar_t *ptr, size_t size, size_t nmemb, void *userdata){
    size_t chunk_bytes = size * nmemb;
    MemType *mem = (MemType *)userdata;

    /* Two wchar_t of zero padding: wherever the data ends, a reader that walks
       the buffer one wchar_t at a time still meets a complete zero element
       inside the allocation, and a byte-wise reader meets a zero byte. */
    size_t tail_bytes = 2 * sizeof(wchar_t);

    wchar_t *grown = realloc(mem->memory, mem->size + chunk_bytes + tail_bytes);
    if (grown == NULL){
        printf("Not Enough Memory, realloc returned NULL.\n");
        exit(EXIT_FAILURE);
    }
    mem->memory = grown;

    /* Append at the exact byte offset; this overwrites the previous padding. */
    char *bytes = (char *)grown;
    memcpy(bytes + mem->size, ptr, chunk_bytes);
    mem->size += chunk_bytes;
    memset(bytes + mem->size, 0, tail_bytes);

    return chunk_bytes;/* libcurl compares this with the amount it delivered. */
}

/*
 * Create an easy handle for one request and attach it to the multi handle.
 *
 * mh       - multi handle the new easy handle is added to.
 * utf8_url - request URL as a wide string; it is converted to a multibyte
 *            string with wcstombs() using the current locale.
 * output   - receives the response.  It is reset to an empty, zero-terminated
 *            buffer here and registered as CURLOPT_WRITEDATA.
 *
 * Always returns NULL; results are delivered through output.  When the URL
 * cannot be converted, memory runs out, or the easy handle cannot be created,
 * nothing is added to mh and output stays empty.
 */
void *SetUpCurlHandle(CURLM * mh, wchar_t *utf8_url, MemType *output){
    /* Put output into a defined state before anything else can fail. */
    output->size = 0;
    output->memory = malloc( sizeof( wchar_t ) );
    if (output->memory == NULL){
        fprintf(stderr, "error: out of memory while preparing the output buffer\n");
        return NULL;
    }
    output->memory[0] = 0;

    /* One wide character converts to at most MB_CUR_MAX bytes, so this bound
       always leaves room for the terminator.  Sizing the buffer by wcslen()
       alone overflows as soon as the URL contains a non-ASCII character. */
    size_t url_capacity = wcslen( utf8_url ) * MB_CUR_MAX + 1;
    char *url = malloc( url_capacity );
    if (url == NULL){
        fprintf(stderr, "error: out of memory while converting the URL\n");
        return NULL;
    }
    if (wcstombs(url, utf8_url, url_capacity) == (size_t)-1){
        fprintf(stderr, "error: the URL cannot be represented in the current locale\n");
        free(url);
        return NULL;
    }

    CURL *hnd = curl_easy_init();
    if(hnd){
        // Setup the cURL options.
        curl_easy_setopt(hnd, CURLOPT_BUFFERSIZE, 102400L);
        curl_easy_setopt(hnd, CURLOPT_URL, url);// Set the request URL
        curl_easy_setopt(hnd, CURLOPT_NOPROGRESS, 1L);
        curl_easy_setopt(hnd, CURLOPT_USERAGENT, "curl/7.80.0");
        curl_easy_setopt(hnd, CURLOPT_MAXREDIRS, 50L);
        curl_easy_setopt(hnd, CURLOPT_HTTP_VERSION, (long)CURL_HTTP_VERSION_2TLS);
        curl_easy_setopt(hnd, CURLOPT_FTP_SKIP_PASV_IP, 1L);
        curl_easy_setopt(hnd, CURLOPT_TCP_KEEPALIVE, 1L);
        curl_easy_setopt(hnd, CURLOPT_WRITEFUNCTION, write_callback);// Callback that stores the body.
        curl_easy_setopt(hnd, CURLOPT_WRITEDATA, (void *)output);// Passed to the callback as userdata.
        //curl_easy_setopt(hnd, CURLOPT_VERBOSE, 1);

        /* A handle that could not be attached would otherwise never be freed. */
        if (curl_multi_add_handle(mh, hnd) != CURLM_OK){
            curl_easy_cleanup(hnd);
        }
    }

    /* libcurl keeps its own copy of string options, so the converted URL is
       no longer needed once it has been set (or was never used). */
    free(url);
    return NULL;// The output struct was passed by reference, nothing to return.
}

/* Run libcurl's global initialisation with CURL_GLOBAL_ALL, then create a
   multi handle and hand it straight back to the caller. */
CURLM *SetUpMultiCurlHandle(){
    curl_global_init(CURL_GLOBAL_ALL);
    return curl_multi_init();
}

/*
 * Drive every transfer attached to the multi handle until none is running,
 * then release the finished easy handles, the multi handle and libcurl's
 * global state.
 *
 * mh - multi handle whose easy handles were added beforehand.
 *
 * Always returns NULL.  If curl_multi_wait() fails the function reports the
 * error and returns immediately, without performing any of the cleanup.
 */
void *PerformMultiCurl(CURLM * mh)
{
    int still_running = 0;

    curl_multi_perform(mh, &still_running);// Start the requests.
    do {
        /* Wait for activity (at most MAX_WAIT_MSECS), then let libcurl make
           progress.  Without this loop the function would fall through to the
           cleanup below while transfers are still in flight. */
        int numfds = 0;
        int res = curl_multi_wait(mh, NULL, 0, MAX_WAIT_MSECS, &numfds);
        if(res != CURLM_OK) {
            fprintf(stderr, "error: curl_multi_wait() returned %d\n", res);
            return NULL;
        }
        curl_multi_perform(mh, &still_running);
    } while(still_running);

    /* Collect the completion messages and free each finished easy handle. */
    CURLMsg *msg = NULL;
    int msgs_left = 0;
    while ((msg = curl_multi_info_read(mh, &msgs_left))) {
        if (msg->msg != CURLMSG_DONE) {
            fprintf(stderr, "error: after curl_multi_info_read(), CURLMsg=%d\n", msg->msg);
            continue;
        }

        /* Copy what is needed out of the message before the handle is removed. */
        CURL *hnd = msg->easy_handle;
        CURLcode return_code = msg->data.result;

        if(return_code != CURLE_OK) {
            fprintf(stderr, "CURL error code: %d\n", (int)return_code);
        }

        /* A failed transfer is released as well; skipping this step on error
           left the handle attached and leaked it. */
        curl_multi_remove_handle(mh, hnd);
        curl_easy_cleanup(hnd);
    }

    curl_multi_cleanup(mh);
    curl_global_cleanup();
    return NULL;
}

可以找到此程序的完整 UTF-8 变体

Answer 1

如您所料，它并不能正常工作。libcurl 无法知道您的回调函数期望的是 wchar_t*，而它本应期望 char*。

如果您检查 MyOutputStruct1.memory[0]，您会发现它并不包含应有的内容。例如，请求 https://stackoverflow.com 时，它的值是 0x4f44213c。这显然是错误的，因为它远远超出了有效代码点的范围。实际上，这是前四个字符（<!DO）按小端（LE）字节序被挤进了同一个 wchar_t。

由于第二个错误，它似乎可以工作。打印宽字符串时，需要使用%ls，而不是%s.

// NOTE(review): kept as-is because this line illustrates the bug. The surrounding
// answer says memory is a wchar_t*, and a wide string needs %ls rather than %s.
wprintf(L"Output:\n%s\n", MyOutputStruct1.memory);

应该是

// Corrected forms: %ls is the conversion for a wide-character string argument.
printf("Output:\n%ls\n", MyOutputStruct1.memory);
// -or-
wprintf(L"Output:\n%ls\n", MyOutputStruct1.memory);

基本上，这段代码需要的是 char*。指针的类型虽然是 wchar_t*，但在各处都被当作 char* 使用。因此，在有问题的程序中，这两个错误大体上相互“抵消”了。（我没有验证，但我预计当输入长度不能被 sizeof(wchar_t) 整除时会出问题。）如果该指针真的被当作 wchar_t* 使用（例如检查了它的元素，或者把它传给了 w 系列函数），问题就会很明显。

Answer 2

如评论部分所述，所有这一切真正需要的是一个 UTF-8 解析器。字符可以保存 UTF-8，但如果不将它们转换为某种其他数据类型，我们就无法轻松地单独寻址每个字符 [某些 UTF-8 字符大于 1 个字节]。所以我在libutf-8的帮助下写了一个解析器。

/* gcc unicode.c -o unicode -lutf-8 
This program makes use of libutf-8.
http://www.whizkidtech.redprince.net/i18n/
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <locale.h>
#include <utf-8.h>

/*
 * Decode a NUL-terminated UTF-8 byte string into a newly allocated,
 * zero-terminated array holding one element per decoded character, so that
 * every character can be addressed individually.
 *
 * len          - receives the number of decoded characters, not counting the
 *                terminating zero; set to 0 on failure.
 * input_stream - the UTF-8 bytes to decode.
 *
 * Returns the array (the caller frees it), or NULL when memory cannot be
 * allocated or the decoder reports a step that is not positive or that would
 * run past the end of the input.
 */
int* parse_UTF8_bitstream( size_t *len, const char *input_stream )
{
    *len = 0;

    /* Every accepted step consumes at least one input byte, so there can never
       be more characters than bytes; one extra element holds the terminator.
       calloc zero-fills the array and guards the size multiplication. */
    size_t nbytes = strlen( input_stream );
    unsigned int *output = calloc( nbytes + 1, sizeof *output );
    if ( output == NULL ) return NULL;

    size_t pos = 0;   /* byte offset into input_stream */
    size_t count = 0; /* characters decoded so far */
    while ( pos < nbytes ){
        /* sgetu8 is expected to store the length of the sequence it decoded
           in n; start from 0 so a call that leaves n untouched is detected. */
        int n = 0;
        output[ count ] = sgetu8( (unsigned char *) &input_stream[ pos ], &n );

        /* Refuse steps that would loop forever, walk backwards, or jump past
           the terminating NUL. */
        if ( n <= 0 || (size_t) n > nbytes - pos ){
            free( output );
            return NULL;
        }

        pos += (size_t) n;
        count++;
    }

    output[ count ] = 0; /* terminate the wide string */
    *len = count;
    return (int*)output;
}

/*
 * Print a UTF-8 string and its length in bytes, decode it with
 * parse_UTF8_bitstream(), then print the decoded string, its length in
 * characters, and each character on its own line.
 *
 * s - NUL-terminated UTF-8 string.
 */
void process_string(const char *s)
{
    printf("%s\n", s);
    printf("LENGTH: %zu 1-Byte Characters\n\n", strlen( s )); /* %zu matches size_t */

    size_t len;
    int* outputstream = parse_UTF8_bitstream( &len, s );

    /* Defensive: a NULL result can be neither printed nor indexed. */
    if ( outputstream == NULL ){
        fprintf(stderr, "error: the string could not be decoded\n");
        return;
    }

    /* %ls takes a wchar_t*.  The cast is only valid where wchar_t has the same
       size as int, which the original code already relied on. */
    printf("\n%ls\n", (wchar_t *) outputstream);
    printf("LENGTH: %zu Wide Characters\n", len);
    for(size_t i = 0; i<len; i++){
        printf("%lc\n", outputstream[ i ]);
    }

    free ( outputstream );
}


/* Runs process_string() on "Hello World" written in Arabic, Russian and
   Greek, after selecting the locale from the environment. */
int main(void)
{
    const char *greetings[] = {
        "مرحبا بالعالم",
        "Всем привет",
        "Γεια σου κόσμε",
    };
    size_t greeting_count = sizeof greetings / sizeof greetings[ 0 ];
    size_t idx;

    setlocale(LC_ALL, "");

    for ( idx = 0; idx < greeting_count; idx++ ){
        process_string( greetings[ idx ] );
    }

    return 0;
}

Answer 3

这与我之前发布的程序相同，但是，它不需要任何特殊的库。它使用标准库中的 mbtowc() 函数。

来自 mbtowc() 手册页：

#include <stdlib.h>

int mbtowc(wchar_t * restrict wcharp, const char * restrict mbchar, size_t nbytes);

If mbchar is NULL, the mbtowc() function returns nonzero if shift states are supported, zero otherwise.

Otherwise, if mbchar is not a null pointer, mbtowc() either returns 0 if mbchar represents the null wide character, or returns the number of bytes processed in mbchar, or returns -1 if no multibyte character could be recognized or converted. In this case, mbtowc()'s internal conversion state is undefined.

/* cc unicode.c -o unicode  */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <locale.h>


/*
 * Decode a NUL-terminated multibyte string (UTF-8 when the current locale is
 * a UTF-8 locale) into a newly allocated, zero-terminated wchar_t array, so
 * that each character can be addressed individually.
 *
 * output_stream - receives the allocated array; the caller frees it.  After
 *                 a -1 return it is either NULL (no memory) or a terminated
 *                 array holding the characters decoded before the error.
 * input_stream  - the multibyte string to decode.
 *
 * Returns the number of wide characters, not counting the terminating zero,
 * or -1 if the input is not valid in the current locale or memory cannot be
 * allocated.
 */
int parse_UTF8_bitstream(wchar_t **output_stream, const char *input_stream )
{
    size_t nbytes = strlen( input_stream );

    /* Each wide character consumes at least one input byte, so nbytes + 1
       elements always suffice.  calloc zero-fills the array, which keeps it
       terminated at every point, and guards the size multiplication. */
    wchar_t *wide = calloc( nbytes + 1, sizeof *wide );
    output_stream[ 0 ] = wide;
    if ( wide == NULL ) return -1;

    /* Put mbtowc's internal conversion state back to its initial state; a
       previous failed call leaves it undefined. */
    mbtowc( NULL, NULL, 0 );

    size_t pos = 0; /* byte offset into input_stream */
    int len = 0;    /* wide characters produced so far */
    while ( pos < nbytes ){
        /* Let mbtowc look at no more than one maximal character, and never
           past the end of the string. */
        size_t limit = nbytes - pos;
        if ( limit > (size_t) MB_CUR_MAX ) limit = (size_t) MB_CUR_MAX;

        int consumed = mbtowc( &wide[ len ], &input_stream[ pos ], limit );
        if ( consumed <= 0 ){
            wide[ len ] = 0; /* keep what was decoded so far terminated */
            return -1;
        }

        pos += (size_t) consumed;
        len++;
    }

    return len;
}

/*
 * Print a multibyte string and its length in bytes, decode it with
 * parse_UTF8_bitstream(), then print the wide string, its length in wide
 * characters, and each wide character on its own line.  Exits the program
 * with EXIT_FAILURE when the parser reports an error.
 *
 * s - NUL-terminated multibyte string.
 */
void process_string(const char *s)
{
    printf("\n%s\n", s);
    printf("LENGTH: %zu 1-Byte Characters\n\n", strlen( s )); /* %zu matches size_t */

    wchar_t* outputstream = NULL;

    /* Keep the parser's int result as an int: storing it in a size_t turned
       the -1 error value into a huge positive length. */
    int len = parse_UTF8_bitstream( &outputstream, s );

    if( len < 0 ) {
        printf("\nThe parser received invalid unicode.\n");
        free ( outputstream );
        exit ( EXIT_FAILURE );
    }

    printf("%ls\n", outputstream);
    printf("LENGTH: %d 4-Byte Wide-Characters\n", len);

    for(int i = 0; i<len; i++){
        printf("%lc\n", outputstream[ i ]);
    }

    free ( outputstream );
}


/*  Feeds "Hello World" in Arabic, Russian, Greek, Georgian, Japanese,
    Chinese and Korean to process_string(), followed by an emoji string.

    The emoji string is there as an illustration: its glyphs appear to take
    more than 4 bytes each, probably because several Unicode scalar values
    are combined into one glyph.

    Asian and emoji characters only display with suitable fonts installed.
    The noto-2.0 meta package on FreeBSD provides them and is also
    available on Linux.
*/
int main ( void )
{
    const char *samples[] = {
        "مرحبا بالعالم",
        "Всем привет",
        "Γεια σου κόσμε",
        "გამარჯობა სამყაროვ",
        "ハローワールド",
        "世界您好",
        "전 세계 여러분 안녕하세요",
        "️️️️",
    };
    size_t sample_count = sizeof samples / sizeof samples[ 0 ];
    size_t idx;

    setlocale(LC_ALL, "");

    for ( idx = 0; idx < sample_count; idx++ ){
        process_string( samples[ idx ] );
    }

    return 0;
    /* Don't use this code to violate anyone. */
}

在 cURL 中使用 UTF-8 数据流，数据流何时从 UTF-8 转换为 ASCII？

Using UTF-8 Datastreams in cURL, when is the datastream converted from UTF-8 to ASCII?

c

utf-8

libcurl