C:使用 libiconv 将 ISO-8859-1 转换为 UTF-8
C: convert ISO-8859-1 to UTF-8 with libiconv
我需要把 Latin-1(ISO-8859-1)编码的字符转换为 UTF-8(以及反向转换)。
我最初参照了这个回答 "Is there a way to convert from UTF8 to iso-8859-1?" 来实现转换,并且可以正常工作;
现在我想改用 libiconv,它提供了完整的转换机制,应该能帮我简化代码。
我参照了这里提供的示例:
https://www.lemoda.net/c/iconv-example/iconv-example.html
并写了如下函数:
/*
 * iconvISO2UTF8 - run a NUL-terminated string through iconv() and return the
 * converted text in a freshly calloc'ed buffer.
 *
 * iso: input string; its length is measured with strlen().
 *
 * Returns the start of the output buffer on success (the caller releases it
 * with free()). Returns NULL, after printing a diagnostic to stderr, when the
 * conversion descriptor cannot be opened, the input is empty, the allocation
 * fails, or iconv() reports an error.
 *
 * NOTE(review): iconv_open(3) is declared as iconv_open(tocode, fromcode), so
 * the call below asks for a UTF-8 -> ISO-8859-1 descriptor, although the
 * function name and the 2x output sizing suggest ISO-8859-1 -> UTF-8.
 * NOTE(review): the encoding-name literals as shown contain U+2212 MINUS SIGN
 * instead of ASCII '-'; confirm against the real source file.
 * NOTE(review): the "return NULL" paths taken after a successful iconv_open()
 * do not call iconv_close(), and the iconv() failure path does not free the
 * output buffer.
 */
char *iconvISO2UTF8(char *iso) {
iconv_t iconvDesc = iconv_open ("ISO−8859-1", "UTF-8//TRANSLIT//IGNORE");
if (iconvDesc == (iconv_t) - 1) {
/* iconv_open() failed; EINVAL is reported as "conversion not available". */
if (errno == EINVAL)
fprintf(stderr, "conversion from '%s' to '%s' not available", "ISO−8859−1", "UTF-8");
else
fprintf(stderr, "LibIcon initialization failure");
return NULL;
}
size_t iconv_value;
char * utf8;
size_t len;
size_t utf8len;
char * utf8start;
/* NOTE(review): len_start is assigned below but never read. */
int len_start;
len = strlen (iso);
/* An empty input is treated as an error rather than converted to "". */
if (! len) {
fprintf(stderr, "iconvISO2UTF8: input String is empty.");
return NULL;
}
/*
 * Reserve two output bytes per input byte.
 * NOTE(review): no extra byte is reserved for a terminating NUL; the result
 * is only NUL-terminated (via calloc's zero fill) when fewer than 2 * len
 * bytes are written.
 */
utf8len = 2 * len;
utf8 = calloc (utf8len, sizeof (char));
if (! utf8) {
fprintf(stderr, "iconvISO2UTF8: Calloc failed.");
return NULL;
}
/* iconv() advances the pointers it is handed, so remember the buffer start. */
utf8start = utf8;
len_start = len;
iconv_value = iconv (iconvDesc, & iso, & len, & utf8, & utf8len);
/*
 * Handle failures. iso, len and utf8len are printed as iconv() left them,
 * i.e. they describe the part of the input/output not yet processed.
 */
if (iconv_value == (size_t) - 1) {
switch (errno) {
/* See "man 3 iconv" for an explanation. */
case EILSEQ:
fprintf(stderr, "iconv failed: Invalid multibyte sequence, in string '%s', length %d, out string '%s', length %d\n", iso, (int) len, utf8start, (int) utf8len);
break;
case EINVAL:
fprintf(stderr, "iconv failed: Incomplete multibyte sequence, in string '%s', length %d, out string '%s', length %d\n", iso, (int) len, utf8start, (int) utf8len);
break;
case E2BIG:
fprintf(stderr, "iconv failed: No more room, in string '%s', length %d, out string '%s', length %d\n", iso, (int) len, utf8start, (int) utf8len);
break;
default:
fprintf(stderr, "iconv failed, in string '%s', length %d, out string '%s', length %d\n", iso, (int) len, utf8start, (int) utf8len);
}
return NULL;
}
/* A failing iconv_close() is only logged; the converted text is still returned. */
if(iconv_close (iconvDesc) != 0) {
fprintf(stderr, "libicon close failed: %s", strerror (errno));
}
return utf8start;
}
当我用只含普通 ASCII 字符的字符串(如 "abracadabra")调用此函数时,iconv 工作正常。
但是一旦传入带重音符号的字符,比如 "éàèüöä",iconv() 调用就会失败并返回 EILSEQ 错误:
iconv failed: Invalid multibyte sequence, in string 'éàèüöä', length 6, out string '', length 12
下面是一个示例主程序:将它保存为 ISO-8859-1 编码的源文件,并在默认字符集为 ISO-8859-1 的 Linux 系统上编译后,运行时会崩溃:
/*
 * Example driver: converts one ASCII-only literal and one accented literal
 * with iconvISO2UTF8(), prints each result with puts() and frees it.
 *
 * NOTE(review): iconvISO2UTF8() returns NULL on failure, and the result is
 * handed to puts() without a NULL check.
 */
int main(int argc, char **argv) {
char *iso1 = "abracadabra";
char *utf = iconvISO2UTF8(iso1);
puts(utf);
free(utf);
/* Accented literal; its byte values depend on the encoding of this source file. */
char *iso2 = "éàèüöä";
utf = iconvISO2UTF8(iso2);
puts(utf);
free(utf);
}
可以用 iconv 完成这种转换吗?
如果可以,这段代码有什么问题?
请仔细阅读 iconv_open(3) 手册页:
/* Prototype from iconv_open(3): the target encoding (tocode) comes first, the source encoding (fromcode) second. */
iconv_t iconv_open(const char *tocode, const char *fromcode);
如果您要从 ISO 8859-1 转换为 UTF-8,那么下面这行与此不一致(tocode 和 fromcode 的位置写反了):
/* As posted: ISO-8859-1 sits in the tocode slot and UTF-8 in the fromcode slot. */
iconv_t iconvDesc = iconv_open ("ISO−8859-1", "UTF-8//TRANSLIT//IGNORE");
应该说
/* Corrected call: tocode (UTF-8) first, fromcode (ISO-8859-1) second; encoding names are spelled with ASCII '-'. */
iconv_t iconvDesc = iconv_open ("UTF-8//TRANSLIT//IGNORE", "ISO-8859-1");
我需要把 Latin-1(ISO-8859-1)编码的字符转换为 UTF-8(以及反向转换)。
我最初参照了这个回答 "Is there a way to convert from UTF8 to iso-8859-1?" 来实现转换,并且可以正常工作;
现在我想改用 libiconv,它提供了完整的转换机制,应该能帮我简化代码。
我参照了这里提供的示例: https://www.lemoda.net/c/iconv-example/iconv-example.html
并写了如下函数:
/*
 * iconvISO2UTF8 - run a NUL-terminated string through iconv() and return the
 * converted text in a freshly calloc'ed buffer.
 *
 * iso: input string; its length is measured with strlen().
 *
 * Returns the start of the output buffer on success (the caller releases it
 * with free()). Returns NULL, after printing a diagnostic to stderr, when the
 * conversion descriptor cannot be opened, the input is empty, the allocation
 * fails, or iconv() reports an error.
 *
 * NOTE(review): iconv_open(3) is declared as iconv_open(tocode, fromcode), so
 * the call below asks for a UTF-8 -> ISO-8859-1 descriptor, although the
 * function name and the 2x output sizing suggest ISO-8859-1 -> UTF-8.
 * NOTE(review): the encoding-name literals as shown contain U+2212 MINUS SIGN
 * instead of ASCII '-'; confirm against the real source file.
 * NOTE(review): the "return NULL" paths taken after a successful iconv_open()
 * do not call iconv_close(), and the iconv() failure path does not free the
 * output buffer.
 */
char *iconvISO2UTF8(char *iso) {
iconv_t iconvDesc = iconv_open ("ISO−8859-1", "UTF-8//TRANSLIT//IGNORE");
if (iconvDesc == (iconv_t) - 1) {
/* iconv_open() failed; EINVAL is reported as "conversion not available". */
if (errno == EINVAL)
fprintf(stderr, "conversion from '%s' to '%s' not available", "ISO−8859−1", "UTF-8");
else
fprintf(stderr, "LibIcon initialization failure");
return NULL;
}
size_t iconv_value;
char * utf8;
size_t len;
size_t utf8len;
char * utf8start;
/* NOTE(review): len_start is assigned below but never read. */
int len_start;
len = strlen (iso);
/* An empty input is treated as an error rather than converted to "". */
if (! len) {
fprintf(stderr, "iconvISO2UTF8: input String is empty.");
return NULL;
}
/*
 * Reserve two output bytes per input byte.
 * NOTE(review): no extra byte is reserved for a terminating NUL; the result
 * is only NUL-terminated (via calloc's zero fill) when fewer than 2 * len
 * bytes are written.
 */
utf8len = 2 * len;
utf8 = calloc (utf8len, sizeof (char));
if (! utf8) {
fprintf(stderr, "iconvISO2UTF8: Calloc failed.");
return NULL;
}
/* iconv() advances the pointers it is handed, so remember the buffer start. */
utf8start = utf8;
len_start = len;
iconv_value = iconv (iconvDesc, & iso, & len, & utf8, & utf8len);
/*
 * Handle failures. iso, len and utf8len are printed as iconv() left them,
 * i.e. they describe the part of the input/output not yet processed.
 */
if (iconv_value == (size_t) - 1) {
switch (errno) {
/* See "man 3 iconv" for an explanation. */
case EILSEQ:
fprintf(stderr, "iconv failed: Invalid multibyte sequence, in string '%s', length %d, out string '%s', length %d\n", iso, (int) len, utf8start, (int) utf8len);
break;
case EINVAL:
fprintf(stderr, "iconv failed: Incomplete multibyte sequence, in string '%s', length %d, out string '%s', length %d\n", iso, (int) len, utf8start, (int) utf8len);
break;
case E2BIG:
fprintf(stderr, "iconv failed: No more room, in string '%s', length %d, out string '%s', length %d\n", iso, (int) len, utf8start, (int) utf8len);
break;
default:
fprintf(stderr, "iconv failed, in string '%s', length %d, out string '%s', length %d\n", iso, (int) len, utf8start, (int) utf8len);
}
return NULL;
}
/* A failing iconv_close() is only logged; the converted text is still returned. */
if(iconv_close (iconvDesc) != 0) {
fprintf(stderr, "libicon close failed: %s", strerror (errno));
}
return utf8start;
}
当我用只含普通 ASCII 字符的字符串(如 "abracadabra")调用此函数时,iconv 工作正常。 但是一旦传入带重音符号的字符,比如 "éàèüöä",iconv() 调用就会失败并返回 EILSEQ 错误:
iconv failed: Invalid multibyte sequence, in string 'éàèüöä', length 6, out string '', length 12
下面是一个示例主程序:将它保存为 ISO-8859-1 编码的源文件,并在默认字符集为 ISO-8859-1 的 Linux 系统上编译后,运行时会崩溃:
/*
 * Example driver: converts one ASCII-only literal and one accented literal
 * with iconvISO2UTF8(), prints each result with puts() and frees it.
 *
 * NOTE(review): iconvISO2UTF8() returns NULL on failure, and the result is
 * handed to puts() without a NULL check.
 */
int main(int argc, char **argv) {
char *iso1 = "abracadabra";
char *utf = iconvISO2UTF8(iso1);
puts(utf);
free(utf);
/* Accented literal; its byte values depend on the encoding of this source file. */
char *iso2 = "éàèüöä";
utf = iconvISO2UTF8(iso2);
puts(utf);
free(utf);
}
可以用 iconv 完成这种转换吗? 如果可以,这段代码有什么问题?
请仔细阅读 iconv_open(3) 手册页:
/* Prototype from iconv_open(3): the target encoding (tocode) comes first, the source encoding (fromcode) second. */
iconv_t iconv_open(const char *tocode, const char *fromcode);
如果您要从 ISO 8859-1 转换为 UTF-8,那么下面这行与此不一致(tocode 和 fromcode 的位置写反了):
/* As posted: ISO-8859-1 sits in the tocode slot and UTF-8 in the fromcode slot. */
iconv_t iconvDesc = iconv_open ("ISO−8859-1", "UTF-8//TRANSLIT//IGNORE");
应该说
/* Corrected call: tocode (UTF-8) first, fromcode (ISO-8859-1) second; encoding names are spelled with ASCII '-'. */
iconv_t iconvDesc = iconv_open ("UTF-8//TRANSLIT//IGNORE", "ISO-8859-1");