C 中的 BOM 字节序
BOM endian in C
我知道可以借助 BOM(字节顺序标记)来判别大端序(Big Endian)和小端序(Little Endian)编码,但我不清楚在 C 语言中拿到一个文件后该如何进行判断。
00 00 FE FF -> UTF-32, big-endian
FF FE 00 00 -> UTF-32, little-endian
FE FF -> UTF-16, big-endian
FF FE -> UTF-16, little-endian
我有这段代码可以从文件中读取字节,但是假设文件以 BOM \xFF\xFE 或 \xFE\xFF 开头,我怎么知道它是小端还是大端的 UTF-16?
#include <stdio.h>
#include <stdlib.h>
/*
 * Dump every byte of the file named by argv[1] to stdout, one lowercase
 * hexadecimal value per line.
 *
 * Returns EXIT_FAILURE when the argument count is wrong or the file
 * cannot be opened, and 0 once the whole file has been dumped.
 */
int main(int argc, char *argv[]){
    unsigned char c;

    /* argv[1] is only meaningful when exactly one argument was given. */
    if (argc != 2){
        fprintf(stderr, "Usage: %s file\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* Binary mode so the raw bytes are seen untranslated on every platform. */
    FILE *f = fopen(argv[1], "rb");
    if (f == NULL){
        perror(argv[1]);
        return EXIT_FAILURE;
    }

    while (fread(&c, sizeof(char), 1, f) == 1){
        fprintf(stdout, "%x\n", c);
    }

    fclose(f);
    return 0;
}
包含此 BOM 的文件是什么样的? (在字节或常规文本中)?
我希望有人能帮帮忙。谢谢
我很困惑如何读取文件并测试包含 BOM 的第一个或多个字节是小端还是大端?我该怎么做?
也许有比这更巧妙的方法,但它似乎有效:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Report which UTF-16/UTF-32 byte order mark, if any, starts the file
 * named by argv[1].
 *
 * At most four bytes are read. The result is printed on stdout; usage
 * errors, open failures and unclassifiable sizes are reported on stderr
 * and yield EXIT_FAILURE. Otherwise 0 is returned.
 */
int main(int argc, char *argv[])
{
    unsigned char c[4];

    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s file\n", argv[0]);
        return EXIT_FAILURE;
    }

    FILE *f = fopen(argv[1], "rb"); // b for Windows; a no-op on Unix
    if (f == NULL)
    {
        fprintf(stderr, "%s: failed to open file %s for reading\n",
                argv[0], argv[1]);
        return EXIT_FAILURE;
    }

    size_t nbytes = fread(c, sizeof(char), sizeof(c), f);
    fclose(f);

    /*
     * The 4-byte marks are tested first because the UTF-32LE mark begins
     * with the same two bytes as the UTF-16LE mark. The 2-byte marks are
     * accepted for any read of at least two bytes, so a 3-byte file that
     * starts with a UTF-16 mark is still diagnosed.
     */
    if (nbytes == 4 && memcmp(c, "\xFF\xFE\x00\x00", 4) == 0)
        printf("UTF-32LE\n");
    else if (nbytes == 4 && memcmp(c, "\x00\x00\xFE\xFF", 4) == 0)
        printf("UTF-32BE\n");
    else if (nbytes >= 2 && memcmp(c, "\xFE\xFF", 2) == 0)
        printf("UTF-16BE\n");
    else if (nbytes >= 2 && memcmp(c, "\xFF\xFE", 2) == 0)
        printf("UTF-16LE\n");
    else if (nbytes == 4)
        printf("Four random (non-UTF) bytes 0x%.2X, 0x%.2X, 0x%.2X, 0x%.2X\n",
               c[0], c[1], c[2], c[3]);
    else if (nbytes == 2)
        printf("Two random (non-UTF) bytes 0x%.2X and 0x%.2X\n", c[0], c[1]);
    else
    {
        /* 0, 1 or 3 bytes with no recognisable mark. */
        fprintf(stderr, "%s: Odd-ball data size %zu (expected 2 or 4) - no diagnosis\n", argv[0], nbytes);
        return EXIT_FAILURE;
    }
    return 0;
}
我使用了一些自定义程序来创建我测试的数据,但结果对我来说足够有说服力:
$ cat chk.sh
# For every file matching utf-*: show its directory listing, a dump from
# odx with the second output line removed, and the verdict printed by utf61.
# NOTE(review): odx and utf61 are not defined here - presumably a hex dumper
# and the compiled BOM checker; confirm.
for file in utf-*
do
    ls -l "$file"
    odx "$file" | sed 2d
    # Quoted so the verdict stays one argument even if it contains spaces;
    # otherwise printf would repeat its format for the extra words.
    printf 'File: %-12s - content: %s\n' "$file" "$(utf61 "$file")"
done
$ sh chk.sh
-rw-r--r-- 1 jleffler rd 4 Sep 19 15:01 utf-16BE
0x0000: FE FF 00 30 ...0
File: utf-16BE - content: UTF-16BE
-rw-r--r-- 1 jleffler rd 2 Sep 19 15:01 utf-16BE.2
0x0000: FE FF ..
File: utf-16BE.2 - content: UTF-16BE
-rw-r--r-- 1 jleffler rd 4 Sep 19 15:01 utf-16LE
0x0000: FF FE 30 00 ..0.
File: utf-16LE - content: UTF-16LE
-rw-r--r-- 1 jleffler rd 2 Sep 19 15:01 utf-16LE.2
0x0000: FF FE ..
File: utf-16LE.2 - content: UTF-16LE
-rw-r--r-- 1 jleffler rd 4 Sep 19 15:01 utf-32BE
0x0000: 00 00 FE FF ....
File: utf-32BE - content: UTF-32BE
-rw-r--r-- 1 jleffler rd 4 Sep 19 15:01 utf-32LE
0x0000: FF FE 00 00 ....
File: utf-32LE - content: UTF-32LE
$
这是我正在为您做的,抱歉花了这么长时间:
#include <stdio.h>
#include <string.h>
/*
 * Print which UTF-16/UTF-32 byte order mark starts "file.txt".
 *
 * Returns 0 when a mark is recognised. Returns 1 when the file cannot be
 * opened, a read error occurs, fewer than two bytes are available, or the
 * leading bytes match none of the four marks. The stream is closed on
 * every path after a successful open.
 */
int main(void)
{
    /* Zero-filled so bytes beyond a short read are never indeterminate. */
    unsigned char bom[4] = {0};
    int status = 0;

    FILE *fp = fopen("file.txt", "rb");
    if (fp == NULL)
    {
        perror("fopen()");
        return 1; /* or EXIT_FAILURE, but would need <stdlib.h> */
    }

    size_t n = fread(bom, 1, sizeof bom, fp);

    if (ferror(fp) || n < 2)
    {
        fprintf(stderr, "Error occurred with fread() or file malformed.\n");
        status = 1;
    }
    /* 4-byte marks only apply when four bytes were actually read. */
    else if (n == 4 && memcmp(bom, "\x00\x00\xFE\xFF", 4) == 0)
        printf("UTF-32, big-endian.\n");
    else if (n == 4 && memcmp(bom, "\xFF\xFE\x00\x00", 4) == 0)
        printf("UTF-32, little-endian.\n");
    else if (memcmp(bom, "\xFE\xFF", 2) == 0)
        printf("UTF-16, big-endian.\n");
    else if (memcmp(bom, "\xFF\xFE", 2) == 0)
        printf("UTF-16, little-endian.\n");
    else
    {
        fprintf(stderr, "Malformed BOM.\n");
        status = 1;
    }

    fclose(fp);
    return status;
}
C Reading file with BOM to test if UTF16 LE or BE
OP 似乎还想区分其他编码:UTF-32 BE 和 UTF-32 LE。
一定要以二进制模式打开文件。如果以文本(text)模式打开,BOM 可能会被 fopen()
消耗掉,从而在后续的读取操作中看不到。
有多种 BOM 编码可以区分,已知的有 10 多种(参见 https://en.wikipedia.org/wiki/Byte_order_mark)。例如,让我们使用:UTF32BE、UTF16LE、UTF8。这些可以通过一次读取固定数量的若干字节,或者每次读取 1 个字节来确定。提示:它叫做“字节”顺序标记(byte order mark)。处理非常短的文件时需要格外小心。
下面的代码没有把候选 BOM 写死在代码逻辑里,而是把它们放进一个数据结构,以便清晰、易于扩展和维护。
#define BOM_MAX_LEN 5

/*
 * Identify the byte order mark at the beginning of stream f.
 *
 * The stream is rewound, up to BOM_MAX_LEN bytes are read, and the bytes
 * are compared against a table of known signatures in table order.
 *
 * Returns the name of the first matching entry ("UTF8", "UTF32LE",
 * "UTF32BE", "UTF16LE" or "UTF16BE") with the file position left just
 * after the mark. Returns NULL when nothing matches, with the file
 * position restored to the start so the caller can read all the data.
 */
const char *BOM_Name(FILE *f) {
  static const struct BOM {
    const char *name;  // Points at a string literal, hence const
    size_t length;     // Number of meaningful bytes in signature
    unsigned char signature[BOM_MAX_LEN];
  } BOM[] = { // Various UTF encodings
      { "UTF8", 3, { 0xEF, 0xBB, 0xBF } }, // UTF8
      // UTF32LE must precede UTF16LE: its first two bytes are the UTF16LE
      // signature, so the shorter entry would otherwise always win.
      { "UTF32LE", 4, { 0xFF, 0xFE, 0x00, 0x00 } }, // UTF32LE
      { "UTF32BE", 4, { 0x00, 0x00, 0xFE, 0xFF } }, // UTF32BE
      { "UTF16LE", 2, { 0xFF, 0xFE } }, // UTF16LE
      { "UTF16BE", 2, { 0xFE, 0xFF } }, // UTF16BE
      // Add others as desired. https://en.wikipedia.org/wiki/Byte_order_mark
      { NULL, 0, { 0 } } };
  unsigned char BOM_Signature[BOM_MAX_LEN];
  rewind(f);  // Only file beginning
  size_t length = fread(BOM_Signature, 1, BOM_MAX_LEN, f);
  for (size_t i = 0; BOM[i].length; i++) {
    // Skip entries longer than what was read, so short files are safe.
    if (length >= BOM[i].length
        && memcmp(BOM_Signature, BOM[i].signature, BOM[i].length) == 0) {
      fseek(f, (long) BOM[i].length, SEEK_SET);  // Leave file position to just after BOM
      return BOM[i].name;
    }
  }
  rewind(f);  // No BOM: do not swallow the bytes that were probed
  return NULL;
}
注意前面提到的非唯一性冲突:1) UTF16-LE BOM 后面紧跟一个 16 位的 0,与 2) UTF32-LE BOM,二者的字节序列完全相同。此方法会接受最先测试到的那个 BOM 编码。我建议在 UTF16-LE 之前列出 UTF32-LE,否则 UTF32-LE 永远不会被检测到。一个健壮的解决方案会更深入地检查文件内容来消除这种歧义。(此答案中未展示)
我知道可以借助 BOM(字节顺序标记)来判别大端序(Big Endian)和小端序(Little Endian)编码,但我不清楚在 C 语言中拿到一个文件后该如何进行判断。
00 00 FE FF -> UTF-32, big-endian
FF FE 00 00 -> UTF-32, little-endian
FE FF -> UTF-16, big-endian
FF FE -> UTF-16, little-endian
我有这段代码可以从文件中读取字节,但是假设文件以 BOM \xFF\xFE 或 \xFE\xFF 开头,我怎么知道它是小端还是大端的 UTF-16?
#include <stdio.h>
#include <stdlib.h>
/*
 * Dump every byte of the file named by argv[1] to stdout, one lowercase
 * hexadecimal value per line.
 *
 * Returns EXIT_FAILURE when the argument count is wrong or the file
 * cannot be opened, and 0 once the whole file has been dumped.
 */
int main(int argc, char *argv[]){
    unsigned char c;

    /* argv[1] is only meaningful when exactly one argument was given. */
    if (argc != 2){
        fprintf(stderr, "Usage: %s file\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* Binary mode so the raw bytes are seen untranslated on every platform. */
    FILE *f = fopen(argv[1], "rb");
    if (f == NULL){
        perror(argv[1]);
        return EXIT_FAILURE;
    }

    while (fread(&c, sizeof(char), 1, f) == 1){
        fprintf(stdout, "%x\n", c);
    }

    fclose(f);
    return 0;
}
包含此 BOM 的文件是什么样的? (在字节或常规文本中)? 我希望有人能帮帮忙。谢谢
我很困惑如何读取文件并测试包含 BOM 的第一个或多个字节是小端还是大端?我该怎么做?
也许有比这更巧妙的方法,但它似乎有效:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Report which UTF-16/UTF-32 byte order mark, if any, starts the file
 * named by argv[1].
 *
 * At most four bytes are read. The result is printed on stdout; usage
 * errors, open failures and unclassifiable sizes are reported on stderr
 * and yield EXIT_FAILURE. Otherwise 0 is returned.
 */
int main(int argc, char *argv[])
{
    unsigned char c[4];

    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s file\n", argv[0]);
        return EXIT_FAILURE;
    }

    FILE *f = fopen(argv[1], "rb"); // b for Windows; a no-op on Unix
    if (f == NULL)
    {
        fprintf(stderr, "%s: failed to open file %s for reading\n",
                argv[0], argv[1]);
        return EXIT_FAILURE;
    }

    size_t nbytes = fread(c, sizeof(char), sizeof(c), f);
    fclose(f);

    /*
     * The 4-byte marks are tested first because the UTF-32LE mark begins
     * with the same two bytes as the UTF-16LE mark. The 2-byte marks are
     * accepted for any read of at least two bytes, so a 3-byte file that
     * starts with a UTF-16 mark is still diagnosed.
     */
    if (nbytes == 4 && memcmp(c, "\xFF\xFE\x00\x00", 4) == 0)
        printf("UTF-32LE\n");
    else if (nbytes == 4 && memcmp(c, "\x00\x00\xFE\xFF", 4) == 0)
        printf("UTF-32BE\n");
    else if (nbytes >= 2 && memcmp(c, "\xFE\xFF", 2) == 0)
        printf("UTF-16BE\n");
    else if (nbytes >= 2 && memcmp(c, "\xFF\xFE", 2) == 0)
        printf("UTF-16LE\n");
    else if (nbytes == 4)
        printf("Four random (non-UTF) bytes 0x%.2X, 0x%.2X, 0x%.2X, 0x%.2X\n",
               c[0], c[1], c[2], c[3]);
    else if (nbytes == 2)
        printf("Two random (non-UTF) bytes 0x%.2X and 0x%.2X\n", c[0], c[1]);
    else
    {
        /* 0, 1 or 3 bytes with no recognisable mark. */
        fprintf(stderr, "%s: Odd-ball data size %zu (expected 2 or 4) - no diagnosis\n", argv[0], nbytes);
        return EXIT_FAILURE;
    }
    return 0;
}
我使用了一些自定义程序来创建我测试的数据,但结果对我来说足够有说服力:
$ cat chk.sh
# For every file matching utf-*: show its directory listing, a dump from
# odx with the second output line removed, and the verdict printed by utf61.
# NOTE(review): odx and utf61 are not defined here - presumably a hex dumper
# and the compiled BOM checker; confirm.
for file in utf-*
do
    ls -l "$file"
    odx "$file" | sed 2d
    # Quoted so the verdict stays one argument even if it contains spaces;
    # otherwise printf would repeat its format for the extra words.
    printf 'File: %-12s - content: %s\n' "$file" "$(utf61 "$file")"
done
$ sh chk.sh
-rw-r--r-- 1 jleffler rd 4 Sep 19 15:01 utf-16BE
0x0000: FE FF 00 30 ...0
File: utf-16BE - content: UTF-16BE
-rw-r--r-- 1 jleffler rd 2 Sep 19 15:01 utf-16BE.2
0x0000: FE FF ..
File: utf-16BE.2 - content: UTF-16BE
-rw-r--r-- 1 jleffler rd 4 Sep 19 15:01 utf-16LE
0x0000: FF FE 30 00 ..0.
File: utf-16LE - content: UTF-16LE
-rw-r--r-- 1 jleffler rd 2 Sep 19 15:01 utf-16LE.2
0x0000: FF FE ..
File: utf-16LE.2 - content: UTF-16LE
-rw-r--r-- 1 jleffler rd 4 Sep 19 15:01 utf-32BE
0x0000: 00 00 FE FF ....
File: utf-32BE - content: UTF-32BE
-rw-r--r-- 1 jleffler rd 4 Sep 19 15:01 utf-32LE
0x0000: FF FE 00 00 ....
File: utf-32LE - content: UTF-32LE
$
这是我正在为您做的,抱歉花了这么长时间:
#include <stdio.h>
#include <string.h>
/*
 * Print which UTF-16/UTF-32 byte order mark starts "file.txt".
 *
 * Returns 0 when a mark is recognised. Returns 1 when the file cannot be
 * opened, a read error occurs, fewer than two bytes are available, or the
 * leading bytes match none of the four marks. The stream is closed on
 * every path after a successful open.
 */
int main(void)
{
    /* Zero-filled so bytes beyond a short read are never indeterminate. */
    unsigned char bom[4] = {0};
    int status = 0;

    FILE *fp = fopen("file.txt", "rb");
    if (fp == NULL)
    {
        perror("fopen()");
        return 1; /* or EXIT_FAILURE, but would need <stdlib.h> */
    }

    size_t n = fread(bom, 1, sizeof bom, fp);

    if (ferror(fp) || n < 2)
    {
        fprintf(stderr, "Error occurred with fread() or file malformed.\n");
        status = 1;
    }
    /* 4-byte marks only apply when four bytes were actually read. */
    else if (n == 4 && memcmp(bom, "\x00\x00\xFE\xFF", 4) == 0)
        printf("UTF-32, big-endian.\n");
    else if (n == 4 && memcmp(bom, "\xFF\xFE\x00\x00", 4) == 0)
        printf("UTF-32, little-endian.\n");
    else if (memcmp(bom, "\xFE\xFF", 2) == 0)
        printf("UTF-16, big-endian.\n");
    else if (memcmp(bom, "\xFF\xFE", 2) == 0)
        printf("UTF-16, little-endian.\n");
    else
    {
        fprintf(stderr, "Malformed BOM.\n");
        status = 1;
    }

    fclose(fp);
    return status;
}
C Reading file with BOM to test if UTF16 LE or BE
OP 似乎还想区分其他编码:UTF-32 BE 和 UTF-32 LE。
一定要以二进制模式打开文件。如果以文本(text)模式打开,BOM 可能会被 fopen()
消耗掉,从而在后续的读取操作中看不到。
有多种 BOM 编码可以区分,已知的有 10 多种(参见 https://en.wikipedia.org/wiki/Byte_order_mark)。例如,让我们使用:UTF32BE、UTF16LE、UTF8。这些可以通过一次读取固定数量的若干字节,或者每次读取 1 个字节来确定。提示:它叫做“字节”顺序标记(byte order mark)。处理非常短的文件时需要格外小心。
下面的代码没有把候选 BOM 写死在代码逻辑里,而是把它们放进一个数据结构,以便清晰、易于扩展和维护。
#define BOM_MAX_LEN 5

/*
 * Identify the byte order mark at the beginning of stream f.
 *
 * The stream is rewound, up to BOM_MAX_LEN bytes are read, and the bytes
 * are compared against a table of known signatures in table order.
 *
 * Returns the name of the first matching entry ("UTF8", "UTF32LE",
 * "UTF32BE", "UTF16LE" or "UTF16BE") with the file position left just
 * after the mark. Returns NULL when nothing matches, with the file
 * position restored to the start so the caller can read all the data.
 */
const char *BOM_Name(FILE *f) {
  static const struct BOM {
    const char *name;  // Points at a string literal, hence const
    size_t length;     // Number of meaningful bytes in signature
    unsigned char signature[BOM_MAX_LEN];
  } BOM[] = { // Various UTF encodings
      { "UTF8", 3, { 0xEF, 0xBB, 0xBF } }, // UTF8
      // UTF32LE must precede UTF16LE: its first two bytes are the UTF16LE
      // signature, so the shorter entry would otherwise always win.
      { "UTF32LE", 4, { 0xFF, 0xFE, 0x00, 0x00 } }, // UTF32LE
      { "UTF32BE", 4, { 0x00, 0x00, 0xFE, 0xFF } }, // UTF32BE
      { "UTF16LE", 2, { 0xFF, 0xFE } }, // UTF16LE
      { "UTF16BE", 2, { 0xFE, 0xFF } }, // UTF16BE
      // Add others as desired. https://en.wikipedia.org/wiki/Byte_order_mark
      { NULL, 0, { 0 } } };
  unsigned char BOM_Signature[BOM_MAX_LEN];
  rewind(f);  // Only file beginning
  size_t length = fread(BOM_Signature, 1, BOM_MAX_LEN, f);
  for (size_t i = 0; BOM[i].length; i++) {
    // Skip entries longer than what was read, so short files are safe.
    if (length >= BOM[i].length
        && memcmp(BOM_Signature, BOM[i].signature, BOM[i].length) == 0) {
      fseek(f, (long) BOM[i].length, SEEK_SET);  // Leave file position to just after BOM
      return BOM[i].name;
    }
  }
  rewind(f);  // No BOM: do not swallow the bytes that were probed
  return NULL;
}
注意