在C中将整数的二进制文件读取为十进制和十六进制值

Reading binary file of integers as decimal and hexadecimal values in C

我正在尝试读取 C 中的二进制文件 datafile。该二进制文件显然包含 32 位(4 字节)整数。我被告知二进制文件是使用以下代码片段生成的:

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>
.....
    // Open "datafile" for writing, creating it with mode 0600 if it does not
    // exist. O_TRUNC is not specified, so existing contents are not discarded.
    int  fd = open("datafile", O_CREAT|O_WRONLY, 0600);

    if(fd >= 0) {        //  IFF FILE OPENED SUCCESSFULLY
        // Write the 100 values -50..49, each as the raw in-memory bytes of an
        // int32_t, i.e. in the byte order of the machine running this code.
        for(int32_t i = -50 ; i<50 ; i++) {
            write(fd, &i, sizeof(i));   // return value is not checked
        }
        close(fd);
    }

我得到了以下用于读取二进制文件的代码:

#include <stdio.h>
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/*
 * Reads up to 100 int32_t values from "datafile" and prints, per line, the
 * loop counter (-50..49) and the value read, each in decimal and in hex.
 *
 * NOTE: the result of read() is ignored and `value` has no initializer, so
 * when read() delivers fewer than sizeof(value) bytes (short file, EOF or an
 * error) the bytes it did not fill are indeterminate and are printed anyway.
 * If open() fails, nothing is printed and the program still returns 0.
 */
int main(void) {

        int fd = open("datafile", O_RDONLY, 0);

        if(fd >= 0) {
                for(int32_t i = -50; i < 50; i++) {
                        int32_t value;

                        /* Bytes are copied into value as-is: no byte-order conversion. */
                        read(fd, &value, sizeof(value));
                        printf("%4i\t0x%08x\t%10i\t0x%08x\n", i, i, value, value);
                }
                close(fd);
        }
return 0;
}

有人告诉我这段代码应该将二进制文件的值打印为十进制和十六进制值。我的输出如下:

 -50    0xffffffce         255  0x000000ff
 -49    0xffffffcf         255  0x000000ff
 -48    0xffffffd0         255  0x000000ff
 -47    0xffffffd1         255  0x000000ff
 -46    0xffffffd2         255  0x000000ff
 -45    0xffffffd3         255  0x000000ff
 -44    0xffffffd4         255  0x000000ff
 -43    0xffffffd5         255  0x000000ff
 -42    0xffffffd6         255  0x000000ff
 -41    0xffffffd7         255  0x000000ff
 -40    0xffffffd8         255  0x000000ff
 -39    0xffffffd9         255  0x000000ff
 -38    0xffffffda         255  0x000000ff
 -37    0xffffffdb         255  0x000000ff
 -36    0xffffffdc         255  0x000000ff
 -35    0xffffffdd         255  0x000000ff
 -34    0xffffffde         255  0x000000ff
 -33    0xffffffdf         255  0x000000ff
 -32    0xffffffe0         255  0x000000ff
 -31    0xffffffe1         255  0x000000ff
 -30    0xffffffe2         255  0x000000ff
 -29    0xffffffe3         255  0x000000ff
 -28    0xffffffe4         255  0x000000ff
 -27    0xffffffe5         255  0x000000ff
 -26    0xffffffe6         255  0x000000ff
 -25    0xffffffe7         255  0x000000ff
 -24    0xffffffe8         255  0x000000ff
 -23    0xffffffe9         255  0x000000ff
 -22    0xffffffea         255  0x000000ff
 -21    0xffffffeb         255  0x000000ff
 -20    0xffffffec         255  0x000000ff
 -19    0xffffffed         255  0x000000ff
 -18    0xffffffee         255  0x000000ff
 -17    0xffffffef         255  0x000000ff
 -16    0xfffffff0         255  0x000000ff
 -15    0xfffffff1         255  0x000000ff
 -14    0xfffffff2         255  0x000000ff
 -13    0xfffffff3         255  0x000000ff
 -12    0xfffffff4         255  0x000000ff
 -11    0xfffffff5         255  0x000000ff
 -10    0xfffffff6         255  0x000000ff
  -9    0xfffffff7         255  0x000000ff
  -8    0xfffffff8         255  0x000000ff
  -7    0xfffffff9         255  0x000000ff
  -6    0xfffffffa         255  0x000000ff
  -5    0xfffffffb         255  0x000000ff
  -4    0xfffffffc         255  0x000000ff
  -3    0xfffffffd         255  0x000000ff
  -2    0xfffffffe         255  0x000000ff
  -1    0xffffffff         255  0x000000ff
   0    0x00000000         255  0x000000ff
   1    0x00000001         255  0x000000ff
   2    0x00000002         255  0x000000ff
   3    0x00000003         255  0x000000ff
   4    0x00000004         255  0x000000ff
   5    0x00000005         255  0x000000ff
   6    0x00000006         255  0x000000ff
   7    0x00000007         255  0x000000ff
   8    0x00000008         255  0x000000ff
   9    0x00000009         255  0x000000ff
  10    0x0000000a         255  0x000000ff
  11    0x0000000b         255  0x000000ff
  12    0x0000000c         255  0x000000ff
  13    0x0000000d         255  0x000000ff
  14    0x0000000e         255  0x000000ff
  15    0x0000000f         255  0x000000ff
  16    0x00000010         255  0x000000ff
  17    0x00000011         255  0x000000ff
  18    0x00000012         255  0x000000ff
  19    0x00000013         255  0x000000ff
  20    0x00000014         255  0x000000ff
  21    0x00000015         255  0x000000ff
  22    0x00000016         255  0x000000ff
  23    0x00000017         255  0x000000ff
  24    0x00000018         255  0x000000ff
  25    0x00000019         255  0x000000ff
  26    0x0000001a         255  0x000000ff
  27    0x0000001b         255  0x000000ff
  28    0x0000001c         255  0x000000ff
  29    0x0000001d         255  0x000000ff
  30    0x0000001e         255  0x000000ff
  31    0x0000001f         255  0x000000ff
  32    0x00000020         255  0x000000ff
  33    0x00000021         255  0x000000ff
  34    0x00000022         255  0x000000ff
  35    0x00000023         255  0x000000ff
  36    0x00000024         255  0x000000ff
  37    0x00000025         255  0x000000ff
  38    0x00000026         255  0x000000ff
  39    0x00000027         255  0x000000ff
  40    0x00000028         255  0x000000ff
  41    0x00000029         255  0x000000ff
  42    0x0000002a         255  0x000000ff
  43    0x0000002b         255  0x000000ff
  44    0x0000002c         255  0x000000ff
  45    0x0000002d         255  0x000000ff
  46    0x0000002e         255  0x000000ff
  47    0x0000002f         255  0x000000ff
  48    0x00000030         255  0x000000ff
  49    0x00000031         255  0x000000ff

但实际输出应该是这样的:

 -50    0xffffffce      -822083585      0xceffffff
 -49    0xffffffcf      -805306369      0xcfffffff
 -48    0xffffffd0      -788529153      0xd0ffffff
.....
  -3    0xfffffffd       -33554433      0xfdffffff
  -2    0xfffffffe       -16777217      0xfeffffff
  -1    0xffffffff              -1      0xffffffff
   0    0x00000000               0      0x00000000
   1    0x00000001        16777216      0x01000000
   2    0x00000002        33554432      0x02000000
   3    0x00000003        50331648      0x03000000
.....
  47    0x0000002f       788529152      0x2f000000
  48    0x00000030       805306368      0x30000000
  49    0x00000031       822083584      0x31000000

如您所见,尽管我的前两列符合预期,但最后两列与它们应有的值完全不同。我的最后两列只是重复打印出相同的两个值:255 和 0x000000ff。这是怎么回事,我该如何解决?

正在将评论转化为答案。

JL: I don't see how you can get the 'expected output' from the input data if the file was written on the same machine that it is being read on. There seems to be an expectation that the file was generated on a little-endian machine and read on a big-endian machine or vice versa.

TP: I've never heard of this "endian" concept. I am using macOS, if that helps.

JL: Do you know who created the data file? Was it you on your Mac, or did someone else (an instructor) create it? Do you know what machine they created it on? You can find out about 'endian-ness' on Wikipedia (where else?) at Endianness. Your Mac is likely using an Intel chip and therefore uses little-endian order. AFAIK, even the M1 Macs use little-endian. But SPARC and PowerPC use big-endian order (though PowerPC is now switchable, and there is a PPC-LE version of Linux).

TP: It seems that the binary file may have been created on (the code may have been executed on) a "Sun SPARC computer." But it was implied that reading this binary file should work on Intel 64-bit x86 processors (there is no expectation that we should be using Sun SPARC computers, obviously).

JL: As I added to my previous comment, SPARC is a big-endian machine. So, you now need to find out why value is not being set by the read operation. Checking the return value from read() might be informative — if it isn't 4, there are problems. And yes, you can read the data on your Mac without problems, and you should then get something like the expected output.

TP: I'm only a beginner systems programmer, so this is very much outside of what I know. According to POSIX read(), read() returns an int, so I captured that value in a variable and printed it out. It prints out 0, which, according to the documentation, seems to be as expected. Why would we expect it to be 4?

调用 read() 时,您告诉它最多读取 N 个字节,而它的返回值是实际读取到的字节数。返回零个字节意味着“没有更多数据”,俗称 EOF 或“文件结尾”。当您要求它读取 4 个字节时,它应该返回 4,除非文件中剩余的数据不足 4 个字节。

JL: Do a hex dump of datafile: xxd datafile | sed 1q should produce 00000000: ffff ffce ffff ffcf ffff ffd0 ffff ffd1 (plus a series of 16 dots). If it doesn't you've got corrupted data. I wrote a program on my Mac to write the data in big-endian format (I work with a DBMS that uses big-endian format on disk, so I have the tools around), and then your reading code produces the expected output. So, I'm very puzzled about why you're getting the 255/0x000000ff output. Have you rechecked that you've not deleted any characters from your reading code. And do check the return value from read().

JL: Just to be clear: with the big-endian data file and the reading code in the question, I get the expected output on my Mac (this one is running Big Sur 11.6.5).

TP: As I said, read returned 0, so I think that's as expected. Using xxd datafile | sed 1q, I got 00000000: ff .. So a small number of characters, then a lot of whitespace, and then a .. It seems very odd to me.

JL: Your data file is corrupted. It should have 400 bytes (ls -l datafile). And read() should only return 0 when it reaches the end of the file; every other time, it should return 4 (sizeof(int32_t) or sizeof(value)). You get 255 because there is a single byte with value 0xFF aka 255 (and the computing gods smiled on you — or maybe cursed you — and set the other bytes of value to 0).

TP: You're right! The file size was 400 bytes when I downloaded it, but it is now 1 byte! I have no idea how that happened. I have now re-downloaded it, and it seems to be working as expected! […]

两点教训

  1. 检查返回值 — 尤其是 I/O 函数的返回值。
  2. 检查您的数据文件(使用 ls -l、xxd 等)以确保它们包含您期望的内容。

wr71le.c

该程序以“本机字节序”(native order)写入数据,在 Intel 机器上即 little-endian。

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/*
** Write the integers -50..49 to "datafile" as 32-bit values in the host's
** native byte order (100 values, 400 bytes in total).
**
** Returns 0 on success; returns 1 if the file cannot be opened, if any
** write() fails or is short, or if close() reports an error.
*/
int main(void)
{
    /*
    ** O_TRUNC discards any previous contents, so a longer pre-existing file
    ** cannot leave stale bytes after the data written here.
    */
    int fd = open("datafile", O_CREAT | O_WRONLY | O_TRUNC, 0600);

    if (fd < 0)
    {
        return 1;
    }

    int status = 0;

    for (int32_t i = -50; i < 50; i++)
    {
        /* write() may fail (-1) or transfer fewer bytes than requested. */
        if (write(fd, &i, sizeof(i)) != (ssize_t)sizeof(i))
        {
            status = 1;
            break;
        }
    }

    /* A failing close() on a file that was written to can mean lost data. */
    if (close(fd) != 0)
    {
        status = 1;
    }
    return status;
}

wr71be.c

无论运行它的机器本身是 big-endian 还是 little-endian,该程序都以 big-endian 顺序写入数据。

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

typedef int32_t Sint4;    // The st_sint4() code uses this type name

/*
**  Store a signed 4-byte integer into a 4-byte character array in
**  big-endian order (most significant byte first).
**
**  l - the value to store
**  s - destination buffer; must have room for sizeof(Sint4) bytes
**
**  The value is converted to uint32_t before shifting because right-shifting
**  a negative signed integer is implementation-defined in C, and the bytes
**  are stored through an unsigned char pointer so that values above CHAR_MAX
**  are never converted to a (possibly signed) plain char.
*/
static void st_sint4(Sint4 l, char *s)
{
    uint32_t bits = (uint32_t)l;
    unsigned char *out = (unsigned char *)s;

    /* Fill from the last byte (least significant) back to the first. */
    for (int i = (int)sizeof(Sint4) - 1; i >= 0; i--)
    {
        out[i] = (unsigned char)(bits & 0xFF);
        bits >>= 8;
    }
}

/*
** Write the integers -50..49 to "datafile" as 4-byte big-endian values,
** using st_sint4() to lay out the bytes (100 values, 400 bytes in total).
**
** Returns 0 on success; returns 1 if the file cannot be opened, if any
** write() fails or is short, or if close() reports an error.
*/
int main(void)
{
    /*
    ** O_TRUNC discards any previous contents, so a longer pre-existing file
    ** cannot leave stale bytes after the data written here.
    */
    int fd = open("datafile", O_CREAT | O_WRONLY | O_TRUNC, 0600);

    if (fd < 0)
    {
        return 1;
    }

    int status = 0;

    for (int32_t i = -50; i < 50; i++)
    {
        char data[4];

        st_sint4(i, data);
        /* write() may fail (-1) or transfer fewer bytes than requested. */
        if (write(fd, data, sizeof(data)) != (ssize_t)sizeof(data))
        {
            status = 1;
            break;
        }
    }

    /* A failing close() on a file that was written to can mean lost data. */
    if (close(fd) != 0)
    {
        status = 1;
    }
    return status;
}

xxd datafile 的输出:
00000000: ffff ffce ffff ffcf ffff ffd0 ffff ffd1  ................
00000010: ffff ffd2 ffff ffd3 ffff ffd4 ffff ffd5  ................
00000020: ffff ffd6 ffff ffd7 ffff ffd8 ffff ffd9  ................
00000030: ffff ffda ffff ffdb ffff ffdc ffff ffdd  ................
00000040: ffff ffde ffff ffdf ffff ffe0 ffff ffe1  ................
00000050: ffff ffe2 ffff ffe3 ffff ffe4 ffff ffe5  ................
00000060: ffff ffe6 ffff ffe7 ffff ffe8 ffff ffe9  ................
00000070: ffff ffea ffff ffeb ffff ffec ffff ffed  ................
00000080: ffff ffee ffff ffef ffff fff0 ffff fff1  ................
00000090: ffff fff2 ffff fff3 ffff fff4 ffff fff5  ................
000000a0: ffff fff6 ffff fff7 ffff fff8 ffff fff9  ................
000000b0: ffff fffa ffff fffb ffff fffc ffff fffd  ................
000000c0: ffff fffe ffff ffff 0000 0000 0000 0001  ................
000000d0: 0000 0002 0000 0003 0000 0004 0000 0005  ................
000000e0: 0000 0006 0000 0007 0000 0008 0000 0009  ................
000000f0: 0000 000a 0000 000b 0000 000c 0000 000d  ................
00000100: 0000 000e 0000 000f 0000 0010 0000 0011  ................
00000110: 0000 0012 0000 0013 0000 0014 0000 0015  ................
00000120: 0000 0016 0000 0017 0000 0018 0000 0019  ................
00000130: 0000 001a 0000 001b 0000 001c 0000 001d  ................
00000140: 0000 001e 0000 001f 0000 0020 0000 0021  ........... ...!
00000150: 0000 0022 0000 0023 0000 0024 0000 0025  ..."...#...$...%
00000160: 0000 0026 0000 0027 0000 0028 0000 0029  ...&...'...(...)
00000170: 0000 002a 0000 002b 0000 002c 0000 002d  ...*...+...,...-
00000180: 0000 002e 0000 002f 0000 0030 0000 0031  ......./...0...1

rd71.c

这按本机字节顺序读取。

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
** Read 100 32-bit integers from "datafile" in the host's native byte order
** and print each one, in decimal and in hexadecimal, next to the loop
** counter that runs from -50 to 49.
**
** Exits with EXIT_FAILURE if the file cannot be opened or if any read()
** does not deliver a complete value; returns 0 otherwise.
*/
int main(void)
{
    int fd = open("datafile", O_RDONLY, 0);

    if (fd < 0)
    {
        /* Report the failure instead of silently printing nothing. */
        perror("datafile");
        exit(EXIT_FAILURE);
    }

    for (int32_t i = -50; i < 50; i++)
    {
        int32_t value;

        /* read() returns -1 on error, 0 at EOF, or the byte count read. */
        if (read(fd, &value, sizeof(value)) != (ssize_t)sizeof(value))
        {
            fprintf(stderr, "Faulty read!\n");
            close(fd);
            exit(EXIT_FAILURE);
        }
        /* Cast so each argument has exactly the type its specifier expects. */
        printf("%4i\t0x%08x\t%10i\t0x%08x\n",
               (int)i, (unsigned int)i, (int)value, (unsigned int)value);
    }
    close(fd);
    return 0;
}

rd71 的输出:
 -50    0xffffffce  -822083585  0xceffffff
 -49    0xffffffcf  -805306369  0xcfffffff
 -48    0xffffffd0  -788529153  0xd0ffffff
 -47    0xffffffd1  -771751937  0xd1ffffff
 -46    0xffffffd2  -754974721  0xd2ffffff
 -45    0xffffffd3  -738197505  0xd3ffffff
 -44    0xffffffd4  -721420289  0xd4ffffff
 -43    0xffffffd5  -704643073  0xd5ffffff
 -42    0xffffffd6  -687865857  0xd6ffffff

…snip…

  40    0x00000028   671088640  0x28000000
  41    0x00000029   687865856  0x29000000
  42    0x0000002a   704643072  0x2a000000
  43    0x0000002b   721420288  0x2b000000
  44    0x0000002c   738197504  0x2c000000
  45    0x0000002d   754974720  0x2d000000
  46    0x0000002e   771751936  0x2e000000
  47    0x0000002f   788529152  0x2f000000
  48    0x00000030   805306368  0x30000000
  49    0x00000031   822083584  0x31000000