使用 numpy.fromfile 读取文件时如何跳过行?
How to skip lines when reading a file with numpy.fromfile?
我正在阅读一个 .pksc
文件,其中包含大量天体的坐标和速度。我正在阅读
import numpy as np
# NOTE(review): the file is binary, so it should be opened with mode 'rb';
# `N` is not defined in this snippet — presumably the number of records to read.
f=open('halos_10x10.pksc')
data = np.fromfile(f,count=N*10,dtype=np.float32)
可以找到文件here。它非常大,我想跳过前 m
个对象(如果文件中有行,则与这些对象对应的前 m
行)。我该怎么做,我看不到跳过的选项?可选地,能够从文件中跳过最后的 k
个对象也很好。谢谢!
首先要说明的是:PKSC 文件是二进制的,它是一个连续的字节串,数据中没有明显的分隔。
另一方面,文本文件的行由一些 line-break 字符明确分隔,因此很容易阅读 a line一次并忽略前面的M行,然后读取你关心的剩余行数:REMAINING_LINES = ALL_LINES - M_LINES - K_LINES
.
np.fromfile()
一次读取二进制文件一个项.
为此,它需要 dtype=
参数来告诉 reader 项目有多大。对于 PKSC 文件,我们将项目表示为 32 位整数,np.int32
.
我搜索了又搜索,但找不到该文件的规范。幸运的是,您提供的 link 有一个示例 Python 脚本用于读取文件;我还找到了一个 well-documented Python 库来处理这些类型的文件(websk.py,linked below)。
我了解到 PKSC 文件具有以下属性:
- 前 3 项是 header 项:
- 第一个 header 项是文件中相关数据记录的条数(记录计数)
- 每条相关数据记录包含 10 项
np.fromfile()
也将 count=
参数作为要读取多少项目的指令。
以下是读取 3 header 项、获取后面的 Halo 记录总数以及读取前两条记录(每条记录 10 项)的方法:
Nitems_per_record = 10  # each Halo record holds 10 items

# Open in binary mode: PKSC is a binary format, and the default text mode
# would corrupt reads on platforms that translate line endings (e.g. Windows).
# The `with` block also guarantees the file is closed.
with open('halos_10x10.pksc', 'rb') as f:
    # The first 3 items are the header; header[0] is the record count.
    headers = np.fromfile(f, dtype=np.int32, count=3)
    print(f'Headers: {headers}')
    print(f'This file contains {headers[0]} records with Halo data')
    # Each fromfile() call continues from the current file position,
    # so consecutive reads walk through the records in order.
    record1 = np.fromfile(f, dtype=np.int32, count=Nitems_per_record)
    print(f'First record:\n{record1}')
    record2 = np.fromfile(f, dtype=np.int32, count=Nitems_per_record)
    print(f'Second record:\n{record2}')
Headers: [2079516 2079516 2079516]
This file contains 2079516 records with Halo data
First record:
[ 1170060708 -1011158654 -1006515961 -1022926100 1121164875 1110446585 1086444250 1170064687 -1011110709 -1006510502]
Second record:
[ 1170083367 -1013908122 -1006498824 -1014626384 -1020456945 -1033004197 1084104229 1170090354 -1013985376 -1006510502]
根据websky.py,第2项和第3项header也有相关的值,也许你也关心这些?我从该代码中合成了以下内容:
RTHMAXin = headers[1]
redshiftbox = headers[2]
一次读取多个记录需要re-shaping数据。要读取 3 条记录:
# Re-open in binary mode so we read from the start of the file again.
with open('halos_10x10.pksc', 'rb') as f:
    # Read, but discard, the 3 header items to move past the header.
    np.fromfile(f, dtype=np.int32, count=3)
    # Read 3 records' worth of items as one flat array...
    three_records = np.fromfile(f, dtype=np.int32, count=3 * Nitems_per_record)
    print(f'Initial:\n{three_records}')

# ...then reshape to one row per record.
reshaped_records = np.reshape(three_records, (3, Nitems_per_record))
# BUG FIX: the original printed the undefined name `reshaped` (NameError);
# the variable is `reshaped_records`.
print(f'Re-shaped:\n{reshaped_records}')
Initial:
[ 1170060708 -1011158654 -1006515961 -1022926100 1121164875 1110446585
1086444250 1170064687 -1011110709 -1006510502 1170083367 -1013908122
-1006498824 -1014626384 -1020456945 -1033004197 1084104229 1170090354
-1013985376 -1006510502 1169622353 -1009409432 -1006678295 -1045415727
-1017794908 -1051267742 1084874393 1169623221 -1009509109 -1006675510]
Re-shaped:
[[ 1170060708 -1011158654 -1006515961 -1022926100 1121164875 1110446585 1086444250 1170064687 -1011110709 -1006510502]
[ 1170083367 -1013908122 -1006498824 -1014626384 -1020456945 -1033004197 1084104229 1170090354 -1013985376 -1006510502]
[ 1169622353 -1009409432 -1006678295 -1045415727 -1017794908 -1051267742 1084874393 1169623221 -1009509109 -1006675510]]
那么,跳过呢?
只是 trim 重塑后的数据
最简单的做法是读取所有数据,然后 trim 从正面和背面读取您不想要的数据:
# Drop unwanted records by slicing the reshaped array:
# skip `m` records from the front and one record from the back.
m = 1   # number of leading records to discard
k = -1  # negative index: stop one record before the end
trimmed_records = reshaped_records[m:k]
print(f'Trimmed:\n{trimmed_records}')
Trimmed:
[[ 1170083367 -1013908122 -1006498824 -1014626384 -1020456945 -1033004197 1084104229 1170090354 -1013985376 -1006510502]]
我不确定你为什么要跳过,但这是最容易理解和实施的。
如果您的记忆力不足,请继续阅读。
丢弃M条记录,少读K+M
条记录
在我看来,下一个选择是:
- 从第一个 header
中获取记录数(A
条记录)
- 阅读并忽略
M
记录
- 计算出你还需要阅读多少条记录,前提是你已经阅读了
M
条记录并且想在记录K
处停下来: R = A - M - K
忽略 M
条记录只会节省一点内存;数据仍在读取和解释。如果不读取记录 K
到最后,你肯定会节省内存:
# Open in binary mode: PKSC is a binary format, not text.
with open('halos_10x10.pksc', 'rb') as f:
    headers = np.fromfile(f, dtype=np.int32, count=3)
    Arecords = headers[0]   # total records in the file, from the header
    Mrecords = 1_000_000    # records to skip at the front
    Krecords = 1_000_000    # records to skip at the back

    # Read and discard the first M records; fromfile() advances the file
    # position, so the next read starts at record M.
    Nitems = Mrecords * Nitems_per_record
    np.fromfile(f, dtype=np.int32, count=Nitems)

    # Remaining records to keep: everything except the skipped front/back.
    Rrecords = Arecords - Mrecords - Krecords
    Nitems = Rrecords * Nitems_per_record
    data = np.fromfile(f, dtype=np.int32, count=Nitems)

data = np.reshape(data, (Rrecords, Nitems_per_record))
print(f'From {Arecords} to {Rrecords} records:\n{data.shape}')
From 2079516 to 79516 records:
(79516, 10)
如果您只需要将大文件分成小文件,以便您可以独立操作它们:
import numpy as np

Nrecords_per_chunk = 100_000   # records written to each output file
Nitems_per_record = 10         # items in one Halo record

with open('halos_10x10.pksc', 'rb') as f_in:
    headers = np.fromfile(f_in, dtype=np.int32, count=3)
    Nitems = Nrecords_per_chunk * Nitems_per_record
    fnumber = 1
    while True:
        items = np.fromfile(f_in, dtype=np.int32, count=Nitems)
        # At the end of the file we very likely get back fewer items than we
        # asked for; floor-divide to count only the complete records.
        Nrecords_read = items.shape[0] // Nitems_per_record
        # End of file: chunk size was a perfect multiple of the record count.
        if Nrecords_read == 0:
            break
        # Slice off any trailing partial record so the reshape cannot fail.
        records = np.reshape(items[:Nrecords_read * Nitems_per_record],
                             (Nrecords_read, Nitems_per_record))
        with open(f'halos_{fnumber}.pksc', 'wb') as f_out:
            # Keep the same format: 3 "header" items, each set to the
            # record count of this chunk.
            new_headers = np.array([Nrecords_read] * 3, dtype=np.int32)
            new_headers.tofile(f_out)
            records.tofile(f_out)
        # A short read means we just consumed the end of the file.
        if Nrecords_read < Nrecords_per_chunk:
            break
        fnumber += 1
# Test that first 100_000 records from the main file match the records from the first chunked file
f_in = open('halos_10x10.pksc')
np.fromfile(f_in, dtype=np.int32, count=3)
Nitems = Nrecords_per_chunk * Nitems_per_record
items = np.fromfile(f_in, dtype=np.int32, count=Nitems)
records_orig = np.reshape(items, (Nrecords_per_chunk, Nitems_per_record))
f_in.close()
f_in = open('halos_1.pksc')
np.fromfile(f_in, dtype=np.int32, count=3)
Nitems = Nrecords_per_chunk * Nitems_per_record
items = np.fromfile(f_in, dtype=np.int32, count=Nitems)
records_chunked = np.reshape(items, (Nrecords_per_chunk, Nitems_per_record))
f_in.close()
assert np.array_equal(records_orig, records_chunked)
我正在阅读一个 .pksc
文件,其中包含大量天体的坐标和速度。我正在阅读
import numpy as np
# NOTE(review): the file is binary, so it should be opened with mode 'rb';
# `N` is not defined in this snippet — presumably the number of records to read.
f=open('halos_10x10.pksc')
data = np.fromfile(f,count=N*10,dtype=np.float32)
可以找到文件here。它非常大,我想跳过前 m
个对象(如果文件中有行,则与这些对象对应的前 m
行)。我该怎么做,我看不到跳过的选项?可选地,能够从文件中跳过最后的 k
个对象也很好。谢谢!
首先要说明的是:PKSC 文件是二进制的,它是一个连续的字节串,数据中没有明显的分隔。
另一方面,文本文件的行由一些 line-break 字符明确分隔,因此很容易阅读 a line一次并忽略前面的M行,然后读取你关心的剩余行数:REMAINING_LINES = ALL_LINES - M_LINES - K_LINES
.
np.fromfile()
一次读取二进制文件一个项.
为此,它需要 dtype=
参数来告诉 reader 项目有多大。对于 PKSC 文件,我们将项目表示为 32 位整数,np.int32
.
我搜索了又搜索,但找不到该文件的规范。幸运的是,您提供的 link 有一个示例 Python 脚本用于读取文件;我还找到了一个 well-documented Python 库来处理这些类型的文件(websk.py,linked below)。
我了解到 PKSC 文件具有以下属性:
- 前 3 项是 header 项:
- 第一个 header 项是文件中相关数据记录的条数(记录计数)
- 每条相关数据记录包含 10 项
np.fromfile()
也将 count=
参数作为要读取多少项目的指令。
以下是读取 3 header 项、获取后面的 Halo 记录总数以及读取前两条记录(每条记录 10 项)的方法:
Nitems_per_record = 10  # each Halo record holds 10 items

# Open in binary mode: PKSC is a binary format, and the default text mode
# would corrupt reads on platforms that translate line endings (e.g. Windows).
# The `with` block also guarantees the file is closed.
with open('halos_10x10.pksc', 'rb') as f:
    # The first 3 items are the header; header[0] is the record count.
    headers = np.fromfile(f, dtype=np.int32, count=3)
    print(f'Headers: {headers}')
    print(f'This file contains {headers[0]} records with Halo data')
    # Each fromfile() call continues from the current file position,
    # so consecutive reads walk through the records in order.
    record1 = np.fromfile(f, dtype=np.int32, count=Nitems_per_record)
    print(f'First record:\n{record1}')
    record2 = np.fromfile(f, dtype=np.int32, count=Nitems_per_record)
    print(f'Second record:\n{record2}')
Headers: [2079516 2079516 2079516]
This file contains 2079516 records with Halo data
First record:
[ 1170060708 -1011158654 -1006515961 -1022926100 1121164875 1110446585 1086444250 1170064687 -1011110709 -1006510502]
Second record:
[ 1170083367 -1013908122 -1006498824 -1014626384 -1020456945 -1033004197 1084104229 1170090354 -1013985376 -1006510502]
根据websky.py,第2项和第3项header也有相关的值,也许你也关心这些?我从该代码中合成了以下内容:
RTHMAXin = headers[1]
redshiftbox = headers[2]
一次读取多个记录需要re-shaping数据。要读取 3 条记录:
# Re-open in binary mode so we read from the start of the file again.
with open('halos_10x10.pksc', 'rb') as f:
    # Read, but discard, the 3 header items to move past the header.
    np.fromfile(f, dtype=np.int32, count=3)
    # Read 3 records' worth of items as one flat array...
    three_records = np.fromfile(f, dtype=np.int32, count=3 * Nitems_per_record)
    print(f'Initial:\n{three_records}')

# ...then reshape to one row per record.
reshaped_records = np.reshape(three_records, (3, Nitems_per_record))
# BUG FIX: the original printed the undefined name `reshaped` (NameError);
# the variable is `reshaped_records`.
print(f'Re-shaped:\n{reshaped_records}')
Initial:
[ 1170060708 -1011158654 -1006515961 -1022926100 1121164875 1110446585
1086444250 1170064687 -1011110709 -1006510502 1170083367 -1013908122
-1006498824 -1014626384 -1020456945 -1033004197 1084104229 1170090354
-1013985376 -1006510502 1169622353 -1009409432 -1006678295 -1045415727
-1017794908 -1051267742 1084874393 1169623221 -1009509109 -1006675510]
Re-shaped:
[[ 1170060708 -1011158654 -1006515961 -1022926100 1121164875 1110446585 1086444250 1170064687 -1011110709 -1006510502]
[ 1170083367 -1013908122 -1006498824 -1014626384 -1020456945 -1033004197 1084104229 1170090354 -1013985376 -1006510502]
[ 1169622353 -1009409432 -1006678295 -1045415727 -1017794908 -1051267742 1084874393 1169623221 -1009509109 -1006675510]]
那么,跳过呢?
只是 trim 重塑后的数据
最简单的做法是读取所有数据,然后 trim 从正面和背面读取您不想要的数据:
# Drop unwanted records by slicing the reshaped array:
# skip `m` records from the front and one record from the back.
m = 1   # number of leading records to discard
k = -1  # negative index: stop one record before the end
trimmed_records = reshaped_records[m:k]
print(f'Trimmed:\n{trimmed_records}')
Trimmed:
[[ 1170083367 -1013908122 -1006498824 -1014626384 -1020456945 -1033004197 1084104229 1170090354 -1013985376 -1006510502]]
我不确定你为什么要跳过,但这是最容易理解和实施的。
如果您的记忆力不足,请继续阅读。
丢弃M条记录,少读K+M
条记录
在我看来,下一个选择是:
- 从第一个 header 中获取记录数(A
条记录)
- 阅读并忽略
M
条记录
- 计算出你还需要阅读多少条记录,前提是你已经阅读了
M
条记录并且想在记录K
处停下来:R = A - M - K
忽略 M
条记录只会节省一点内存;数据仍在读取和解释。如果不读取记录 K
到最后,你肯定会节省内存:
# Open in binary mode: PKSC is a binary format, not text.
with open('halos_10x10.pksc', 'rb') as f:
    headers = np.fromfile(f, dtype=np.int32, count=3)
    Arecords = headers[0]   # total records in the file, from the header
    Mrecords = 1_000_000    # records to skip at the front
    Krecords = 1_000_000    # records to skip at the back

    # Read and discard the first M records; fromfile() advances the file
    # position, so the next read starts at record M.
    Nitems = Mrecords * Nitems_per_record
    np.fromfile(f, dtype=np.int32, count=Nitems)

    # Remaining records to keep: everything except the skipped front/back.
    Rrecords = Arecords - Mrecords - Krecords
    Nitems = Rrecords * Nitems_per_record
    data = np.fromfile(f, dtype=np.int32, count=Nitems)

data = np.reshape(data, (Rrecords, Nitems_per_record))
print(f'From {Arecords} to {Rrecords} records:\n{data.shape}')
From 2079516 to 79516 records:
(79516, 10)
如果您只需要将大文件分成小文件,以便您可以独立操作它们:
import numpy as np

Nrecords_per_chunk = 100_000   # records written to each output file
Nitems_per_record = 10         # items in one Halo record

with open('halos_10x10.pksc', 'rb') as f_in:
    headers = np.fromfile(f_in, dtype=np.int32, count=3)
    Nitems = Nrecords_per_chunk * Nitems_per_record
    fnumber = 1
    while True:
        items = np.fromfile(f_in, dtype=np.int32, count=Nitems)
        # At the end of the file we very likely get back fewer items than we
        # asked for; floor-divide to count only the complete records.
        Nrecords_read = items.shape[0] // Nitems_per_record
        # End of file: chunk size was a perfect multiple of the record count.
        if Nrecords_read == 0:
            break
        # Slice off any trailing partial record so the reshape cannot fail.
        records = np.reshape(items[:Nrecords_read * Nitems_per_record],
                             (Nrecords_read, Nitems_per_record))
        with open(f'halos_{fnumber}.pksc', 'wb') as f_out:
            # Keep the same format: 3 "header" items, each set to the
            # record count of this chunk.
            new_headers = np.array([Nrecords_read] * 3, dtype=np.int32)
            new_headers.tofile(f_out)
            records.tofile(f_out)
        # A short read means we just consumed the end of the file.
        if Nrecords_read < Nrecords_per_chunk:
            break
        fnumber += 1
# Test that first 100_000 records from the main file match the records from the first chunked file
# Test that the first 100_000 records from the main file match the records
# from the first chunked file.
def _read_records(path, nrecords):
    """Read `nrecords` records from `path`, skipping the 3-item header."""
    with open(path, 'rb') as f:                  # binary mode for binary data
        np.fromfile(f, dtype=np.int32, count=3)  # read and discard the header
        items = np.fromfile(f, dtype=np.int32,
                            count=nrecords * Nitems_per_record)
    return np.reshape(items, (nrecords, Nitems_per_record))

records_orig = _read_records('halos_10x10.pksc', Nrecords_per_chunk)
records_chunked = _read_records('halos_1.pksc', Nrecords_per_chunk)
assert np.array_equal(records_orig, records_chunked)