Writing Python objects to disk without loading them into memory?
I am running a large computation and want to save its results to disk one item at a time, because the full data set is too large to hold in memory. I tried saving it with shelve, but got this error:
HASH: Out of overflow pages. Increase page size
My code is below. What is the correct way to do this in Python? pickle loads the whole object into memory. shelve does write to disk, but it forces a dictionary structure whose number of keys is limited. The final data I am saving is simply a list and does not need to be a dictionary; I only need to be able to read it back one item at a time.
import shelve

def my_data():
    # this is a generator that yields data points
    for n in xrange(very_large_number):
        yield data_point

def save_result():
    db = shelve.open("result")
    n = 0
    for data in my_data():
        # result is a Python object (a tuple)
        result = compute(data)
        # now save result to disk
        db[str(n)] = result
        n += 1
    db.close()
The following program demonstrates how you might carry out the process you describe in your question. It simulates the creation, writing, reading, and processing of data along the lines of what your application would need to replicate. In its default form, the code generates roughly 32 GB of data and writes it to disk. After some experimentation, enabling gzip compression gave good speed and shrank the file to about 195 MB. You will want to adapt the example to your problem, and you may find through trial and error that a different compression scheme suits your situation better.
#! /usr/bin/env python3
import os
import pickle

# Uncomment one of these imports to enable file compression:
# from bz2 import open
# from gzip import open
# from lzma import open

DATA_FILE = 'results.dat'
KB = 1 << 10
MB = 1 << 20
GB = 1 << 30
TB = 1 << 40


def main():
    """Demonstrate saving data to and loading data from a file."""
    save_data(develop_data())
    analyze_data(load_data())


def develop_data():
    """Create some sample data that can be saved for later processing."""
    return (os.urandom(1 * KB) * (1 * MB // KB) for _ in range(32 * GB // MB))


def save_data(data):
    """Take in all data and save it for retrieval later on."""
    with open(DATA_FILE, 'wb') as file:
        for obj in data:
            pickle.dump(obj, file, pickle.HIGHEST_PROTOCOL)


def load_data():
    """Load each item that was previously written to disk."""
    with open(DATA_FILE, 'rb') as file:
        try:
            while True:
                yield pickle.load(file)
        except EOFError:
            pass


def analyze_data(data):
    """Pretend to do something useful with each object that was loaded."""
    for obj in data:
        print(hash(obj))


if __name__ == '__main__':
    main()
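If shadowing the built-in open with the compression imports feels too implicit, the same idea can be spelled out with gzip.open. The sketch below is only an illustration of how the approach above might be grafted onto the loop from the question; my_data and compute are the question's placeholders, and save_results/load_results are names invented here.

import gzip
import pickle

def save_results(path='results.dat.gz'):
    """Append each computed result to a compressed pickle stream on disk."""
    with gzip.open(path, 'wb') as file:
        for data in my_data():          # generator from the question
            result = compute(data)      # placeholder from the question
            pickle.dump(result, file, pickle.HIGHEST_PROTOCOL)

def load_results(path='results.dat.gz'):
    """Yield the saved results one at a time, never all at once."""
    with gzip.open(path, 'rb') as file:
        while True:
            try:
                yield pickle.load(file)
            except EOFError:
                return

Each pickle.dump appends one object to the same file, so memory use stays bounded by a single result at a time on both the writing and the reading side.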
It is easy if you use klepto, which lets you transparently store objects in files or a database. First, I show direct use of the archive backend (i.e., writing directly to disk).
>>> import klepto
>>> db = klepto.archives.dir_archive('db', serialized=True, cached=False)
>>> db['n'] = 69
>>> db['add'] = lambda x,y: x+y
>>> db['x'] = 42
>>> db['y'] = 11
>>> db['sub'] = lambda x,y: y-x
>>>
Then we restart the interpreter and create a new connection to the "database" on disk.
Python 2.7.11 (default, Dec 5 2015, 23:50:48)
[GCC 4.2.1 Compatible Apple LLVM 5.1 (clang-503.0.40)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import klepto
>>> db = klepto.archives.dir_archive('db', serialized=True, cached=False)
>>> db
dir_archive('db', {'y': 11, 'x': 42, 'add': <function <lambda> at 0x10e500d70>, 'sub': <function <lambda> at 0x10e500de8>, 'n': 69}, cached=False)
>>>
Or you can create a new connection that uses a memory proxy. Below, I show loading only the desired entries into memory.
Python 2.7.11 (default, Dec 5 2015, 23:50:48)
[GCC 4.2.1 Compatible Apple LLVM 5.1 (clang-503.0.40)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import klepto
>>> db = klepto.archives.dir_archive('db', serialized=True, cached=True)
>>> db
dir_archive('db', {}, cached=True)
>>> db.load('x', 'y') # read multiple
>>> db.load('add') # read one at a time
>>> db
dir_archive('db', {'y': 11, 'x': 42, 'add': <function <lambda> at 0x1079e7d70>}, cached=True)
>>> db['result'] = db['add'](db['x'],db['y'])
>>> db['result']
53
>>>
…or you can likewise dump new entries to disk.
>>> db.dump('result')
>>>
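For completeness, here is a rough sketch of how the same one-result-at-a-time loop from the question might look with klepto's dir_archive, assuming (as the transcript above suggests) that entries are read from and written to disk directly when cached=False; my_data and compute are again the question's placeholders.

import klepto

def save_results():
    """Write each computed result straight to disk, keyed by its index."""
    db = klepto.archives.dir_archive('results', serialized=True, cached=False)
    for n, data in enumerate(my_data()):   # generator from the question
        db[str(n)] = compute(data)         # written to disk immediately when cached=False

def load_result(n):
    """Read back a single result by index without touching the others."""
    db = klepto.archives.dir_archive('results', serialized=True, cached=False)
    return db[str(n)]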