使用 pickle 序列化大型 NumPy 数组
Pickling large NumPy array
我有一个大型三维 NumPy 数组,想把它持久化保存下来。我的第一种方法是直接使用 pickle,但这似乎会导致一个含义不明的错误。
test_rand = np.random.random((100000,200,50))
with open('models/test.pkl', 'wb') as save_file:
pickle.dump(test_rand, save_file, -1)
---------------------------------------------------------------------------
error Traceback (most recent call last)
<ipython-input-18-511e30b08440> in <module>()
1 with open('models/test.pkl', 'wb') as save_file:
----> 2 pickle.dump(test_rand, save_file, -1)
3
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in dump(obj, file, protocol)
1368
1369 def dump(obj, file, protocol=None):
-> 1370 Pickler(file, protocol).dump(obj)
1371
1372 def dumps(obj, protocol=None):
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in dump(self, obj)
222 if self.proto >= 2:
223 self.write(PROTO + chr(self.proto))
--> 224 self.save(obj)
225 self.write(STOP)
226
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in save(self, obj)
329
330 # Save the reduce() output and finally memoize the object
--> 331 self.save_reduce(obj=obj, *rv)
332
333 def persistent_id(self, obj):
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in save_reduce(self, func, args, state, listitems, dictitems, obj)
417
418 if state is not None:
--> 419 save(state)
420 write(BUILD)
421
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in save(self, obj)
284 f = self.dispatch.get(t)
285 if f:
--> 286 f(self, obj) # Call unbound method with explicit self
287 return
288
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in save_tuple(self, obj)
560 write(MARK)
561 for element in obj:
--> 562 save(element)
563
564 if id(obj) in memo:
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in save(self, obj)
284 f = self.dispatch.get(t)
285 if f:
--> 286 f(self, obj) # Call unbound method with explicit self
287 return
288
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in save_string(self, obj, pack)
484 self.write(SHORT_BINSTRING + chr(n) + obj)
485 else:
--> 486 self.write(BINSTRING + pack("<i", n) + obj)
487 else:
488 self.write(STRING + repr(obj) + '\n')
error: integer out of range for 'i' format code
所以我的两个问题如下:
- 这个错误到底是怎么回事?
- 我应该如何将数组保存到磁盘?
我正在使用 Python 2.7.8 和 NumPy 1.9.0。
回答第一个问题,"What is actually going on in this error?",这是我的猜测。
Pickle 正在尝试将您的 NumPy 数组保存为打包的二进制数据。它将每个整数保存为四字节有符号整数(即 `i` 格式码)。然而,numpy.random.random 创建的是浮点数(应该是八字节的 `d`,而不是四字节的 `i`)。我不知道为什么 pickle 会这样做。也完全有可能这个 `i` 实际上是用来保存其他信息的,而不是数组中的某个值。我只是猜测,错误的产生是因为您数组中的值无法放进四个字节。
您使用的 Python 和 NumPy 是什么版本?
作为 pickle 的替代方案,特别是对于非常大的数据集,您可能希望考虑使用二进制数据格式(例如 HDF5)的 Python 接口(例如 h5py)。关于其优缺点的讨论,请参阅这个问题及其第一个回答。
关于 #1,这是一个错误……而且是一个旧错误。这里有一个很有启发性的讨论,尽管出人意料地古老:http://python.6.x6.nabble.com/test-gzip-test-tarfile-failure-om-AMD64-td1830323.html
错误原因在这里:http://www.littleredbat.net/mk/files/grimoire.html#contents_item_2.1
The simplest and most basic type are integers, which are represented
as a C long. Their size is therefore dependent on the platform you're
using; on a 32-bit machine, they can range from -2147483647 to
2147483647. Python programs can determine the highest possible value for an integer by looking at sys.maxint; the lowest possible value
will usually be -sys.maxint - 1.
这个错误并不常见,因为大多数人在面对非常大的 numpy 数组时,会使用 np.save 或 np.savez,以利用 numpy 数组的精简 pickle 格式(请参阅 numpy 数组的 __reduce__ 方法,np.save 在幕后调用的正是它)。
为了说明问题仅仅在于数组对 pickle 来说太大了……
>>> import numpy as np
>>> import pickle
>>> test_rand = np.random.random((100000,200,50))
>>> x = pickle.dumps(test_rand[:20000], -1)
>>> x = pickle.dumps(test_rand[:30000], -1)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/mmckerns/lib/python2.7/site-packages/dill-0.2.3.dev0-py2.7.egg/dill/dill.py", line 194, in dumps
dump(obj, file, protocol, byref, fmode)#, strictio)
File "/Users/mmckerns/lib/python2.7/site-packages/dill-0.2.3.dev0-py2.7.egg/dill/dill.py", line 184, in dump
pik.dump(obj)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 224, in dump
self.save(obj)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/Users/mmckerns/lib/python2.7/site-packages/dill-0.2.3.dev0-py2.7.egg/dill/dill.py", line 181, in save_numpy_array
pik.save_reduce(_create_array, (f, args, state, npdict), obj=obj)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 401, in save_reduce
save(args)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 562, in save_tuple
save(element)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 562, in save_tuple
save(element)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 486, in save_string
self.write(BINSTRING + pack("<i", n) + obj)
struct.error: 'i' format requires -2147483648 <= number <= 2147483647
>>>
但是,下面这种方式对整个数组都有效……
>>> x = test_rand.__reduce__()
>>> type(x)
<type 'tuple'>
>>> x[0]
<built-in function _reconstruct>
>>> x[1]
(<type 'numpy.ndarray'>, (0,), 'b')
>>> x[2][0:3]
(1, (100000, 200, 50), dtype('float64'))
>>> len(x[2][4])
8000000000
>>> x[2][4][:100]
'Y\xa4}\xdf\x84\xdf\xe1?\xfe\x1fd\xe3\xf2\xab\xe2?\x80\xe4\xfe\x17\xfb\xd6\xc2?\xd73\x92\xc9N]\xe8?\x90\xbc\xe3@\xdcO\xc9?\x18\x9dX\x12MG\xc4?(\x0f\x8f\xf9}\xf6\xb1?\xd0\x90O\xe2\x9b\xf1\xed?_\x99\x06\xacY\x9e\xe2?\xe7\xf8\x15\xa8\x13\x91\xe2?\x96}\xffH\xda\xc3\xd4?@\t\xae_"\xe0\xda?y<%\x8a'
如果你想烧坏你的风扇,可以执行 print x。
您还会注意到,x[0] 中的函数与数据一起保存。这是一个独立的函数,可以从 pickle 数据中重新生成 numpy 数组。
我有一个大型三维 NumPy 数组,想把它持久化保存下来。我的第一种方法是直接使用 pickle,但这似乎会导致一个含义不明的错误。
test_rand = np.random.random((100000,200,50))
with open('models/test.pkl', 'wb') as save_file:
pickle.dump(test_rand, save_file, -1)
---------------------------------------------------------------------------
error Traceback (most recent call last)
<ipython-input-18-511e30b08440> in <module>()
1 with open('models/test.pkl', 'wb') as save_file:
----> 2 pickle.dump(test_rand, save_file, -1)
3
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in dump(obj, file, protocol)
1368
1369 def dump(obj, file, protocol=None):
-> 1370 Pickler(file, protocol).dump(obj)
1371
1372 def dumps(obj, protocol=None):
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in dump(self, obj)
222 if self.proto >= 2:
223 self.write(PROTO + chr(self.proto))
--> 224 self.save(obj)
225 self.write(STOP)
226
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in save(self, obj)
329
330 # Save the reduce() output and finally memoize the object
--> 331 self.save_reduce(obj=obj, *rv)
332
333 def persistent_id(self, obj):
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in save_reduce(self, func, args, state, listitems, dictitems, obj)
417
418 if state is not None:
--> 419 save(state)
420 write(BUILD)
421
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in save(self, obj)
284 f = self.dispatch.get(t)
285 if f:
--> 286 f(self, obj) # Call unbound method with explicit self
287 return
288
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in save_tuple(self, obj)
560 write(MARK)
561 for element in obj:
--> 562 save(element)
563
564 if id(obj) in memo:
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in save(self, obj)
284 f = self.dispatch.get(t)
285 if f:
--> 286 f(self, obj) # Call unbound method with explicit self
287 return
288
C:\Users\g1dak02\AppData\Local\Continuum\Anaconda\lib\pickle.pyc in save_string(self, obj, pack)
484 self.write(SHORT_BINSTRING + chr(n) + obj)
485 else:
--> 486 self.write(BINSTRING + pack("<i", n) + obj)
487 else:
488 self.write(STRING + repr(obj) + '\n')
error: integer out of range for 'i' format code
所以我的两个问题如下:
- 这个错误到底是怎么回事?
- 我应该如何将数组保存到磁盘?
我正在使用 Python 2.7.8 和 NumPy 1.9.0。
回答第一个问题,"What is actually going on in this error?",这是我的猜测。
Pickle 正在尝试将您的 NumPy 数组保存为打包的二进制数据。它将每个整数保存为四字节有符号整数(即 `i` 格式码)。然而,numpy.random.random 创建的是浮点数(应该是八字节的 `d`,而不是四字节的 `i`)。我不知道为什么 pickle 会这样做。也完全有可能这个 `i` 实际上是用来保存其他信息的,而不是数组中的某个值。我只是猜测,错误的产生是因为您数组中的值无法放进四个字节。
您使用的 Python 和 NumPy 是什么版本?
作为 pickle 的替代方案,特别是对于非常大的数据集,您可能希望考虑使用二进制数据格式(例如 HDF5)的 Python 接口(例如 h5py)。关于其优缺点的讨论,请参阅这个问题及其第一个回答。
关于 #1,这是一个错误……而且是一个旧错误。这里有一个很有启发性的讨论,尽管出人意料地古老:http://python.6.x6.nabble.com/test-gzip-test-tarfile-failure-om-AMD64-td1830323.html
错误原因在这里:http://www.littleredbat.net/mk/files/grimoire.html#contents_item_2.1
The simplest and most basic type are integers, which are represented as a C long. Their size is therefore dependent on the platform you're using; on a 32-bit machine, they can range from -2147483647 to 2147483647. Python programs can determine the highest possible value for an integer by looking at sys.maxint; the lowest possible value will usually be -sys.maxint - 1.
这个错误并不常见,因为大多数人在面对非常大的 numpy 数组时,会使用 np.save 或 np.savez,以利用 numpy 数组的精简 pickle 格式(请参阅 numpy 数组的 __reduce__ 方法,np.save 在幕后调用的正是它)。
为了说明问题仅仅在于数组对 pickle 来说太大了……
>>> import numpy as np
>>> import pickle
>>> test_rand = np.random.random((100000,200,50))
>>> x = pickle.dumps(test_rand[:20000], -1)
>>> x = pickle.dumps(test_rand[:30000], -1)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/mmckerns/lib/python2.7/site-packages/dill-0.2.3.dev0-py2.7.egg/dill/dill.py", line 194, in dumps
dump(obj, file, protocol, byref, fmode)#, strictio)
File "/Users/mmckerns/lib/python2.7/site-packages/dill-0.2.3.dev0-py2.7.egg/dill/dill.py", line 184, in dump
pik.dump(obj)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 224, in dump
self.save(obj)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/Users/mmckerns/lib/python2.7/site-packages/dill-0.2.3.dev0-py2.7.egg/dill/dill.py", line 181, in save_numpy_array
pik.save_reduce(_create_array, (f, args, state, npdict), obj=obj)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 401, in save_reduce
save(args)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 562, in save_tuple
save(element)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 562, in save_tuple
save(element)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.py", line 486, in save_string
self.write(BINSTRING + pack("<i", n) + obj)
struct.error: 'i' format requires -2147483648 <= number <= 2147483647
>>>
但是,下面这种方式对整个数组都有效……
>>> x = test_rand.__reduce__()
>>> type(x)
<type 'tuple'>
>>> x[0]
<built-in function _reconstruct>
>>> x[1]
(<type 'numpy.ndarray'>, (0,), 'b')
>>> x[2][0:3]
(1, (100000, 200, 50), dtype('float64'))
>>> len(x[2][4])
8000000000
>>> x[2][4][:100]
'Y\xa4}\xdf\x84\xdf\xe1?\xfe\x1fd\xe3\xf2\xab\xe2?\x80\xe4\xfe\x17\xfb\xd6\xc2?\xd73\x92\xc9N]\xe8?\x90\xbc\xe3@\xdcO\xc9?\x18\x9dX\x12MG\xc4?(\x0f\x8f\xf9}\xf6\xb1?\xd0\x90O\xe2\x9b\xf1\xed?_\x99\x06\xacY\x9e\xe2?\xe7\xf8\x15\xa8\x13\x91\xe2?\x96}\xffH\xda\xc3\xd4?@\t\xae_"\xe0\xda?y<%\x8a'
如果你想烧坏你的风扇,可以执行 print x。
您还会注意到,x[0] 中的函数与数据一起保存。这是一个独立的函数,可以从 pickle 数据中重新生成 numpy 数组。