Efficiently serialize/deserialize a SparseDataFrame
Has anyone efficiently serialized/deserialized a pandas SparseDataFrame?
import pandas as pd
import scipy
from scipy import sparse
dfs = pd.SparseDataFrame(scipy.sparse.random(1000, 1000).toarray())
# just for testing
pickle is not the answer
It is painfully slow.
import pickle, time
start = time.time()
# serialization
msg = list(pickle.dumps(dfs, protocol=pickle.HIGHEST_PROTOCOL))
# deserialization
dfs = pickle.loads(bytes(msg))
stop = time.time()
stop - start
# 0.4420337677001953
# This is with Python 3.5 so it's using cPickle
For comparison, msgpack on the dense version is much faster:
df = dfs.to_dense()
start = time.time()
# serialization
msg = list(df.to_msgpack(compress='zlib'))
# deserialization
df = pd.read_msgpack(bytes(msg))
stop = time.time()
stop - start
# 0.09514737129211426
msgpack
Msgpack would be the answer, but I cannot find an implementation for SparseDataFrame (related):
# serialization
dfs.to_msgpack(compress='zlib')
# Returns: NotImplementedError: msgpack sparse frame is not implemented
Coordinate format via scipy.sparse.coo_matrix
msgpack on the coordinate format seems worth considering, but the conversion to scipy.sparse.coo_matrix is slow:
import msgpack
from scipy.sparse import coo_matrix
start = time.time()
# serialization
columns = dfs.columns
shape = dfs.shape
start_to_coo = time.time()
dfc = dfs.to_coo()
stop_to_coo = time.time()
start_comprehension = time.time()
row = [x.item() for x in df.row]    # NB: bug, should be dfc.row -- see the correction below
col = [x.item() for x in df.col]
data = [x.item() for x in df.data]
stop_comprehension = time.time()
start_packing = time.time()
msg = list(msgpack.packb({'columns':list(columns), 'shape':shape, 'row':row, 'col':col, 'data':data}))
stop_packing = time.time()
# deserialization
start_unpacking = time.time()
unpacked = msgpack.unpackb(bytes(msg))  # avoid shadowing the builtin dict
stop_unpacking = time.time()
columns = unpacked[b'columns']
index = range(unpacked[b'shape'][0])
dfc = coo_matrix((unpacked[b'data'], (unpacked[b'row'], unpacked[b'col'])), shape=unpacked[b'shape'])
stop = time.time()
print('total: ' + str(stop - start))
print(' to_coo: ' + str(stop_to_coo - start_to_coo))
print(' comprehension: ' + str(stop_comprehension - start_comprehension))
print(' packing: ' + str(stop_packing - start_packing))
print(' unpacking: ' + str(stop_unpacking - start_unpacking))
# total: 0.2799222469329834
# to_coo: 0.22925591468811035
# comprehension & cast: 0.02356100082397461 (msgpack does not support all numpy formats)
# packing: 0.004893064498901367
# unpacking: 0.001984834671020508
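The per-element comprehension is only there because msgpack cannot pack numpy scalars directly. A possible way around the cast, sketched below under some assumptions (pack_coo/unpack_coo are hypothetical helpers, and the use_bin_type/raw flags require a reasonably recent msgpack-python), is to ship the raw array buffers together with their dtypes:
import msgpack
import numpy as np
from scipy.sparse import coo_matrix

def pack_coo(dfc, columns):
    # Pack the raw array buffers plus their dtypes in one shot,
    # instead of casting every element to a Python scalar.
    payload = {
        'columns': list(columns),
        'shape': list(dfc.shape),
        'row': dfc.row.tobytes(), 'row_dtype': str(dfc.row.dtype),
        'col': dfc.col.tobytes(), 'col_dtype': str(dfc.col.dtype),
        'data': dfc.data.tobytes(), 'data_dtype': str(dfc.data.dtype),
    }
    return msgpack.packb(payload, use_bin_type=True)

def unpack_coo(msg):
    d = msgpack.unpackb(msg, raw=False)
    # Rebuild the arrays as zero-copy views over the unpacked bytes.
    row = np.frombuffer(d['row'], dtype=d['row_dtype'])
    col = np.frombuffer(d['col'], dtype=d['col_dtype'])
    data = np.frombuffer(d['data'], dtype=d['data_dtype'])
    return coo_matrix((data, (row, col)), shape=tuple(d['shape'])), d['columns']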
From there, it seems one has to go back through a dense format to rebuild the SparseDataFrame:
start = time.time()
dfs = pd.SparseDataFrame(dfc.toarray())
stop = time.time()
stop - start
# 2.8947737216949463
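Depending on the installed pandas, the dense detour may be avoidable: from pandas 0.20 on, the SparseDataFrame constructor accepts a scipy sparse matrix directly (this sketch assumes such a version):
# Build the sparse frame straight from the coo_matrix, skipping toarray().
dfs = pd.SparseDataFrame(dfc, columns=columns, default_fill_value=0)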
The time overhead comes from the string handling around dumps and loads.
Using dumps/loads:
def pickle_dumps():
msg = list(pickle.dumps(dfs, protocol=pickle.HIGHEST_PROTOCOL))
pickle.loads(bytes(msg))
%timeit pickle_dumps()
# 212 ms ± 2.19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
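A large part of those 212 ms is arguably not pickle itself but the list(pickle.dumps(...)) / bytes(msg) round-trip, which expands the payload into one Python int per byte and back. A sketch that keeps the payload as a bytes object (pickle_bytes is a hypothetical helper):
def pickle_bytes():
    # Leave the serialized payload as bytes; no per-byte list conversion.
    msg = pickle.dumps(dfs, protocol=pickle.HIGHEST_PROTOCOL)
    return pickle.loads(msg)
%timeit pickle_bytes()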
Using dump/load with a file:
def pickle_file():
with open('dump.pickle', 'wb') as f:
pickle.dump(dfs, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('dump.pickle', 'rb') as f:
return pickle.load(f)
%timeit pickle_file()
# 82.7 ms ± 1.25 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Or, even shorter, using the pandas built-ins:
def to_pickle():
dfs.to_pickle('./dump.pickle')
pd.read_pickle('./dump.pickle')
%timeit to_pickle()
# 86.8 ms ± 1.54 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
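Since the msgpack figures above used compress='zlib', a compressed pickle may be the fairer comparison; a sketch, assuming a pandas version (0.20 or later) whose to_pickle/read_pickle accept a compression parameter:
def to_pickle_compressed():
    # gzip-compressed pickle: smaller payload at some CPU cost.
    dfs.to_pickle('./dump.pickle.gz', compression='gzip')
    return pd.read_pickle('./dump.pickle.gz', compression='gzip')
%timeit to_pickle_compressed()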
My test was flawed
dfs = pd.SparseDataFrame(scipy.sparse.random(1000, 1000).toarray())
does not actually store a sparse representation. Instead,
dfs = pd.DataFrame(scipy.sparse.random(1000, 1000).toarray()).to_sparse(fill_value=0)
does.
After this, pickle on the sparse representation performs better than msgpack on the dense representation.
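A quick way to check which representation you actually have is the density attribute; a minimal sketch, assuming a pandas version before 1.0 where SparseDataFrame still exists:
dense_backed = pd.SparseDataFrame(scipy.sparse.random(1000, 1000).toarray())
truly_sparse = pd.DataFrame(scipy.sparse.random(1000, 1000).toarray()).to_sparse(fill_value=0)
# The default fill_value is NaN, so every zero is stored explicitly.
print(dense_backed.density)  # ~1.0
# With fill_value=0 only the non-zero entries are stored.
print(truly_sparse.density)  # ~0.01 (scipy.sparse.random defaults to density=0.01)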
Also, I had used df.row instead of dfc.row; df points to a different dataframe. msgpack may have had the result in cache and was effectively doing nothing. After correcting this error, msgpack on the coo_matrix-based representation is no improvement over pickle on the dataframe.
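For reference, a sketch of the corrected conversion, using dfc rather than df and numpy's tolist() instead of the per-element comprehension:
dfc = dfs.to_coo()
# ndarray.tolist() yields native Python scalars in one call, which msgpack can pack.
row = dfc.row.tolist()
col = dfc.col.tolist()
data = dfc.data.tolist()
msg = msgpack.packb({'columns': list(dfs.columns), 'shape': dfc.shape,
                     'row': row, 'col': col, 'data': data})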