Performance of loading and storing sparse data with numpy
I came up with a custom sparse data format. It is meant for space-efficient storage and loading of the data, not for doing computations on it. The essence is to store the indices and values of the nonzero entries. I was wondering whether there are some tweaks that could improve performance.
The data to be handled looks like this:
N "images" (32x32), each with four channels. On average the images contain about 5% nonzero values. As N grows very large, keeping all images dense in RAM becomes inefficient, so only the number of nonzero entries per image, their indices and values, and the original shape are stored.
Here is an example of how this can be done:
import numpy as np

def disassemble_data(data):
    # get some dense data and make it sparse
    lengths = np.count_nonzero(data, axis=(1, 2, 3))
    idxs = np.flatnonzero(data)
    vals = data.ravel()[idxs]
    return lengths, idxs, vals, data.shape

def assemble_data(lengths, idxs, vals, shape):
    # get some sparse data and make it dense
    data = np.zeros(shape)
    lower_idx = 0
    for length in lengths:
        upper_idx = lower_idx + length
        data.ravel()[idxs[lower_idx:upper_idx]] = vals[lower_idx:upper_idx]
        lower_idx = upper_idx
    return data

# Create some dummy data
my_data = np.random.uniform(0, 1, (10, 4, 32, 32))
my_data[my_data > 0.05] = 0

# make data sparse and then dense again
my_new_data = assemble_data(*disassemble_data(my_data))

# assert that this actually works
assert np.allclose(my_data, my_new_data)
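To get a feeling for the space savings, here is a quick sketch using the arrays returned above (plain numpy only; the ratio depends on the int64 indices and float64 values used here):

lengths, idxs, vals, shape = disassemble_data(my_data)
dense_bytes = my_data.nbytes
sparse_bytes = lengths.nbytes + idxs.nbytes + vals.nbytes
# with ~5% nonzeros, 8-byte indices and 8-byte values this is roughly
# 0.05 * (8 + 8) / 8 = 10% of the dense size (plus 8 bytes per image for lengths)
print(dense_bytes, sparse_bytes, sparse_bytes / dense_bytes)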
Now we can see the advantage right away: the data is encoded image by image. That allows us to load the whole dataset into RAM and generate dense images on demand via a generator:
def image_generator(lengths, idxs, vals, shape):
    # reduce the flat indices to offsets within a single image
    idxs %= np.prod(shape[1:])
    lower_idx = 0
    for length in lengths:
        upper_idx = lower_idx + length
        data = np.zeros(shape[1:])
        data.ravel()[idxs[lower_idx:upper_idx]] = vals[lower_idx:upper_idx]
        lower_idx = upper_idx
        yield data
It is also possible to generate batches of images:
def image_batch_generator(lengths, idxs, vals, shape, batch_size):
    # reduce the flat indices to offsets within a single batch
    idxs %= np.prod((batch_size, *shape[1:]))
    # sum the per-image lengths over each batch (N must be divisible by batch_size)
    lengths = np.sum(lengths.reshape(-1, batch_size), axis=1)
    lower_idx = 0
    for length in lengths:
        upper_idx = lower_idx + length
        data = np.zeros((batch_size, *shape[1:]))
        data.ravel()[idxs[lower_idx:upper_idx]] = vals[lower_idx:upper_idx]
        lower_idx = upper_idx
        yield data
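A minimal usage sketch of both generators with the dummy data from above (assuming the number of images is divisible by batch_size, which lengths.reshape(-1, batch_size) requires, and passing copies of idxs because the generators modify it in place via %=):

lengths, idxs, vals, shape = disassemble_data(my_data)

singles = np.stack(list(image_generator(lengths, idxs.copy(), vals, shape)))
assert np.allclose(singles, my_data)

batches = np.concatenate(list(image_batch_generator(lengths, idxs.copy(), vals, shape, batch_size=5)))
assert np.allclose(batches, my_data)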
This is a fairly convenient way to cover my needs. However, I am wondering whether it could be sped up.
For example, I have seen that numpy's itemset is faster than direct assignment (according to the docs), but it only works for single items, not for arrays of indices.
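For reference, the closest array-index counterpart I am aware of is np.put, which writes into the flattened array just like the assignments above; a tiny sketch (I have not verified whether it is actually faster than fancy-indexed assignment):

import numpy as np

data = np.zeros((4, 32, 32))
flat_idx = np.array([0, 5, 4095])   # flat indices into the 4*32*32 array
values = np.array([1.0, 2.0, 3.0])

# np.put is equivalent to data.ravel()[flat_idx] = values here
np.put(data, flat_idx, values)
assert np.allclose(data.ravel()[flat_idx], values)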
Are there other approaches? I am not at all familiar with cython and the like, so I would be happy about any hints!
I did some testing on how to do this more efficiently and came to the conclusion that, for the highly uncorrelated data produced by np.random.uniform, your approach is not bad at all. On real data this may look a bit different.
I improved the speed of your functions to roughly 1.4 GB/s for compression and 1.2 GB/s for decompression, which is not bad at all. With h5py (blosclz) I only reach about 450 MB/s, but that also writes the data to disk.
Improved sparse algorithm
import time

import numpy as np
import numba as nb

# We can use uint16 indices for shape (4, 32, 32), since max. idx < 2**16
@nb.jit()
def to_sparse_data_uint16(data):
    data_flat = data.reshape(-1)
    idx = np.empty(data.size, dtype=np.uint16)
    data_out = np.empty(data.size, dtype=data.dtype)
    ii = 0
    for i in range(data_flat.shape[0]):
        if data_flat[i] != 0:
            idx[ii] = i
            data_out[ii] = data_flat[i]
            ii += 1
    return idx[0:ii], data_out[0:ii], data.shape

def to_dense_data(idx, data, shape):
    length = np.prod(shape)
    data_out = np.zeros(length, dtype=data.dtype)
    data_out[idx] = data
    return data_out.reshape(shape)

########################
# do you really need float64 here?
images = np.array(np.random.uniform(0, 1, (100000, 4, 32, 32)), dtype=np.float32)
images[images > 0.05] = 0.

res = []
t1 = time.time()
for i in range(100000):
    res.append(to_sparse_data_uint16(images[i, :, :, :]))
print(time.time() - t1)

t1 = time.time()
for i in range(100000):
    data = to_dense_data(res[i][0], res[i][1], res[i][2])
print(time.time() - t1)
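A small sketch of how the per-image tuples collected in res could be consumed lazily, analogous to image_generator from the question (the generator name is just for illustration):

def sparse_image_generator(res):
    # res holds (idx, vals, shape) tuples as returned by to_sparse_data_uint16
    for idx, vals, shape in res:
        yield to_dense_data(idx, vals, shape)

for img in sparse_image_generator(res[:10]):
    assert img.shape == (4, 32, 32)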
HDF5 example
import numpy as np
import tables  # register blosc
import h5py as h5
import h5py_cache as h5c
import time

# I assume here that you don't need float64 for the images...
# 1650 MB of test data
images = np.array(np.random.uniform(0, 1, (100000, 4, 32, 32)), dtype=np.float32)
images[images > 0.05] = 0.

# Write data (32.7 GB uncompressed)
hdf5_path = 'Test.h5'
f = h5c.File(hdf5_path, 'w', chunk_cache_mem_size=1024**2*100)  # 100 MB chunk cache
dset_images = f.create_dataset("images", shape=(20*100000, 4, 32, 32), dtype=np.float32,
                               chunks=(1000, 4, 32, 32), compression=32001,
                               compression_opts=(0, 0, 0, 0, 9, 1, 1), shuffle=False)

t1 = time.time()
# Don't call h5py too often, this will lead to bad performance
for i in range(20):
    dset_images[i*100000:(i+1)*100000, :, :, :] = images
f.close()

print(time.time() - t1)
print("MB/s: " + str(32700/(time.time()-t1)))