如何将我自己的 class 对象存储到 hdf5 中?
How to store my own class object into hdf5?
我创建了一个 class 来保存我研究的实验结果(我是电子工程博士生),例如
class Trial:
    """Container for the results of one experimental trial."""

    def __init__(self, subID, triID):
        self.filePath = ''   # file path of the folder
        # Keep the identifiers passed by the caller instead of discarding them
        self.subID = subID   # int
        self.triID = triID   # int
        self.data_A = -1     # numpy array (placeholder until data is attached)
        self.data_B = -1     # numpy array (placeholder until data is attached)
......
它混合了许多 bool、int 和 numpy 数组,你应该明白我的意思。我读到如果数据采用 hdf5 格式,加载速度会更快。我的数据是一个由 Trial 对象组成的 Python 列表,我可以把它存成 hdf5 吗?
请注意,Stack Overflow 上有一个类似的问题(similar question)。但它只有一个答案,而且并没有回答问题:它只是把提问者(OP)的自定义 class 分解为基本数据类型,并将它们存储到单独的数据集中。我不反对这样做,但我想知道这是否是唯一的方法,因为它违背了面向对象的哲学。
这是我用来保存数据的一个小 class。你可以像下面这样使用它:
dc = DataContainer()
dc.trials = <your list of trial objects here>
dc.save('mydata.pkl')
然后这样加载:
dc = DataContainer.load('mydata.pkl')
这是 DataContainer 文件:
import gzip
import cPickle as pickle
# Simple container with load and save methods. Declare the container
# then add data to it. Save will save any data added to the container.
# The class automatically gzips the file if it ends in .gz
#
# Notes on size and speed (using UbuntuDialog data)
# pkl pkl.gz
# Save 11.4s 83.7s
# Load 4.8s 45.0s
# Size 596M 205M
#
class DataContainer(object):
    """Simple attribute container that can pickle itself to disk.

    Assign arbitrary attributes to an instance, call ``save`` to write the
    whole object out and ``load`` to read it back. Files whose name ends
    in ``gz`` are written and read through gzip.
    """

    @staticmethod
    def isGZIP(filename):
        """Return True if the last dot-separated part of *filename* is 'gz'."""
        return filename.split('.')[-1] == 'gz'

    @classmethod
    def _open(cls, filename, mode):
        """Open *filename* in *mode*, through gzip when the name calls for it."""
        opener = gzip.open if cls.isGZIP(filename) else open
        return opener(filename, mode)

    # Using HIGHEST_PROTOCOL is almost 2X faster and creates a file that
    # is ~10% smaller. Load times go down by a factor of about 3X.
    def save(self, filename='DataContainer.pkl'):
        """Pickle this object (and every attribute added to it) to *filename*."""
        # The context manager closes the file even if pickling fails.
        with self._open(filename, 'wb') as f:
            pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)

    # Note that loading to a string with pickle.loads is about 10% faster
    # but probably consumes a lot more memory so we'll skip that for now.
    @classmethod
    def load(cls, filename='DataContainer.pkl'):
        """Unpickle and return the object stored in *filename*.

        NOTE: unpickling can execute arbitrary code, so only load files
        that come from a trusted source.
        """
        with cls._open(filename, 'rb') as f:
            return pickle.load(f)
根据您的用例,您可以按照顶部所述将其用作基础 class,或者只需将 pickle.dump 行复制到您的代码中。
如果您的数据量确实很大,并且测试程序的每次运行并不会用到全部数据,那么还有其他选择(例如数据库);但如果每次运行都需要大部分数据,上述方法是最简单的选项。
我没有测试过以下解决方案的速度和存储效率。HDF5 支持“复合数据类型”(compound datatypes),可以与 numpy 的“结构化数组”(structured arrays)配合使用;后者支持混合的变量类型,例如 class 对象中常见的那种。
"""
Created on Tue Dec 10 21:26:54 2019
@author: Christopher J. Burke
Give a worked example of saving a list of class objects with mixed
storage types to a HDF5 file and reading in file back to a list of class
objects. The solution is inspired by this bug report
https://github.com/h5py/h5py/issues/735
and the numpy and hdf5 documentation
"""
import numpy as np
import h5py
class test_object:
    """Example record type that can be saved to and loaded from HDF5.

    The class-level ``store_*`` lists give, for every instance attribute
    to persist, its name, numpy type code and optional sub-array shape.
    They are combined into a numpy structured dtype, which is the dtype
    of the dataset written to the HDF5 file.
    """
    # Explicitly state the name, datatype and shape for every
    # class variable.
    # The names MUST exactly match the attribute names set in __init__
    store_names = ['a', 'b', 'c', 'd', 'e']
    store_types = ['i8', 'i4', 'f8', 'S80', 'f8']
    store_shapes = [None, None, None, None, [4]]
    # Make the tuples that will define the numpy structured array
    # https://docs.scipy.org/doc/numpy/user/basics.rec.html
    sz = len(store_names)
    # Scalar fields are (name, type); array fields add a shape entry.
    # zip() is the outermost iterable, so it is evaluated in class scope.
    store_def_tuples = [
        (name, typ) if shape is None else (name, typ, shape)
        for name, typ, shape in zip(store_names, store_types, store_shapes)
    ]
    # Actually define the numpy structured/compound data type
    store_struct_numpy_dtype = np.dtype(store_def_tuples)

    def __init__(self):
        self.a = 0
        self.b = 0
        self.c = 0.0
        self.d = '0'
        self.e = [0.0, 0.0, 0.0, 0.0]

    def store_objlist_as_hd5f(self, objlist, fileName):
        """Save a list of objects to an HDF5 file as one dataset named 'dset'.

        objlist - is a list of the test_objects
        fileName - is the h5 filename for output (opened in 'w' mode)
        """
        # Stage the records in a zero-initialised numpy structured array
        np_dset = np.zeros(len(objlist), dtype=self.store_struct_numpy_dtype)
        # Copy every stored attribute of every object into its field
        for i, curobj in enumerate(objlist):
            for name in self.store_names:
                np_dset[name][i] = getattr(curobj, name)
        # The context manager closes the file even if the write fails
        with h5py.File(fileName, 'w') as fp:
            fp.create_dataset('dset', data=np_dset)

    def fill_objlist_from_hd5f(self, fileName):
        """Read a file written by store_objlist_as_hd5f into a list of objects.

        fileName - is the h5 filename for input

        Returns a list with one test_object per stored record. Attribute
        values are the numpy values read from the file, so the 'S80' field
        comes back as bytes rather than str.
        """
        # Read the whole dataset into memory, then release the file handle
        with h5py.File(fileName, 'r') as fp:
            np_dset = fp['dset'][:]
        all_objs = []
        # Turn each record of the structured array back into an object
        for record in np_dset:
            tmp = test_object()
            for name in self.store_names:
                setattr(tmp, name, record[name])
            all_objs.append(tmp)
        return all_objs
if __name__ == '__main__':
    # Build a few objects filled with recognisable dummy data
    all_objs = []
    for i in range(3):
        tmp = test_object()
        tmp.a = int(i)
        tmp.b = int(i)
        tmp.c = float(i)
        tmp.d = '{0} {0} {0} {0}'.format(i)
        # np.float was removed in NumPy 1.24; the builtin float gives float64
        tmp.e = np.full([4], i, dtype=float)
        all_objs.append(tmp)
    # Write out hd5 file
    tmp.store_objlist_as_hd5f(all_objs, 'test_write.h5')
    # Read in hd5 file
    all_objs = tmp.fill_objlist_from_hd5f('test_write.h5')
    # Verify the output is as expected
    for i, curobj in enumerate(all_objs):
        print('Object {0:d}'.format(i))
        print('{0:d} {1:d} {2:f}'.format(curobj.a, curobj.b, curobj.c))
        # The 'S80' field is read back as bytes, hence the decode
        print('{0} {1}'.format(curobj.d.decode('ASCII'), curobj.e))
我创建了一个 class 来保存我研究的实验结果(我是电子工程博士生),例如
class Trial:
    """Container for the results of one experimental trial."""

    def __init__(self, subID, triID):
        self.filePath = ''   # file path of the folder
        # Keep the identifiers passed by the caller instead of discarding them
        self.subID = subID   # int
        self.triID = triID   # int
        self.data_A = -1     # numpy array (placeholder until data is attached)
        self.data_B = -1     # numpy array (placeholder until data is attached)
......
它混合了许多 bool、int 和 numpy 数组,你应该明白我的意思。我读到如果数据采用 hdf5 格式,加载速度会更快。我的数据是一个由 Trial 对象组成的 Python 列表,我可以把它存成 hdf5 吗?
请注意,Stack Overflow 上有一个类似的问题(similar question)。但它只有一个答案,而且并没有回答问题:它只是把提问者(OP)的自定义 class 分解为基本数据类型,并将它们存储到单独的数据集中。我不反对这样做,但我想知道这是否是唯一的方法,因为它违背了面向对象的哲学。
这是我用来保存数据的一个小 class。你可以像下面这样使用它:
dc = DataContainer()
dc.trials = <your list of trial objects here>
dc.save('mydata.pkl')
然后这样加载:
dc = DataContainer.load('mydata.pkl')
这是 DataContainer 文件:
import gzip
import cPickle as pickle
# Simple container with load and save methods. Declare the container
# then add data to it. Save will save any data added to the container.
# The class automatically gzips the file if it ends in .gz
#
# Notes on size and speed (using UbuntuDialog data)
# pkl pkl.gz
# Save 11.4s 83.7s
# Load 4.8s 45.0s
# Size 596M 205M
#
class DataContainer(object):
    """Simple attribute container that can pickle itself to disk.

    Assign arbitrary attributes to an instance, call ``save`` to write the
    whole object out and ``load`` to read it back. Files whose name ends
    in ``gz`` are written and read through gzip.
    """

    @staticmethod
    def isGZIP(filename):
        """Return True if the last dot-separated part of *filename* is 'gz'."""
        return filename.split('.')[-1] == 'gz'

    @classmethod
    def _open(cls, filename, mode):
        """Open *filename* in *mode*, through gzip when the name calls for it."""
        opener = gzip.open if cls.isGZIP(filename) else open
        return opener(filename, mode)

    # Using HIGHEST_PROTOCOL is almost 2X faster and creates a file that
    # is ~10% smaller. Load times go down by a factor of about 3X.
    def save(self, filename='DataContainer.pkl'):
        """Pickle this object (and every attribute added to it) to *filename*."""
        # The context manager closes the file even if pickling fails.
        with self._open(filename, 'wb') as f:
            pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)

    # Note that loading to a string with pickle.loads is about 10% faster
    # but probably consumes a lot more memory so we'll skip that for now.
    @classmethod
    def load(cls, filename='DataContainer.pkl'):
        """Unpickle and return the object stored in *filename*.

        NOTE: unpickling can execute arbitrary code, so only load files
        that come from a trusted source.
        """
        with cls._open(filename, 'rb') as f:
            return pickle.load(f)
根据您的用例,您可以按照顶部所述将其用作基础 class,或者只需将 pickle.dump 行复制到您的代码中。
如果您的数据量确实很大,并且测试程序的每次运行并不会用到全部数据,那么还有其他选择(例如数据库);但如果每次运行都需要大部分数据,上述方法是最简单的选项。
我没有测试过以下解决方案的速度和存储效率。HDF5 支持“复合数据类型”(compound datatypes),可以与 numpy 的“结构化数组”(structured arrays)配合使用;后者支持混合的变量类型,例如 class 对象中常见的那种。
"""
Created on Tue Dec 10 21:26:54 2019
@author: Christopher J. Burke
Give a worked example of saving a list of class objects with mixed
storage types to a HDF5 file and reading in file back to a list of class
objects. The solution is inspired by this bug report
https://github.com/h5py/h5py/issues/735
and the numpy and hdf5 documentation
"""
import numpy as np
import h5py
class test_object:
    """Example record type that can be saved to and loaded from HDF5.

    The class-level ``store_*`` lists give, for every instance attribute
    to persist, its name, numpy type code and optional sub-array shape.
    They are combined into a numpy structured dtype, which is the dtype
    of the dataset written to the HDF5 file.
    """
    # Explicitly state the name, datatype and shape for every
    # class variable.
    # The names MUST exactly match the attribute names set in __init__
    store_names = ['a', 'b', 'c', 'd', 'e']
    store_types = ['i8', 'i4', 'f8', 'S80', 'f8']
    store_shapes = [None, None, None, None, [4]]
    # Make the tuples that will define the numpy structured array
    # https://docs.scipy.org/doc/numpy/user/basics.rec.html
    sz = len(store_names)
    # Scalar fields are (name, type); array fields add a shape entry.
    # zip() is the outermost iterable, so it is evaluated in class scope.
    store_def_tuples = [
        (name, typ) if shape is None else (name, typ, shape)
        for name, typ, shape in zip(store_names, store_types, store_shapes)
    ]
    # Actually define the numpy structured/compound data type
    store_struct_numpy_dtype = np.dtype(store_def_tuples)

    def __init__(self):
        self.a = 0
        self.b = 0
        self.c = 0.0
        self.d = '0'
        self.e = [0.0, 0.0, 0.0, 0.0]

    def store_objlist_as_hd5f(self, objlist, fileName):
        """Save a list of objects to an HDF5 file as one dataset named 'dset'.

        objlist - is a list of the test_objects
        fileName - is the h5 filename for output (opened in 'w' mode)
        """
        # Stage the records in a zero-initialised numpy structured array
        np_dset = np.zeros(len(objlist), dtype=self.store_struct_numpy_dtype)
        # Copy every stored attribute of every object into its field
        for i, curobj in enumerate(objlist):
            for name in self.store_names:
                np_dset[name][i] = getattr(curobj, name)
        # The context manager closes the file even if the write fails
        with h5py.File(fileName, 'w') as fp:
            fp.create_dataset('dset', data=np_dset)

    def fill_objlist_from_hd5f(self, fileName):
        """Read a file written by store_objlist_as_hd5f into a list of objects.

        fileName - is the h5 filename for input

        Returns a list with one test_object per stored record. Attribute
        values are the numpy values read from the file, so the 'S80' field
        comes back as bytes rather than str.
        """
        # Read the whole dataset into memory, then release the file handle
        with h5py.File(fileName, 'r') as fp:
            np_dset = fp['dset'][:]
        all_objs = []
        # Turn each record of the structured array back into an object
        for record in np_dset:
            tmp = test_object()
            for name in self.store_names:
                setattr(tmp, name, record[name])
            all_objs.append(tmp)
        return all_objs
if __name__ == '__main__':
    # Build a few objects filled with recognisable dummy data
    all_objs = []
    for i in range(3):
        tmp = test_object()
        tmp.a = int(i)
        tmp.b = int(i)
        tmp.c = float(i)
        tmp.d = '{0} {0} {0} {0}'.format(i)
        # np.float was removed in NumPy 1.24; the builtin float gives float64
        tmp.e = np.full([4], i, dtype=float)
        all_objs.append(tmp)
    # Write out hd5 file
    tmp.store_objlist_as_hd5f(all_objs, 'test_write.h5')
    # Read in hd5 file
    all_objs = tmp.fill_objlist_from_hd5f('test_write.h5')
    # Verify the output is as expected
    for i, curobj in enumerate(all_objs):
        print('Object {0:d}'.format(i))
        print('{0:d} {1:d} {2:f}'.format(curobj.a, curobj.b, curobj.c))
        # The 'S80' field is read back as bytes, hence the decode
        print('{0} {1}'.format(curobj.d.decode('ASCII'), curobj.e))