Batch reading and writing: from text file to HDF5 in Python
The goal is to feed a large dataset to TensorFlow. I arrived at the implementation below. However, even though HDF5 I/O is supposed to be very fast, my implementation is slow. Is this because I am not using the chunks feature? I can't seem to work out a suitable chunk shape. Should I treat it as a third dimension, like (4096, 7, 1000) for a chunk size of 1000?
Note that I could simplify the code below further by finding a solution for a single generator. However, I think the data/label combination is common and may be useful to others.
I created two generators with the following function, one for the data and one for the corresponding labels.
import io
import os
import h5py
import numpy as np

def read_chunks(file, dim, batch_size=batch_size):
    chunk = np.empty(dim,)
    current_size = 1
    # read input file line by line
    for line in file:
        current_size += 1
        # build chunk
        chunk = np.vstack((chunk, np.genfromtxt(io.BytesIO(line.encode()))))
        # reached batch size
        if current_size == batch_size:
            yield chunk
            # reset counters
            current_size = 1
            chunk = np.empty(dim,)
Then I want to move the data and labels produced by these generators into HDF5.
def write_h5(data_gen, label_gen, out_file, batch_size, h5_batch_size, data_dtype, label_dtype):
    # remove existing file
    if os.path.isfile(out_file):
        os.remove(out_file)
    with h5py.File(out_file, 'a') as f:
        # create a dataset and a labelset in the same file
        d = f.create_dataset('data', (batch_size, data_dim), maxshape=(None, data_dim), dtype=data_dtype)
        l = f.create_dataset('label', (batch_size, label_dim), maxshape=(None, label_dim), dtype=label_dtype)
        # use the generators to fill both sets
        for data in data_gen:
            d.resize(d.shape[0] + batch_size, axis=0)
            d[-batch_size:] = data
            l.resize(l.shape[0] + batch_size, axis=0)
            l[-batch_size:] = next(label_gen)
Using the following constants, I combine the two functions like this:
batch_size = 4096
h5_batch_size = 1000
data_dim = 7    # [NUM_POINT, 9]
label_dim = 1   # [NUM_POINT]
data_dtype = 'float32'
label_dtype = 'uint8'

for data_file, label_file in data_label_files:
    print(data_file)
    with open(data_file, 'r') as data_f, open(label_file, 'r') as label_f:
        data_gen = read_chunks(data_f, dim=data_dim)
        label_gen = read_chunks(label_f, dim=label_dim)
        out_file = data_file[:-4] + '.h5'
        write_h5(data_gen, label_gen, out_file, batch_size, h5_batch_size, data_dtype, label_dtype)
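Side note on the chunk-shape question at the top: as far as I know, h5py's chunks argument must have the same rank as the dataset itself, so for these 2-D (N, 7) datasets a chunk of 1000 rows would be a 2-D tuple, not a third dimension like (4096, 7, 1000). A minimal sketch (the file name and the 1000-row chunk size are illustrative assumptions, not tuned values):

import h5py

data_dim = 7
# chunk shape has the same rank as the 2-D dataset: 1000 rows x data_dim columns
with h5py.File('chunk_demo.h5', 'w') as f:  # hypothetical file name
    d = f.create_dataset('data', (0, data_dim), maxshape=(None, data_dim),
                         dtype='float32', chunks=(1000, data_dim))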
The problem is not that HDF5 is slow. The problem is that you are reading one line at a time in a Python loop, calling genfromtxt() once per line! That function is designed to read entire files. On top of that, you use the anti-pattern array = vstack((array, newstuff)) inside the same loop.
In short, your performance problem starts here:
chunk = np.vstack((chunk, np.genfromtxt(io.BytesIO(line.encode()))))
You should read the whole file at once. If that is not possible, read it in large blocks (you can cap the number of lines per read, e.g. 1,000,000 lines).
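A minimal sketch of that suggestion, assuming the same 2-D layout as above: parse many lines per numpy call instead of one genfromtxt() call per line, and append each parsed block to resizable HDF5 datasets with a single slice assignment. The helper names (read_blocks, convert) and the max_rows value are illustrative, not part of the original code:

import io
import itertools
import h5py
import numpy as np

def read_blocks(file, dim, max_rows=1_000_000):
    # yield (n, dim) arrays, parsing up to max_rows text lines per numpy call
    while True:
        lines = list(itertools.islice(file, max_rows))
        if not lines:
            break
        block = np.loadtxt(io.StringIO(''.join(lines)), ndmin=2)
        yield block.reshape(-1, dim)

def convert(data_file, label_file, out_file, data_dim=7, label_dim=1):
    with open(data_file) as df, open(label_file) as lf, h5py.File(out_file, 'w') as f:
        d = f.create_dataset('data', (0, data_dim), maxshape=(None, data_dim),
                             dtype='float32', chunks=(4096, data_dim))
        l = f.create_dataset('label', (0, label_dim), maxshape=(None, label_dim),
                             dtype='uint8', chunks=(4096, label_dim))
        for data, label in zip(read_blocks(df, data_dim), read_blocks(lf, label_dim)):
            n = data.shape[0]
            d.resize(d.shape[0] + n, axis=0)
            d[-n:] = data
            l.resize(l.shape[0] + n, axis=0)
            l[-n:] = label

If the whole file fits in memory, a single np.genfromtxt(data_file) call followed by one dataset write is simpler still.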