How to Prevent Memory from Filling Up When Reading a Large Folder of Images and Converting to H5 in Python
I'm trying to read in ~1.1M image/float pairs and save them in .h5 file chunks to be read in later by a Keras generator. As you can see below, I'm trying to tell Python to read in 25,000 images at a time and save them into the aforementioned .h5 files. For some reason my RAM fills up around the 22nd pass through the loop.
My calls to the GC lower memory usage a little after each loop, but I don't seem to be targeting the right variables to clear, because memory still goes two steps forward and only one step back.
Any help would be greatly appreciated!
def augment_clean_data_master(master_data):
    data_list = master_data
    print("Length of Data List: ", len(data_list))
    num_processors = 8
    print("Doing Flipped Image Data.")
    p = Pool(processes=num_processors)
    flipped_output = p_map(get_data_from_line_master_flipped, data_list)
    print("Doing OG Image Data.")
    clean_output = p_map(get_data_from_line_master, data_list)
    print("Merging Data.")
    aug = clean_output + flipped_output
    return aug
def augment_data_and_save_as_hdf5():
    # take master data, augment it, save in hdf5 format
    file_name = "./master/master.txt"
    # read in image name list
    with open(file_name) as f:
        data_list = f.readlines()
        # make sublists of [angle, name, speed]
        data_list = [x.split(',') for x in data_list]
        f.close()
    # make a for loop that goes in increments of 25000 & makes that data
    # & appends it to the .h5 array
    for i in range(int(len(data_list) / 25000)):
        # these two indexes are always 25000 apart
        start_index = i * 25000
        stop_index = (i + 1) * 25000
        print("*" * 30)
        print("Start Index: ", start_index / 25000)
        # read in the lines from the text data & pull the necessary images and angles
        partial_augmented_image_data_list = process_data_helpers.augment_clean_data_master(data_list[start_index:stop_index])
        # if we're at the last chunk, then go from the starting index to the end using a :
        if (i == int(len(data_list) / 25000) - 1):
            partial_augmented_image_data_list = process_data_helpers.augment_clean_data_master(data_list[start_index:])
        # make empty lists to store images temporarily
        X = []
        y = []
        # cull the error lines and pull out the images and angles
        for element in partial_augmented_image_data_list:
            try:
                if (element[1] == 'None'):
                    pass
                else:
                    X.append(element[0])
                    y.append(float(element[1]))
            except:
                pass
        X = np.array(X).reshape(-1, 66, 200, 3)
        y = np.array(y).reshape(len(y), 1)
        print("X Shape: ", X.shape)
        print("y Shape: ", y.shape)
        # if it's the first iteration of saving the data, then you have to instantiate the files
        # if start_index == 0:
        with h5py.File('./masterArrays/Data_Chunk_' + str(i) + '.h5', 'w') as hf:
            hf.create_dataset("X", data=X, compression="gzip", chunks=True, maxshape=(50000, 66, 200, 3))
            hf.create_dataset("y", data=y, compression="gzip", chunks=True, maxshape=(50000, 1))
        # if we're at the last chunk, then append from the starting index to the end
        '''
        else:
            with h5py.File('./masterArrays/Data.h5', 'a') as hf:
                hf["X"].resize((hf["X"].shape[0] + X.shape[0]), axis=0)
                hf["X"][-X.shape[0]:] = X
                hf["y"].resize((hf["y"].shape[0] + y.shape[0]), axis=0)
                hf["y"][-X.shape[0]:] = y
        '''
        # clearing ram manually here because the GC wasn't keeping up
        del X
        del y
        import gc
        gc.collect()
    print("Augmented & Saved Arrays.")
In the end I just wrapped the method that processes the images in chunks inside a for loop, and started and joined a separate process on each iteration. That way, when the process dies, it is far more likely to release the memory it was using. It seems to work well.
def mrMeeseeks(i, chunk_size, data_list):
    # these two indexes are always chunk_size apart
    start_index = i * chunk_size
    stop_index = (i + 1) * chunk_size
    print("*" * 30)
    print("Start Index: ", start_index / chunk_size)
    # read in the lines from the text data & pull the necessary images and angles
    partial_augmented_image_data_list = process_data_helpers.augment_clean_data_master(data_list[start_index:stop_index])
    # if we're at the last chunk, then go from the starting index to the end using a :
    if (i == int(len(data_list) / chunk_size) - 1):
        partial_augmented_image_data_list = process_data_helpers.augment_clean_data_master(data_list[start_index:])
    # make empty arrays to store images temporarily
    X = np.empty([chunk_size, 66, 200, 3])
    y = np.empty([chunk_size, 1])
    # cull the error lines and pull out the images and angles
    for count, element in enumerate(partial_augmented_image_data_list):
        try:
            if (element[1] == 'None'):
                print("BAD LINE, KEEP THE CHECK")
                pass
            else:
                X[count] = element[0]
                y[count] = float(element[1])
        except:
            pass
    X = X.reshape(-1, 66, 200, 3)
    y = y.reshape(len(y), 1)
    # if it's the first iteration of saving the data, then you have to instantiate the files
    # if start_index == 0:
    with h5py.File('./masterArrays/Data_Chunk_' + str(i) + '.h5', 'w') as hf:
        hf.create_dataset("X", data=X, compression="gzip", chunks=True, maxshape=(50000, 66, 200, 3))
        hf.create_dataset("y", data=y, compression="gzip", chunks=True, maxshape=(50000, 1))
    # if we're at the last chunk, then append from the starting index to the end
    '''
    else:
        with h5py.File('./masterArrays/Data.h5', 'a') as hf:
            hf["X"].resize((hf["X"].shape[0] + X.shape[0]), axis=0)
            hf["X"][-X.shape[0]:] = X
            hf["y"].resize((hf["y"].shape[0] + y.shape[0]), axis=0)
            hf["y"][-X.shape[0]:] = y
    '''
    print("X Shape: ", X.shape)
    print("y Shape: ", y.shape)
    print(X[0])
    print(y[0])
    print("((((((((((()))))))))))")
    print(X[1])
    print(y[1])
    # clearing ram manually here because the GC wasn't keeping up
    del X
    del y
    del partial_augmented_image_data_list
    import gc
    gc.collect()
import multiprocessing

def augment_data_and_save_as_hdf5():
    # take master data, augment it, save in hdf5 format
    file_name = "./master/master.txt"
    # read in image name list
    with open(file_name) as f:
        data_list = f.readlines()
        # make sublists of [angle, name, speed]
        data_list = [x.split(',') for x in data_list]
        f.close()
    chunk_size = 25000
    # make a for loop that goes in increments of chunk_size & makes that data
    # & appends it to the .h5 array
    for i in range(int(len(data_list) / chunk_size)):
        # start process
        p = multiprocessing.Process(target=mrMeeseeks, args=(i, chunk_size, data_list,))
        p.start()
        p.join()
    # add a .join command every 6 iterations so we can use only 6 cores and make sure each one works, then finishes.
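The closing comment above hints at letting a few workers run at once instead of one at a time. Below is a minimal sketch of that batching, assuming mrMeeseeks and the master.txt parsing stay exactly as in the answer; the name augment_data_and_save_as_hdf5_batched and the workers_per_batch parameter are illustrative additions, not part of the original code.

def augment_data_and_save_as_hdf5_batched(chunk_size=25000, workers_per_batch=6):
    # read and split the master list exactly as above
    with open("./master/master.txt") as f:
        data_list = [x.split(',') for x in f.readlines()]
    num_chunks = int(len(data_list) / chunk_size)
    # launch workers in batches so at most workers_per_batch chunks are in memory at once
    for batch_start in range(0, num_chunks, workers_per_batch):
        procs = []
        for i in range(batch_start, min(batch_start + workers_per_batch, num_chunks)):
            p = multiprocessing.Process(target=mrMeeseeks, args=(i, chunk_size, data_list))
            p.start()
            procs.append(p)
        # wait for every worker in this batch to exit (and release its memory) before starting the next batch
        for p in procs:
            p.join()

Because each chunk is handled by a short-lived child process, the operating system reclaims that process's memory as soon as it exits, which is the same reason the one-process-per-iteration version keeps RAM flat.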
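The question also mentions that these chunk files are meant to be read back later by a Keras generator. Purely as an illustration of that consumer side (H5ChunkSequence and all of its parameters are hypothetical and not from the original code), a keras.utils.Sequence that opens one Data_Chunk_*.h5 at a time keeps the full dataset out of RAM during training:

import glob
import h5py
import numpy as np
from tensorflow.keras.utils import Sequence

class H5ChunkSequence(Sequence):
    def __init__(self, chunk_dir="./masterArrays", batch_size=64):
        self.files = sorted(glob.glob(chunk_dir + "/Data_Chunk_*.h5"))
        self.batch_size = batch_size
        # number of batches each chunk file contributes
        self.batches_per_file = []
        for path in self.files:
            with h5py.File(path, "r") as hf:
                n = hf["X"].shape[0]
            self.batches_per_file.append(int(np.ceil(n / batch_size)))

    def __len__(self):
        return sum(self.batches_per_file)

    def __getitem__(self, idx):
        # map the global batch index to (file, batch-within-file)
        for file_idx, n_batches in enumerate(self.batches_per_file):
            if idx < n_batches:
                break
            idx -= n_batches
        # only one chunk file is ever open at a time
        with h5py.File(self.files[file_idx], "r") as hf:
            sl = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
            return hf["X"][sl], hf["y"][sl]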