在 Python 中读取大型图片文件夹并转换为 H5 文件时,如何防止内存被占满

How to Prevent Memory Filling Up When Reading in Large Folder of Images & Converting in H5 in Python

我正在尝试读入约 110 万个(图像,浮点数)对,并将它们分块保存到 .h5 文件中,以便稍后由 Keras 生成器读取。如下面的代码所示,我想每次读入 25,000 张图像并保存到一个 .h5 文件中。但不知为何,大约在第 22 次循环时,内存(RAM)就被占满了。

每次循环后调用 GC 会让内存占用略有下降,但我似乎没有清除正确的变量,因为内存占用总是“进两步、退一步”,整体仍在持续增长。


def augment_clean_data_master(master_data):
    """Build the augmented sample list for one batch of master-file records.

    Every record is processed twice through ``p_map``: once with
    ``get_data_from_line_master`` (original image) and once with
    ``get_data_from_line_master_flipped`` (flipped image).

    Args:
        master_data: Sequence of records taken from the master file; each
            record is handed unchanged to the two per-line loaders.

    Returns:
        The original-image results followed by the flipped-image results,
        joined with ``+`` (assumes ``p_map`` returns lists -- TODO confirm).
    """
    print("Length of Data List: ", len(master_data))

    # No multiprocessing.Pool is created here: p_map is the only call that
    # dispatches work, and an extra pool that is never closed or joined would
    # leave its worker processes behind on every call.
    # NOTE(review): assumes p_map manages its own workers -- confirm.
    print("Doing Flipped Image Data.")
    flipped_output = p_map(get_data_from_line_master_flipped, master_data)
    print("Doing OG Image Data.")
    clean_output = p_map(get_data_from_line_master, master_data)
    print("Merging Data.")

    return clean_output + flipped_output

def augment_data_and_save_as_hdf5(chunk_size=25000):
    """Augment the master data chunk by chunk and save each chunk as HDF5.

    Reads ``./master/master.txt``, splits every line on commas, and processes
    the records in groups of ``chunk_size`` lines. Each group is augmented with
    ``process_data_helpers.augment_clean_data_master``; records whose label is
    the string ``'None'`` are dropped. The remaining images and labels are
    written to ``./masterArrays/Data_Chunk_<i>.h5`` and also appended to
    ``./masterArrays/Data.h5``.

    The number of chunks is ``len(lines) // chunk_size``; the last chunk
    absorbs any remaining lines. A non-empty file shorter than one chunk is
    processed as a single chunk.

    Args:
        chunk_size: Number of master-file lines per chunk (default 25000).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if a kept label
            cannot be converted to ``float``.
    """
    import gc

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    file_name = "./master/master.txt"

    # read in image name list as sublists of [angle, name, speed]
    with open(file_name) as f:
        data_list = [line.split(',') for line in f]

    total_lines = len(data_list)
    # Only full chunks are counted; the last one runs to the end of the list.
    num_chunks = max(1, total_lines // chunk_size) if total_lines else 0

    for i in range(num_chunks):
        start_index = i * chunk_size
        if i == num_chunks - 1:
            stop_index = total_lines
        else:
            stop_index = start_index + chunk_size

        print("Start Index: ", start_index / chunk_size)
        # Augment exactly once per chunk; the slice bounds already account
        # for the last chunk running to the end of the list.
        partial_augmented_image_data_list = (
            process_data_helpers.augment_clean_data_master(
                data_list[start_index:stop_index]
            )
        )

        # cull the error lines and pull out the images and angles
        X = []
        y = []
        for element in partial_augmented_image_data_list:
            if element[1] == 'None':
                continue
            X.append(element[0])
            y.append(float(element[1]))

        # The augmented list is no longer needed once X and y are filled.
        del partial_augmented_image_data_list

        X = np.array(X).reshape(-1, 66, 200, 3)
        y = np.array(y).reshape(len(y), 1)

        print("X Shape: ", X.shape)
        print("y Shape: ", y.shape)

        # Keep the 50000-row limit, but never let it be smaller than the data
        # being written (the last chunk can be larger than the others).
        row_limit = max(50000, X.shape[0])
        with h5py.File('./masterArrays/Data_Chunk_' + str(i) + '.h5', 'w') as hf:
            hf.create_dataset("X", data=X, compression="gzip", chunks=True,
                              maxshape=(row_limit, 66, 200, 3))
            hf.create_dataset("y", data=y, compression="gzip", chunks=True,
                              maxshape=(row_limit, 1))

        # Append to the combined file; on first use the datasets do not exist
        # yet, so they are created as resizable instead of raising KeyError.
        with h5py.File('./masterArrays/Data.h5', 'a') as hf:
            for key, block in (("X", X), ("y", y)):
                if key in hf:
                    dataset = hf[key]
                    old_rows = dataset.shape[0]
                    dataset.resize(old_rows + block.shape[0], axis=0)
                    dataset[old_rows:] = block
                else:
                    hf.create_dataset(key, data=block, compression="gzip",
                                      chunks=True,
                                      maxshape=(None,) + block.shape[1:])

        # Drop the large arrays and force a collection before the next chunk
        # so peak memory stays close to a single chunk.
        del X
        del y
        gc.collect()

    print("Augmented & Saved Arrays.")

最后,我把处理图像的代码封装成一个按块处理的函数,并在 for 循环的每次迭代中启动并 join 一个进程。这样,当进程结束时,它占用的内存更有可能被操作系统回收。看起来效果不错。

def mrMeeseeks(i, chunk_size, data_list):
    """Augment chunk ``i`` of ``data_list`` and save it to HDF5.

    The chunk covers ``data_list[i * chunk_size:(i + 1) * chunk_size]``; when
    ``i`` is the last full-chunk index it runs to the end of the list. The
    slice is augmented with ``process_data_helpers.augment_clean_data_master``.
    Records whose label is the string ``'None'`` are reported and dropped. The
    remaining images and labels are written to
    ``./masterArrays/Data_Chunk_<i>.h5`` and appended to
    ``./masterArrays/Data.h5``.

    Args:
        i: Zero-based chunk index.
        chunk_size: Number of master-file records per chunk.
        data_list: Full list of parsed master-file records.

    Raises:
        ValueError: If a kept label cannot be converted to ``float``.
    """
    import gc

    start_index = i * chunk_size
    # The last full chunk also takes whatever records remain after it.
    if i == len(data_list) // chunk_size - 1:
        stop_index = len(data_list)
    else:
        stop_index = start_index + chunk_size

    print("Start Index: ", start_index/chunk_size)
    # Augment exactly once; the slice bounds already cover the last-chunk case.
    partial_augmented_image_data_list = (
        process_data_helpers.augment_clean_data_master(
            data_list[start_index:stop_index]
        )
    )

    # Size the buffers from the augmented list itself, since it need not
    # contain exactly chunk_size entries.
    capacity = len(partial_augmented_image_data_list)
    X = np.empty([capacity, 66, 200, 3])
    y = np.empty([capacity, 1])

    # cull the error lines and pull out the images and angles
    kept = 0
    for element in partial_augmented_image_data_list:
        if element[1] == 'None':
            print("FUCKED UP, KEEP THE CHECK")
            continue
        X[kept] = element[0]
        y[kept] = float(element[1])
        kept += 1

    # Trim the rows reserved for culled records, which were never filled.
    X = X[:kept]
    y = y[:kept]

    # Keep the 50000-row limit, but never let it be smaller than the data
    # being written.
    row_limit = max(50000, X.shape[0])
    with h5py.File('./masterArrays/Data_Chunk_' + str(i) + '.h5', 'w') as hf:
        hf.create_dataset("X", data=X, compression="gzip", chunks=True,
                          maxshape=(row_limit, 66, 200, 3))
        hf.create_dataset("y", data=y, compression="gzip", chunks=True,
                          maxshape=(row_limit, 1))

    # Append to the combined file, creating resizable datasets on first use.
    # NOTE(review): nothing here coordinates access to Data.h5; if several
    # workers can run at the same time, confirm the caller serialises them.
    with h5py.File('./masterArrays/Data.h5', 'a') as hf:
        for key, block in (("X", X), ("y", y)):
            if key in hf:
                dataset = hf[key]
                old_rows = dataset.shape[0]
                dataset.resize(old_rows + block.shape[0], axis=0)
                dataset[old_rows:] = block
            else:
                hf.create_dataset(key, data=block, compression="gzip",
                                  chunks=True,
                                  maxshape=(None,) + block.shape[1:])

    print("X Shape: ", X.shape)
    print("y Shape: ", y.shape)

    # Release the large objects explicitly and force a collection.
    del X
    del y
    del partial_augmented_image_data_list
    gc.collect()

import multiprocessing

def augment_data_and_save_as_hdf5():
    #take master data, augment it, save in hdf5 format
    file_name = "./master/master.txt"

    #read in image name list
    with open(file_name) as f:
        data_list = f.readlines()
        #make sublists of [angle, name, speed]
        data_list = [x.split(',') for x in data_list]

    chunk_size = 25000

    #make a for loop that goes in increments of chunk_size & makes that data
    #& appends it to the .h5 array
    for i in range(int(len(data_list)/chunk_size)):
        #start process
        p = multiprocessing.Process(target=mrMeeseeks, args=(i,chunk_size,data_list,))
        #add a .join command every 6 iterations so we can use only 6 cores and make sure each one works then finishes.