Dataset size is smaller than memory; what's wrong with my code?

Below is part of my code. epoch = 300, and each npz file is 2.73 MB. My dataloader's batch size is 64 and there are 8 GPUs in total, so one global mini batch should be 64 × 8 × 2.73 MB ≈ 1.4 GB (see the quick check after the code), while my machine has 128 GB of RAM. Even if the data grows after decompression, it should not come close to 128 GB. The figure linked below shows all 128 GB of memory being occupied. How should I change my code?

class VimeoDataset(Dataset):
    def __init__(self, dataset_name, batch_size=64):
        self.batch_size = batch_size
        self.path = '/data/train_sample/dataset/'
        self.dataset_name = dataset_name
        #self.load_data()
        self.h = 256
        self.w = 448
        xx = np.arange(0, self.w).reshape(1, -1).repeat(self.h, 0)  # xx shape is (256, 448)
        yy = np.arange(0, self.h).reshape(-1, 1).repeat(self.w, 1)  # yy shape is (256, 448)
        self.grid = np.stack((xx, yy), 2).copy()

    def __len__(self):
        return len(self.meta_data)

    def getimg(self, index):
        f = np.load('/data/train_sample/dataset/' + str(index) + '.npz')
        if index < 8000:
            train_data = f['i0i1gt']
            flow_data = f['ft0ft1']
        elif 8000 <= index < 10000:
            val_data = f['i0i1gt']
        else:
            pass

        if self.dataset_name == 'train':
            meta_data = train_data
        else:
            meta_data = val_data

        data = meta_data
        img0 = data[0:3].transpose(1, 2, 0)
        img1 = data[3:6].transpose(1, 2, 0)
        gt = data[6:9].transpose(1, 2, 0)
        flow_gt = flow_data.transpose(1, 2, 0)
        return img0, gt, img1, flow_gt

    def __getitem__(self, index):
        img0, gt, img1, flow_gt = self.getimg(index)
        ...

dataset = VimeoDataset('train')

sampler = DistributedSampler(dataset)
train_data = DataLoader(dataset, batch_size=args.batch_size, num_workers=8, pin_memory=True,
                        drop_last=True, sampler=sampler)
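For reference, a quick check of the mini-batch arithmetic above:

# Quick check of the estimate: 64 per-GPU batch x 8 GPUs x 2.73 MB per npz file.
print(64 * 8 * 2.73 / 1024, 'GB per global mini batch')  # ~1.37 GB, nowhere near 128 GB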

[figure: system memory usage, showing all 128 GB occupied]

I have fixed your dataset below, based on our comments above. Essentially, you need to pass more variables into the class so that it can easily distinguish your training data from your validation data. It does not load all of the data into memory, although sometimes that is necessary (sequentially, not all at once), for example to compute some statistics over the dataset.
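If you did need such statistics, a minimal sketch of a sequential pass (the file pattern and the 'i0i1gt' key are assumed from your code) could look like this:

import numpy as np
from glob import glob

# Hypothetical example: accumulate a running mean over all samples
# while holding only one npz file in memory at a time.
total, count = 0.0, 0
for path in glob('/data/train_sample/dataset/*.npz'):
    data = np.load(path)['i0i1gt']   # key assumed from the code below
    total += data.astype(np.float64).sum()
    count += data.size
print('dataset mean:', total / count)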

Disclaimer: I have guessed at using glob to find your npz files, and that flow_data is not used for your validation set (it is missing from the validation branch of your code).

from glob import glob
import numpy as np
from torch.utils.data import Dataset

class VimeoDataset(Dataset):
    def __init__(self, npzs, batch_size=64, train_set=False):
        self.batch_size = batch_size
        self.train_set = train_set
        self.h = 256
        self.w = 448
        xx = np.arange(0, self.w).reshape(1, -1).repeat(self.h, 0)  # xx shape is (256, 448)
        yy = np.arange(0, self.h).reshape(-1, 1).repeat(self.w, 1)  # yy shape is (256, 448)
        self.grid = np.stack((xx, yy), 2).copy()
        self.npzs = npzs

    def __len__(self):
        return len(self.npzs)
    
    def getimg(self, index):
        f = np.load(self.npzs[index])
        data = f['i0i1gt']
        if self.train_set:
            flow_data = f['ft0ft1']
        else:
            # no ground-truth flow for validation; use zeros in the same
            # channel-first layout as the real flow so the transpose below works
            flow_data = np.zeros([4, self.h, self.w])
        img0 = data[0:3].transpose(1, 2, 0)
        img1 = data[3:6].transpose(1, 2, 0)
        gt = data[6:9].transpose(1, 2, 0)
        flow_gt = flow_data.transpose(1, 2, 0)
        return img0, gt, img1, flow_gt
    
    def __getitem__(self, index):
        img0, gt, img1, flow_gt = self.getimg(index)
        # any augmentation elided from your original __getitem__ would go here
        return img0, gt, img1, flow_gt

npzs = sorted(glob('/data/train_sample/dataset/*.npz'))  # glob order is arbitrary; sort for a stable split
train_val_split = 8000
train_dataset = VimeoDataset(npzs[:train_val_split], train_set=True)
val_dataset = VimeoDataset(npzs[train_val_split:])
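Wired back into your existing training setup, usage would look roughly like the sketch below (args.batch_size and the distributed initialization are assumed to come from your own script, as in your snippet):

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

# Mirrors the loader from the question; args.batch_size comes from your script.
sampler = DistributedSampler(train_dataset)
train_data = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=8,
                        pin_memory=True, drop_last=True, sampler=sampler)
val_data = DataLoader(val_dataset, batch_size=args.batch_size, num_workers=8,
                      pin_memory=True)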