Dataset size is smaller than memory, so what's wrong with my code?
Below is part of my code. epoch = 300, and each npz file is 2.73 MB. I give my dataloader a batch size of 64 and there are 8 GPUs in total, so one mini-batch should be about 64 × 8 × 2.73 MB ≈ 1.4 GB, while the machine has 128 GB of RAM. Even if the data grows after decompression, it should come nowhere near 128 GB, yet the linked figure below shows all 128 GB of memory being used. How should I change my code?
class VimeoDataset(Dataset):
    def __init__(self, dataset_name, batch_size=64):
        self.batch_size = batch_size
        self.path = '/data/train_sample/dataset/'
        self.dataset_name = dataset_name
        #self.load_data()
        self.h = 256
        self.w = 448
        xx = np.arange(0, self.w).reshape(1,-1).repeat(self.h,0)  # xx shape is (256,448)
        yy = np.arange(0, self.h).reshape(-1,1).repeat(self.w,1)  # yy shape is (256,448)
        self.grid = np.stack((xx,yy),2).copy()

    def __len__(self):
        return len(self.meta_data)

    def getimg(self, index):
        f = np.load('/data/train_sample/dataset/' + str(index) + '.npz')
        if index < 8000:
            train_data = f['i0i1gt']
            flow_data = f['ft0ft1']
        elif 8000 <= index < 10000:
            val_data = f['i0i1gt']
        else:
            pass
        if self.dataset_name == 'train':
            meta_data = train_data
        else:
            meta_data = val_data
        data = meta_data
        img0 = data[0:3].transpose(1, 2, 0)
        img1 = data[3:6].transpose(1, 2, 0)
        gt = data[6:9].transpose(1, 2, 0)
        flow_gt = flow_data.transpose(1, 2, 0)
        return img0, gt, img1, flow_gt

    def __getitem__(self, index):
        img0, gt, img1, flow_gt = self.getimg(index)
        ...

dataset = VimeoDataset('train')
sampler = DistributedSampler(dataset)
train_data = DataLoader(dataset, batch_size=args.batch_size, num_workers=8, pin_memory=True,
                        drop_last=True, sampler=sampler)
[Figure: system memory usage, showing all 128 GB of RAM in use]
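A quick sanity check of the arithmetic above, using only the numbers quoted in the question:

# Rough footprint of one mini-batch, from the sizes quoted in the post.
npz_size_mb = 2.73        # one .npz file on disk, in MB
batch_per_gpu = 64
num_gpus = 8
minibatch_mb = npz_size_mb * batch_per_gpu * num_gpus
print(f"{minibatch_mb:.0f} MB ~= {minibatch_mb / 1024:.2f} GB")   # 1398 MB ~= 1.37 GB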
I have fixed your dataset based on our comments above. Essentially, you need to pass a few more variables into the class so that it can easily tell your training data apart from your validation data. It does not load all of the data into memory, although sometimes that is necessary (sequentially, rather than all at once), e.g. to compute some statistics over the data.
Disclaimer: I am guessing at using glob to find your npz files, and that you do not use flow_data in your validation set (it is missing from the validation branch of your code).
import numpy as np
from glob import glob
from torch.utils.data import Dataset

class VimeoDataset(Dataset):
    def __init__(self, npzs, batch_size=64, train_set=False):
        self.batch_size = batch_size
        self.train_set = train_set
        self.h = 256
        self.w = 448
        xx = np.arange(0, self.w).reshape(1,-1).repeat(self.h,0)  # xx shape is (256,448)
        yy = np.arange(0, self.h).reshape(-1,1).repeat(self.w,1)  # yy shape is (256,448)
        self.grid = np.stack((xx,yy),2).copy()
        self.npzs = npzs  # list of .npz paths; only the paths are kept in memory

    def __len__(self):
        return len(self.npzs)

    def getimg(self, index):
        # Load just the one file this sample needs, so memory use stays
        # bounded by the batches currently being assembled.
        f = np.load(self.npzs[index])
        data = f['i0i1gt']
        if self.train_set:
            flow_data = f['ft0ft1']
        else:
            # No ground-truth flow for validation; use zeros of the matching
            # channel-first shape so the transpose below still works.
            flow_data = np.zeros([4, self.h, self.w])
        img0 = data[0:3].transpose(1, 2, 0)
        img1 = data[3:6].transpose(1, 2, 0)
        gt = data[6:9].transpose(1, 2, 0)
        flow_gt = flow_data.transpose(1, 2, 0)
        return img0, gt, img1, flow_gt

    def __getitem__(self, index):
        img0, gt, img1, flow_gt = self.getimg(index)
        # ... (any augmentation / tensor conversion from your original code goes here)
        return img0, gt, img1, flow_gt
npzs = glob('/data/train_sample/dataset/*.npz')
train_val_split = 8000
train_dataset = VimeoDataset(npzs[:train_val_split], train_set=True)
val_dataset = VimeoDataset(npzs[train_val_split:])
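For completeness, here is how these two datasets might be plugged back into the DataLoader / DistributedSampler setup from your question. This is a minimal sketch: args.batch_size and the distributed process-group initialisation are assumed to come from your surrounding training script.

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

# Train loader mirrors the question's setup; validation needs no sampler.
sampler = DistributedSampler(train_dataset)
train_data = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=8,
                        pin_memory=True, drop_last=True, sampler=sampler)
val_data = DataLoader(val_dataset, batch_size=args.batch_size, num_workers=8,
                      pin_memory=True)

One more note: glob returns paths in arbitrary order, so you may want sorted(glob(...)) or a seeded shuffle to keep the 8000/2000 train/validation split reproducible across runs.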