每批增加 GPU 内存(PyTorch)

GPU memory increasing at each batch (PyTorch)

我正在尝试使用 ConvLSTM 层(LSTM 单元但使用卷积而不是矩阵乘法)构建卷积网络,但问题是我的 GPU 内存在每批次都会增加,即使我正在删除变量,并且获得每次迭代的损失(而不是图形)的真实值。我可能做错了什么,但是完全相同的脚本 运行 与另一个模型没有问题(具有更多参数并且还使用了 ConvLSTM 层)。

每批由 num_batch x 3 张图像(灰度)组成,我正在尝试预测差异 |Im(t+1)-Im(t)|输入 Im(t)

def main():
    config = Config()

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size, num_workers=0, shuffle=True, drop_last=True)
    
    nb_img = len(train_dataset)
    util.clear_progress_dir()

    step_tensorboard = 0
    ###################################
    #          Model Setup            #
    ###################################

    model = fully_convLSTM()
    if torch.cuda.is_available():   
        model = model.float().cuda()

    lr = 0.001
    optimizer = torch.optim.Adam(model.parameters(),lr=lr) 

    util.enumerate_params([model])

    ###################################
    #          Training Loop          #
    ###################################

    model.train() #Put model in training mode

    train_loss_recon = []
    train_loss_recon2 = []
    
    for epoch in tqdm(range(config.num_epochs)):
  
        running_loss1 = 0.0
        running_loss2 = 0.0

        for i, (inputs, outputs) in enumerate(train_dataloader, 0):
            print(i)
            torch.cuda.empty_cache()
            gc.collect()
           
           # if torch.cuda.is_available():
            inputs  = autograd.Variable(inputs.float()).cuda()
            outputs = autograd.Variable(outputs.float()).cuda()

            im1 =  inputs[:,0,:,:,:]
            im2 =  inputs[:,1,:,:,:]
            im3 =  inputs[:,2,:,:,:]
            
            diff1 = torch.abs(im2 - im1).cuda().float()
            diff2 = torch.abs(im3 - im2).cuda().float()

            model.initialize_hidden()
            
            optimizer.zero_grad()
            pred1 = model.forward(im1)  
            loss = reconstruction_loss(diff1, pred1)
            loss.backward()
            # optimizer.step()
           
            model.update_hidden()
            
            optimizer.zero_grad()
            pred2 = model.forward(im2)  
            loss2 = reconstruction_loss(diff2, pred2)   
            loss2.backward()   
            optimizer.step()

            model.update_hidden()

            ## print statistics
      
            running_loss1 += loss.detach().data
            running_loss2 += loss2.detach().data
            
            if i==0:

                with torch.no_grad():
                    img_grid_diff_true = (diff2).cpu()
                    img_grid_diff_pred = (pred2).cpu()
                    
                    f, axes = plt.subplots(2, 4, figsize=(48,48))
                    for l in range(4):
                        axes[0, l].imshow(img_grid_diff_true[l].squeeze(0).squeeze(0), cmap='gray')
                        axes[1, l].imshow(img_grid_diff_pred[l].squeeze(0).squeeze(0), cmap='gray')

                    plt.show()
                    plt.close()
           
                    writer_recon_loss.add_scalar('Reconstruction loss', running_loss1, step_tensorboard)
                    writer_recon_loss2.add_scalar('Reconstruction loss2', running_loss2, step_tensorboard)

                    step_tensorboard += 1
            
            del pred1
            del pred2
            del im1
            del im2
            del im3
            del diff1
            del diff2#, im1_noised, im2_noised
            del inputs
            del outputs
            del loss
            del loss2
            for obj in gc.get_objects():
                if torch.is_tensor(obj) :
                    del obj
        
            torch.cuda.empty_cache()
            gc.collect()
     
        epoch_loss = running_loss1 / len(train_dataloader.dataset)
        epoch_loss2 = running_loss2/ len(train_dataloader.dataset)
        print(f"Epoch {epoch} loss reconstruction1: {epoch_loss:.6f}")
        print(f"Epoch {epoch} loss reconstruction2: {epoch_loss2:.6f}")
        
        train_loss_recon.append(epoch_loss)
        train_loss_recon2.append(epoch_loss2)
        
        del running_loss1, running_loss2, epoch_loss, epoch_loss2

这里是使用的模型:

class ConvLSTMCell(nn.Module):
    def __init__(self, input_channels, hidden_channels, kernel_size):
        super(ConvLSTMCell, self).__init__()

        # assert hidden_channels % 2 == 0

        self.input_channels = input_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        # self.num_features = 4

        self.padding = 1

        self.Wxi = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
        self.Whi = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
        self.Wxf = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
        self.Whf = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
        self.Wxc = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
        self.Whc = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
        self.Wxo = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
        self.Who = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)

        self.Wci = None
        self.Wcf = None
        self.Wco = None

    def forward(self, x, h, c): ## Equation (3) dans Convolutional LSTM Network: A Machine Learning Approach for Precipitation Nowcasting
        ci = torch.sigmoid(self.Wxi(x) + self.Whi(h) + c * self.Wci)
        cf = torch.sigmoid(self.Wxf(x) + self.Whf(h) + c * self.Wcf)
        cc = cf * c + ci * torch.tanh(self.Wxc(x) + self.Whc(h)) ###gt= tanh(cc)
        co = torch.sigmoid(self.Wxo(x) + self.Who(h) + cc * self.Wco) ##channel out = hidden channel
        ch = co * torch.tanh(cc)
        return ch, cc #short memory, long memory

    def init_hidden(self, batch_size, hidden, shape):
        if self.Wci is None:
            self.Wci = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
            self.Wcf = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
            self.Wco = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
        else:
            assert shape[0] == self.Wci.size()[2], 'Input Height Mismatched!'
            assert shape[1] == self.Wci.size()[3], 'Input Width Mismatched!'
        return (autograd.Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda(),
                autograd.Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda())


class fully_convLSTM(nn.Module):
    def __init__(self):
        super(fully_convLSTM, self).__init__()
        layers = []
        self.hidden_list = [1,32,32,1]#,32,64,32,
        for k in range(len(self.hidden_list)-1):   # Define blocks of [ConvLSTM,BatchNorm,Relu]
            name_conv = "self.convLSTM" +str(k)
            cell_conv = ConvLSTMCell(self.hidden_list[k],self.hidden_list[k+1],3)
            setattr(self, name_conv, cell_conv)
            name_batchnorm = "self.batchnorm"+str(k)
            batchnorm=nn.BatchNorm2d(self.hidden_list[k+1])
            setattr(self, name_batchnorm, batchnorm)
            name_relu =" self.relu"+str(k)
            relu=nn.ReLU()
            setattr(self, name_relu, relu)
        self.sigmoid = nn.Sigmoid()
    
        self.internal_state=[]
        
    def initialize_hidden(self):  
        for k in range(len(self.hidden_list)-1):   
            name_conv = "self.convLSTM" +str(k)             
            (h,c) = getattr(self,name_conv).init_hidden(config.batch_size, self.hidden_list[k+1],(256,256))
            self.internal_state.append((h,c))          
        self.internal_state_new=[]
    def update_hidden(self):
        for i, hidden in enumerate(self.internal_state_new):
            self.internal_state[i] = (hidden[0].detach(), hidden[1].detach())
        self.internal_state_new = []        
    def forward(self, input):
        x = input
        for k in range(len(self.hidden_list)-1):
            name_conv = "self.convLSTM" +str(k)
            name_batchnorm = "self.batchnorm"+str(k)
            name_relu =" self.relu"+str(k)
            x, c = getattr(self,name_conv)(x, self.internal_state[k][1], self.internal_state[k][0]) 
            self.internal_state_new.append((x.detach(),c.detach()))
            x = getattr(self,name_batchnorm)(x)
            if k!= len(self.hidden_list)-2:
                x = getattr(self,name_relu)(x)
            else :
                x = self.sigmoid(x)
        return x

所以我的问题是,我的代码中的什么导致训练阶段内存积累?

关于训练代码的一些快速说明:

  • torch.Variable 已弃用,因为至少有 8 个次要版本(参见 here),请勿使用它
  • gc.collect() 没有意义,PyTorch 自己做垃圾收集器
  • 不要为每个批次使用 torch.cuda.empty_cache(),因为 PyTorch 会保留一些 GPU 内存(不会将其还给 OS),因此它不必为每个批次分配它再次。它会使你的代码变慢,根本不要使用这个函数,PyTorch 会处理这个。
  • 不要垃圾邮件随机内存清理,这很可能不是错误所在

型号

是的,可能是这种情况(尽管很难阅读该模型的代码)。

注意 self.internal_state listself.internal_state_new list

  • 每次调用 model.initialize_hidden() 时,都会将一组新的张量添加到此列表中(据我所知从未清理过)
  • self.internal_state_new好像在update_hidden被清理了,也许self.internal_state应该也是?

本质上,请查看您模型的 self.internal_state 属性,根据我所见,该列表会无限增长。到处用 zeros 初始化很奇怪,可能没有必要这样做(例如 PyTorch 的 RNN 默认用 zeros 初始化,这可能是相似的)。