Implementing Early Stopping in Pytorch without Torchsample

As a PyTorch newbie (coming from TensorFlow), I'm not sure how to implement early stopping. From what I've found, PyTorch has no native way to do this. I also came across torchsample, but for some reason I can't install it in my conda environment. Is there a simple way to apply early stopping without it? Here is my current setup:

# Imports assumed by the code below
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, r2_score

class RegressionDataset(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
    
    def __len__(self):
        return len(self.X_data)

train_dataset = RegressionDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
val_dataset = RegressionDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).float())
test_dataset = RegressionDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())

# Model Params
EPOCHS = 100
BATCH_SIZE = 1000
LEARNING_RATE = 0.001
NUM_FEATURES = np.shape(X_test)[1]

# Initialize Dataloader
train_loader = DataLoader(dataset = train_dataset, batch_size=BATCH_SIZE, shuffle = True)
val_loader = DataLoader(dataset = val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(dataset = test_dataset, batch_size=BATCH_SIZE)

# Define Neural Network Architecture
class MultipleRegression(nn.Module):
    def __init__(self, num_features):
        super(MultipleRegression, self).__init__()
        
        # Define architecture
        self.layer_1 = nn.Linear(num_features, 16)
        self.layer_2 = nn.Linear(16, 32)
        self.layer_3 = nn.Linear(32, 25)
        self.layer_4 = nn.Linear(25, 20)
        self.layer_5 = nn.Linear(20, 16)
        self.layer_out = nn.Linear(16, 1)
        
        self.relu = nn.ReLU() # ReLU applied to all hidden layers
        
        # Initialize weights and biases
        nn.init.xavier_uniform_(self.layer_1.weight)
        nn.init.zeros_(self.layer_1.bias)
        nn.init.xavier_uniform_(self.layer_2.weight)
        nn.init.zeros_(self.layer_2.bias)
        nn.init.xavier_uniform_(self.layer_3.weight)
        nn.init.zeros_(self.layer_3.bias)
        nn.init.xavier_uniform_(self.layer_4.weight)
        nn.init.zeros_(self.layer_4.bias)
        nn.init.xavier_uniform_(self.layer_5.weight)
        nn.init.zeros_(self.layer_5.bias)
        nn.init.xavier_uniform_(self.layer_out.weight)
        nn.init.zeros_(self.layer_out.bias)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.relu(self.layer_2(x))
        x = self.relu(self.layer_3(x))
        x = self.relu(self.layer_4(x))
        x = self.relu(self.layer_5(x))
        x = self.layer_out(x)
        return x
    
    def predict(self, test_inputs):
        x = self.relu(self.layer_1(test_inputs))
        x = self.relu(self.layer_2(x))
        x = self.relu(self.layer_3(x))
        x = self.relu(self.layer_4(x))
        x = self.relu(self.layer_5(x))
        x = self.layer_out(x)
        return x

# Check for GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

model = MultipleRegression(NUM_FEATURES)
model.to(device)
print(model)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)

# define dictionary to store loss/epochs for training and validation
loss_stats = {
    "train": [],
    "val": []
    }

# begin training
print("Begin Training")
for e in tqdm(range(1, EPOCHS+1)):
    # Training
    train_epoch_loss = 0
    model.train()

    for X_train_batch, y_train_batch in train_loader:
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        optimizer.zero_grad()
        
        y_train_pred = model(X_train_batch)
        
        train_loss = criterion(y_train_pred, y_train_batch.unsqueeze(1))
        
        train_loss.backward()
        optimizer.step()
        
        train_epoch_loss += train_loss.item()
        
    # Validation (run once per epoch, after all training batches)
    val_epoch_loss = 0
    model.eval()
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)

            y_val_pred = model(X_val_batch)

            val_loss = criterion(y_val_pred, y_val_batch.unsqueeze(1))

            val_epoch_loss += val_loss.item()
    loss_stats["train"].append(train_epoch_loss/len(train_loader))
    loss_stats["val"].append(val_epoch_loss/len(val_loader))
    print(f"Epoch {e}: \ Train loss: {train_epoch_loss/len(train_loader):.5f} \ Val loss: {val_epoch_loss/len(val_loader):.5f}")

# Visualize loss and accuracy
train_val_loss_df = pd.DataFrame.from_dict(loss_stats).reset_index().melt(id_vars=["index"]).rename(columns = {"index":"epochs"})
plt.figure()
sns.lineplot(data = train_val_loss_df, x = "epochs", y = "value", hue = "variable").set_title("Train-Val Loss/Epoch")

# Test model
y_pred_list = []
with torch.no_grad(): 
    model.eval()
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_pred_list.append(y_test_pred.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

y_pred_list = [item for sublist in y_pred_list for item in sublist]
y_pred_list = np.array(y_pred_list)

mse = mean_squared_error(y_test, y_pred_list)
r_square = r2_score(y_test, y_pred_list)
print("Mean Squared Error :", mse)
print("R^2 :", r_square)

The basic way to do this is to keep track of the best validation loss obtained so far.

Before the epoch loop, initialize a variable best_loss = float('inf') (you could also track something else, such as the best loss within each epoch).

Then, after each validation pass, do:

if val_loss < best_loss:
    best_loss = val_loss
    # At this point, also save a snapshot of the current model
    torch.save(model, 'my_model_best_loss.pth')

Then, if best_loss has not improved significantly after a certain number of epochs, or if val_loss only keeps getting worse, break out of the loop and end training there; a patience-based sketch of this is shown below.
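
A minimal patience-based sketch of that idea, reusing the names from the training loop above (the patience value of 10 is just an illustrative choice, not from the original post):

# Before the epoch loop
best_loss = float('inf')
patience = 10            # hypothetical: epochs to wait without improvement
epochs_no_improve = 0

for e in tqdm(range(1, EPOCHS + 1)):
    # ... training and validation passes as in the loop above ...
    mean_val_loss = val_epoch_loss / len(val_loader)

    if mean_val_loss < best_loss:
        best_loss = mean_val_loss
        epochs_no_improve = 0
        torch.save(model, 'my_model_best_loss.pth')  # snapshot the best model so far
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Stopping early at epoch {e}: no improvement for {patience} epochs")
            break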

To implement algorithms like early stopping (and training loops in general), you may find it easier to try PyTorch Lightning (no affiliation, but it is much easier than hand-rolling everything yourself).
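
For reference, a minimal sketch using Lightning's built-in callback, assuming your model has been wrapped in a LightningModule (called lightning_model here) whose validation_step logs a "val_loss" metric:

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# Stop when the logged "val_loss" has not decreased for 10 epochs (patience chosen for illustration)
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=10)

trainer = Trainer(max_epochs=EPOCHS, callbacks=[early_stop])
trainer.fit(lightning_model, train_loader, val_loader)  # lightning_model is a hypothetical LightningModule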