Training loss is increasing in CNN?
I am training my first CNN to solve a multi-class classification problem. I am feeding in images of animals corresponding to one of 182 classes, however I have run into some issues. Firstly, my code appears to get stuck on optimiser.step(); it has been computing for around 30 minutes. Secondly, my training loss is increasing:
EPOCH: 0 BATCH: 1999 LOSS: 1.5790680234357715
EPOCH: 0 BATCH: 3999 LOSS: 2.9340945997834207
If anyone is able to provide some guidance it would be greatly appreciated. Below is my code:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms

#loading data (dataset and get_train_loader are defined elsewhere by the data-loading library I am using)
train_data = dataset.get_subset(
    "train",
    transform=transforms.Compose(
        [transforms.Resize((448, 448)), transforms.ToTensor()]
    ),
)

train_loader = get_train_loader("standard", train_data, batch_size=16)
#defining model
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 3, 1)
        self.conv2 = nn.Conv2d(6, 16, 3, 3)
        self.fc1 = nn.Linear(37*37*16, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 182)

    def forward(self, X):
        X = F.relu(self.conv1(X))
        X = F.max_pool2d(X, 2, 2)
        X = F.relu(self.conv2(X))
        X = F.max_pool2d(X, 2, 2)
        X = torch.flatten(X, 1)
        X = F.relu(self.fc1(X))
        X = F.relu(self.fc2(X))
        X = self.fc3(X)
        return F.log_softmax(X, dim=1)

modell = ConvNet()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modell.parameters(), lr=0.001)
import time
start_time = time.time()

#VARIABLES (TRACKER)
epochs = 2
train_losses = []
test_losses = []
train_correct = []
test_correct = []

# FOR LOOP EPOCH
for i in range(epochs):
    trn_corr = 0
    tst_corr = 0
    running_loss = 0.0

    #TRAIN
    for b, (X_train, Y_train, meta) in enumerate(train_loader):
        b += 1  #batch starts at 1

        #zero parameter gradients
        optimizer.zero_grad()

        # pass training to model as float (later compute loss)
        output = modell(X_train.float())

        #Calculate the loss of outputs with respect to ground truth values
        loss = criterion(output, Y_train)

        #Backpropagate the loss through the network
        loss.backward()

        #perform parameter update based on the current gradient
        optimizer.step()

        predicted = torch.max(output.data, 1)[1]
        batch_corr = (predicted == Y_train).sum()  # True (1) or False (0)
        trn_corr += batch_corr
        running_loss += loss.item()

        if b % 2000 == 1999:
            print(f"EPOCH: {i} BATCH: {b} LOSS: {running_loss/2000}")
            running_loss = 0.0

    train_losses.append(loss)
    train_correct.append(trn_corr)
As for the loss, it could be the model. The model has some room for improvement: only 2 convolutional layers is not enough for your data, and it only goes up to 16 channels. Use more convolutional layers with more channels, for example 5 conv layers with 16, 32, 32, 64, 64 channels. Experiment with different numbers of layers and channels to see what works best. Also, a good learning rate for Adam is 3e-4. To make it easier to follow the model's progress, I would also reduce the interval at which it prints the loss. A rough sketch of such an architecture is shown below.
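A minimal sketch of such a network, assuming the 448x448 RGB inputs used in the question; the kernel sizes, pooling layout, and fully-connected sizes here are illustrative choices rather than prescriptions:

import torch
import torch.nn as nn
import torch.nn.functional as F

class DeeperConvNet(nn.Module):
    def __init__(self, num_classes=182):
        super().__init__()
        # 5 conv layers with 16, 32, 32, 64, 64 channels, as suggested above
        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, padding=1)
        self.conv4 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv5 = nn.Conv2d(64, 64, 3, padding=1)
        self.fc1 = nn.Linear(64 * 14 * 14, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        # each block halves the spatial size: 448 -> 224 -> 112 -> 56 -> 28 -> 14
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5):
            x = F.max_pool2d(F.relu(conv(x)), 2)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)  # raw logits; nn.CrossEntropyLoss applies log-softmax itself

modell = DeeperConvNet()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modell.parameters(), lr=3e-4)  # suggested Adam learning rate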
Regarding the data, does each class have enough instances? Is it normalized to between 0 and 1?
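For reference, transforms.ToTensor() already scales pixel values into [0, 1]. A sketch of additionally standardizing the inputs on top of that; the mean/std values are the commonly used ImageNet statistics, included here purely as illustrative numbers, not statistics computed from this dataset:

from torchvision import transforms

# ToTensor() converts HxWxC uint8 images to CxHxW float tensors in [0, 1].
# Normalize() then standardizes each channel with the given mean and std.
train_transform = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])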