GRU loss decreases to 0.9 but no further (PyTorch)
The code I am using to experiment with the GRU:
import torch
import torch.nn as nn
import torch.nn.functional as F

class N(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(5, 2)
        self.layers = 4
        self.gru = nn.GRU(2, 512, self.layers, batch_first=True)
        self.bat = nn.BatchNorm1d(4)
        self.bat1 = nn.BatchNorm1d(4)
        self.bat2 = nn.BatchNorm1d(4)
        self.fc = nn.Linear(512, 100)
        self.fc1 = nn.Linear(100, 100)
        self.fc2 = nn.Linear(100, 5)
        self.s = nn.Softmax(dim=-1)

    def forward(self, x):
        h0 = torch.zeros(self.layers, x.size(0), 512).requires_grad_()
        x = self.embed(x)
        x, hn = self.gru(x, h0)
        x = self.bat(x)
        x = self.fc(x)
        x = nn.functional.relu(x)
        x = self.bat1(x)
        x = self.fc1(x)
        x = nn.functional.relu(x)
        x = self.bat2(x)
        x = self.fc2(x)
        softmaxed = self.s(x)
        return softmaxed

inp = torch.tensor([[4, 3, 2, 1], [2, 3, 4, 1], [4, 1, 2, 3], [1, 2, 3, 4]])
out = torch.tensor([[3, 2, 1, 4], [3, 2, 4, 1], [1, 2, 3, 4], [2, 3, 4, 1]])
k = 0
n = N()
opt = torch.optim.Adam(n.parameters(), lr=0.0001)
while k < 10000:
    print(inp.shape)
    o = n(inp)
    o = o.view(-1, o.size(-1))
    out = out.view(-1)
    loss = nn.functional.cross_entropy(o.view(-1, o.size(-1)), out.view(-1) - 1)
    acc = ((torch.argmax(o, dim=1) == (out - 1)).sum().item() / out.size(0))
    if k == 10000:
        print(torch.argmax(o, dim=1))
        print(out - 1)
        exit()
    print(loss, acc)
    loss.backward()
    opt.step()
    opt.zero_grad()
    k += 1
print(o[0])
Truncated output:
torch.Size([4, 4])
tensor(0.9593, grad_fn=<NllLossBackward>) 0.9375
torch.Size([4, 4])
tensor(0.9593, grad_fn=<NllLossBackward>) 0.9375
tensor([4.8500e-01, 9.7813e-06, 5.1498e-01, 6.2428e-06, 7.5929e-06],
grad_fn=<SelectBackward>)
The loss is 0.9593 and the accuracy reaches 0.9375. Why is the GRU loss this large for such simple input data? Is there something wrong with this code? I use cross_entropy as the loss function and Adam as the optimizer, with a learning rate of 0.001. I have tried several learning rates, but they all give the same final result. I added batch normalization, which sped up training, but the loss and accuracy stay the same. Why doesn't the loss drop to 0.2 or so?
I think it is because you are using the cross-entropy loss function, which in PyTorch combines log-softmax and negative log-likelihood. Since your model already applies a softmax before returning its output, you effectively end up computing the negative log-likelihood of a softmax of a softmax. Try removing the final softmax from your model.
PyTorch cross-entropy loss documentation: https://pytorch.org/docs/stable/nn.functional.html#cross-entropy
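A minimal sketch of the suggested fix, assuming the rest of the training loop stays unchanged: have forward return the raw scores from self.fc2 and let cross_entropy apply the log-softmax itself. The random logits and targets below are only stand-ins to illustrate the equivalence, not your actual data.

import torch
import torch.nn.functional as F

# In N.forward, return the raw scores instead of softmax probabilities:
#     x = self.fc2(x)
#     return x            # drop the final self.s(x)

# F.cross_entropy applies log_softmax + nll_loss internally,
# so the loss computation in the training loop stays the same.
logits = torch.randn(16, 5, requires_grad=True)   # stand-in for o.view(-1, o.size(-1))
targets = torch.randint(0, 5, (16,))              # stand-in for out.view(-1) - 1

loss_ce = F.cross_entropy(logits, targets)
loss_manual = F.nll_loss(F.log_softmax(logits, dim=-1), targets)
print(torch.allclose(loss_ce, loss_manual))       # True: the two losses match

# If class probabilities are needed at inference time,
# apply softmax outside the loss computation:
probs = F.softmax(logits, dim=-1)

This also suggests why the loss plateaus near 0.9: once the model's output has been squashed through a softmax, its entries lie in [0, 1], so even a perfect one-hot prediction fed into cross_entropy gives about -1 + log(e + 4) ≈ 0.90 with 5 classes, which is roughly the floor observed above.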