PyTorch LSTM 在使用 MSELoss 时损失为 nan
PyTorch LSTM has nan for MSELoss
我的模型是:
class BaselineModel(nn.Module):
    """Stacked LSTM baseline that returns the raw per-step LSTM outputs.

    Args:
        feature_dim: Size of the last dimension of the input.
        hidden_size: Number of features in each LSTM hidden/cell state.
        num_layers: Number of stacked LSTM layers.
        batch_size: Accepted for backward compatibility; the model does not
            use it.
    """

    def __init__(self, feature_dim=5, hidden_size=5, num_layers=2, batch_size=32):
        super(BaselineModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=feature_dim,
                            hidden_size=hidden_size, num_layers=num_layers)

    def forward(self, x, hidden):
        """Run the LSTM over ``x`` starting from the state ``hidden``.

        Args:
            x: Input tensor. ``nn.LSTM`` is built with its default
                ``batch_first=False``, so the expected layout is
                ``(seq_len, batch, feature_dim)``.
            hidden: ``(h_0, c_0)`` tuple, e.g. from :meth:`init_hidden`.

        Returns:
            ``(lstm_out, (h_n, c_n))`` exactly as produced by ``nn.LSTM``.
        """
        lstm_out, hidden = self.lstm(x, hidden)
        return lstm_out, hidden

    def init_hidden(self, batch_size):
        """Return a zero-filled ``(hidden, cell)`` initial state.

        Args:
            batch_size: Batch dimension of the sequences that will be fed to
                :meth:`forward`.

        Returns:
            Tuple of two distinct tensors, each of shape
            ``(num_layers, batch_size, hidden_size)``, on the same device and
            with the same dtype as the model parameters.
        """
        # Tensor.new(*sizes) only allocates memory without initializing it, so
        # the state could contain arbitrary values (including nan) and poison
        # the loss on the very first step. new_zeros gives a defined start
        # state while still following the parameters' dtype and device.
        reference = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_size)
        return (reference.new_zeros(shape), reference.new_zeros(shape))
训练代码如下:
# Training script: fits BaselineModel on train_set with Adam and a summed
# MSE loss, clipping the gradient norm to 1.0 before every optimizer step.
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=BATCH_SIZE, shuffle=True, **params)
model = BaselineModel(batch_size=BATCH_SIZE)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
loss_fn = torch.nn.MSELoss(reduction='sum')

for epoch in range(250):
    for i, data in enumerate(train_loader):
        inputs = data[0]
        outputs = data[1]
        print('inputs', inputs.size())

        # nn.LSTM (default batch_first=False) treats dim 1 of a 3-D input as
        # the batch dimension; size the fresh state from the actual batch
        # instead of hard-coding 13 so other batch sizes work as well.
        hidden = model.init_hidden(inputs.size(1))

        model.zero_grad()
        pred, hidden = model(inputs, hidden)
        loss = loss_fn(pred, outputs)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        print('Epoch: ', epoch, '\ti: ', i, '\tLoss: ', loss)
我已经设置了梯度裁剪,这似乎是推荐的解决方案。但即使在第一步,我得到的也是:
Epoch: 0 i: 0 Loss: tensor(nan, grad_fn=<MseLossBackward>)
我怀疑你的问题与你的 outputs(即 data[1])有关(如果你能展示 train_set 的示例会有所帮助)。
运行下面的代码不会得到 nan,不过我在调用 loss_fn(pred, outputs) 之前手动指定了 outputs 的形状:
class BaselineModel(nn.Module):
    """Stacked LSTM baseline that returns the raw per-step LSTM outputs.

    Args:
        feature_dim: Size of the last dimension of the input.
        hidden_size: Number of features in each LSTM hidden/cell state.
        num_layers: Number of stacked LSTM layers.
        batch_size: Accepted for backward compatibility; the model does not
            use it.
    """

    def __init__(self, feature_dim=5, hidden_size=5, num_layers=2, batch_size=32):
        super(BaselineModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=feature_dim,
                            hidden_size=hidden_size, num_layers=num_layers)

    def forward(self, x, hidden):
        """Run the LSTM over ``x`` starting from the state ``hidden``.

        Args:
            x: Input tensor. ``nn.LSTM`` is built with its default
                ``batch_first=False``, so the expected layout is
                ``(seq_len, batch, feature_dim)``.
            hidden: ``(h_0, c_0)`` tuple, e.g. from :meth:`init_hidden`.

        Returns:
            ``(lstm_out, (h_n, c_n))`` exactly as produced by ``nn.LSTM``.
        """
        lstm_out, hidden = self.lstm(x, hidden)
        return lstm_out, hidden

    def init_hidden(self, batch_size):
        """Return a zero-filled ``(hidden, cell)`` initial state.

        Args:
            batch_size: Batch dimension of the sequences that will be fed to
                :meth:`forward`.

        Returns:
            Tuple of two distinct tensors, each of shape
            ``(num_layers, batch_size, hidden_size)``, on the same device and
            with the same dtype as the model parameters.
        """
        # Tensor.new(*sizes) only allocates memory without initializing it, so
        # the state could contain arbitrary values (including nan). new_zeros
        # gives a defined start state while still following the parameters'
        # dtype and device.
        reference = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_size)
        return (reference.new_zeros(shape), reference.new_zeros(shape))
# Single optimization step on synthetic data: random inputs, all-zero targets.
model = BaselineModel(batch_size=32)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Fresh recurrent state for a batch of 10 sequences, then clear old gradients.
hidden = model.init_hidden(10)
model.zero_grad()

sample = torch.randn(2, 10, 5)
pred, hidden = model(sample, hidden)
pred.size()  # torch.Size([2, 10, 5])

# Targets are created explicitly with the same shape as the prediction.
outputs = torch.zeros(2, 10, 5)
loss = loss_fn(pred, outputs)
loss

# Backward pass, gradient-norm clipping at 1.0, then the parameter update.
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
print(loss)
请注意,出现 nan 的一个常见原因是训练过程中的数值不稳定;但在那种情况下,通常在发散出现之前的最初几步仍能得到正常的损失值,而这里显然不是这种情况。
我的模型是:
class BaselineModel(nn.Module):
    """Stacked LSTM baseline that returns the raw per-step LSTM outputs.

    Args:
        feature_dim: Size of the last dimension of the input.
        hidden_size: Number of features in each LSTM hidden/cell state.
        num_layers: Number of stacked LSTM layers.
        batch_size: Accepted for backward compatibility; the model does not
            use it.
    """

    def __init__(self, feature_dim=5, hidden_size=5, num_layers=2, batch_size=32):
        super(BaselineModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=feature_dim,
                            hidden_size=hidden_size, num_layers=num_layers)

    def forward(self, x, hidden):
        """Run the LSTM over ``x`` starting from the state ``hidden``.

        Args:
            x: Input tensor. ``nn.LSTM`` is built with its default
                ``batch_first=False``, so the expected layout is
                ``(seq_len, batch, feature_dim)``.
            hidden: ``(h_0, c_0)`` tuple, e.g. from :meth:`init_hidden`.

        Returns:
            ``(lstm_out, (h_n, c_n))`` exactly as produced by ``nn.LSTM``.
        """
        lstm_out, hidden = self.lstm(x, hidden)
        return lstm_out, hidden

    def init_hidden(self, batch_size):
        """Return a zero-filled ``(hidden, cell)`` initial state.

        Args:
            batch_size: Batch dimension of the sequences that will be fed to
                :meth:`forward`.

        Returns:
            Tuple of two distinct tensors, each of shape
            ``(num_layers, batch_size, hidden_size)``, on the same device and
            with the same dtype as the model parameters.
        """
        # Tensor.new(*sizes) only allocates memory without initializing it, so
        # the state could contain arbitrary values (including nan) and poison
        # the loss on the very first step. new_zeros gives a defined start
        # state while still following the parameters' dtype and device.
        reference = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_size)
        return (reference.new_zeros(shape), reference.new_zeros(shape))
训练代码如下:
# Training script: fits BaselineModel on train_set with Adam and a summed
# MSE loss, clipping the gradient norm to 1.0 before every optimizer step.
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=BATCH_SIZE, shuffle=True, **params)
model = BaselineModel(batch_size=BATCH_SIZE)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
loss_fn = torch.nn.MSELoss(reduction='sum')

for epoch in range(250):
    for i, data in enumerate(train_loader):
        inputs = data[0]
        outputs = data[1]
        print('inputs', inputs.size())

        # nn.LSTM (default batch_first=False) treats dim 1 of a 3-D input as
        # the batch dimension; size the fresh state from the actual batch
        # instead of hard-coding 13 so other batch sizes work as well.
        hidden = model.init_hidden(inputs.size(1))

        model.zero_grad()
        pred, hidden = model(inputs, hidden)
        loss = loss_fn(pred, outputs)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        print('Epoch: ', epoch, '\ti: ', i, '\tLoss: ', loss)
我已经设置了梯度裁剪,这似乎是推荐的解决方案。但即使在第一步,我得到的也是:
Epoch: 0 i: 0 Loss: tensor(nan, grad_fn=<MseLossBackward>)
我怀疑你的问题与你的 outputs(即 data[1])有关(如果你能展示 train_set 的示例会有所帮助)。
运行下面的代码不会得到 nan,不过我在调用 loss_fn(pred, outputs) 之前手动指定了 outputs 的形状:
class BaselineModel(nn.Module):
    """Stacked LSTM baseline that returns the raw per-step LSTM outputs.

    Args:
        feature_dim: Size of the last dimension of the input.
        hidden_size: Number of features in each LSTM hidden/cell state.
        num_layers: Number of stacked LSTM layers.
        batch_size: Accepted for backward compatibility; the model does not
            use it.
    """

    def __init__(self, feature_dim=5, hidden_size=5, num_layers=2, batch_size=32):
        super(BaselineModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=feature_dim,
                            hidden_size=hidden_size, num_layers=num_layers)

    def forward(self, x, hidden):
        """Run the LSTM over ``x`` starting from the state ``hidden``.

        Args:
            x: Input tensor. ``nn.LSTM`` is built with its default
                ``batch_first=False``, so the expected layout is
                ``(seq_len, batch, feature_dim)``.
            hidden: ``(h_0, c_0)`` tuple, e.g. from :meth:`init_hidden`.

        Returns:
            ``(lstm_out, (h_n, c_n))`` exactly as produced by ``nn.LSTM``.
        """
        lstm_out, hidden = self.lstm(x, hidden)
        return lstm_out, hidden

    def init_hidden(self, batch_size):
        """Return a zero-filled ``(hidden, cell)`` initial state.

        Args:
            batch_size: Batch dimension of the sequences that will be fed to
                :meth:`forward`.

        Returns:
            Tuple of two distinct tensors, each of shape
            ``(num_layers, batch_size, hidden_size)``, on the same device and
            with the same dtype as the model parameters.
        """
        # Tensor.new(*sizes) only allocates memory without initializing it, so
        # the state could contain arbitrary values (including nan). new_zeros
        # gives a defined start state while still following the parameters'
        # dtype and device.
        reference = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_size)
        return (reference.new_zeros(shape), reference.new_zeros(shape))
# Single optimization step on synthetic data: random inputs, all-zero targets.
model = BaselineModel(batch_size=32)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Fresh recurrent state for a batch of 10 sequences, then clear old gradients.
hidden = model.init_hidden(10)
model.zero_grad()

sample = torch.randn(2, 10, 5)
pred, hidden = model(sample, hidden)
pred.size()  # torch.Size([2, 10, 5])

# Targets are created explicitly with the same shape as the prediction.
outputs = torch.zeros(2, 10, 5)
loss = loss_fn(pred, outputs)
loss

# Backward pass, gradient-norm clipping at 1.0, then the parameter update.
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
print(loss)
请注意,出现 nan 的一个常见原因是训练过程中的数值不稳定;但在那种情况下,通常在发散出现之前的最初几步仍能得到正常的损失值,而这里显然不是这种情况。