非序列模型的 Pytorch 梯度问题
Problem with Pytorch gradient of a non-sequential model
我无法复现这个 PyTorch 教程(tutorial)。
他们介绍的模型是:
class RNN(nn.Module):
    """Single-step recurrent cell built from two linear layers.

    On every call the input and the previous hidden state are concatenated
    and fed to two independent linear layers: ``i2h`` produces the next
    hidden state, and ``i2o`` followed by ``LogSoftmax`` produces the output.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers of the cell.

        Args:
            input_size: Number of features in each input row.
            hidden_size: Number of features in the hidden state.
            output_size: Number of entries in each output row.
        """
        super().__init__()
        self.hidden_size = hidden_size
        # Both layers consume the concatenation [input, hidden].
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step of the cell.

        Args:
            input: Tensor concatenated with ``hidden`` along dim 1; expected
                to be ``(batch, input_size)``.
            hidden: Previous hidden state, ``(batch, hidden_size)`` as
                produced by ``initHidden``.

        Returns:
            A tuple ``(output, hidden)``: the log-softmax (over dim 1) of
            ``i2o`` and the new hidden state computed by ``i2h``.
        """
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        # The output is computed from `combined`, not from the new hidden
        # state, so within a single step `i2h` has no influence on `output`.
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self, batch_size):
        """Return an all-zero initial hidden state.

        Args:
            batch_size: Number of rows in the returned tensor.

        Returns:
            A float32 tensor of shape ``(batch_size, hidden_size)`` created
            with ``requires_grad=True``.
        """
        return torch.zeros(
            batch_size,
            self.hidden_size,
            dtype=torch.float32,
            requires_grad=True,
        )
此模型再现了 RNN 单元内部发生的事情。
编码时,我遇到了模型内部梯度的问题。
重现问题的代码如下:
import torch
import torch.nn as nn
# Minimal batch for the reproduction: three 2-feature rows and one class
# index per row.
toy_data_batch = torch.tensor(
    [
        [0.0, 1.0],
        [1.0, 0.0],
        [1.0, 0.0],
    ],
    dtype=torch.float32,
)
toy_label_batch = torch.tensor((2, 0, 3), dtype=torch.int64)
class RNN(nn.Module):
    """Single-step recurrent cell built from two linear layers.

    On every call the input and the previous hidden state are concatenated
    and fed to two independent linear layers: ``i2h`` produces the next
    hidden state, and ``i2o`` followed by ``LogSoftmax`` produces the output.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers of the cell.

        Args:
            input_size: Number of features in each input row.
            hidden_size: Number of features in the hidden state.
            output_size: Number of entries in each output row.
        """
        super().__init__()
        self.hidden_size = hidden_size
        # Both layers consume the concatenation [input, hidden].
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step of the cell.

        Args:
            input: Tensor concatenated with ``hidden`` along dim 1; expected
                to be ``(batch, input_size)``.
            hidden: Previous hidden state, ``(batch, hidden_size)`` as
                produced by ``initHidden``.

        Returns:
            A tuple ``(output, hidden)``: the log-softmax (over dim 1) of
            ``i2o`` and the new hidden state computed by ``i2h``.
        """
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        # The output is computed from `combined`, not from the new hidden
        # state, so within a single step `i2h` has no influence on `output`.
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self, batch_size):
        """Return an all-zero initial hidden state.

        Args:
            batch_size: Number of rows in the returned tensor.

        Returns:
            A float32 tensor of shape ``(batch_size, hidden_size)`` created
            with ``requires_grad=True``.
        """
        return torch.zeros(
            batch_size,
            self.hidden_size,
            dtype=torch.float32,
            requires_grad=True,
        )
# Model initialization
input_size = 2
hidden_size = 2
output_size = 4  # Targets in {0, 1, 2, 3}
batch_size = 3  # 3 data points in the batch
learning_rate = 5e-3
rnn = RNN(input_size, hidden_size, output_size)
hidden = rnn.initHidden(batch_size)  # init hidden layer with zeros
# Negative log likelihood as it is classification
criterion = nn.NLLLoss()
# Forward pass (a single step). The loss below is built from `output` only,
# and `output` is produced by i2o, so i2h never enters the autograd graph
# unless a second step consumes the returned `hidden`.
output, hidden = rnn(toy_data_batch, hidden)
#output, hidden = rnn(toy_data_batch, hidden) ### BUG: if I remove the comment here, It works
# Loss computation
loss = criterion(output, toy_label_batch)
# Backward pass
loss.backward()
print(rnn.i2o.weight.grad)  # This one is fine
print(rnn.i2h.weight.grad)  # This one isn't (has type None)
# This will fail, because of the None gradient
# (kept unguarded on purpose: the loop reproduces the reported AttributeError
# when it reaches the i2h parameters, whose .grad is None)
for weight in rnn.parameters():
    weight.data.add_(weight.grad.data, alpha=-learning_rate)
输出为:
tensor([[-0.1892, 0.0462, 0.0000, 0.0000],
[ 0.1274, 0.1133, 0.0000, 0.0000],
[ 0.1455, -0.2525, 0.0000, 0.0000],
[-0.0837, 0.0930, 0.0000, 0.0000]])
None
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-3-5f02113fddf6> in <module>
54 # This will fail, because of the None gradient
55 for weight in rnn.parameters():
---> 56 weight.data.add_(weight.grad.data, alpha=-learning_rate)
AttributeError: 'NoneType' object has no attribute 'data'
我注意到,如果取消下面第二行的注释(即再执行一次前向传播):
output, hidden = rnn(toy_data_batch, hidden)
#output, hidden = rnn(toy_data_batch, hidden) ### BUG: if I remove the comment here, It works
一切就正常了。在我看来,问题出在隐藏状态的初始化上。可是我已经把参数 `requires_grad` 设为 True 了,所以不知道还能怎么做。
提前致谢,非常感谢任何帮助。
`self.i2h` 没有梯度,因为在模型的第一步中它没有参与输出的计算。反向传播时,第一步的输出只依赖 `self.i2o`,所以 `self.i2h` 对输出没有影响。然而,当你进入第二步时,模型使用的 `hidden` 是由 `self.i2h` 计算得到的,因此该层就有了可追踪的梯度。
我无法复现这个 PyTorch 教程(tutorial)。
他们介绍的模型是:
class RNN(nn.Module):
    """Single-step recurrent cell built from two linear layers.

    On every call the input and the previous hidden state are concatenated
    and fed to two independent linear layers: ``i2h`` produces the next
    hidden state, and ``i2o`` followed by ``LogSoftmax`` produces the output.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers of the cell.

        Args:
            input_size: Number of features in each input row.
            hidden_size: Number of features in the hidden state.
            output_size: Number of entries in each output row.
        """
        super().__init__()
        self.hidden_size = hidden_size
        # Both layers consume the concatenation [input, hidden].
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step of the cell.

        Args:
            input: Tensor concatenated with ``hidden`` along dim 1; expected
                to be ``(batch, input_size)``.
            hidden: Previous hidden state, ``(batch, hidden_size)`` as
                produced by ``initHidden``.

        Returns:
            A tuple ``(output, hidden)``: the log-softmax (over dim 1) of
            ``i2o`` and the new hidden state computed by ``i2h``.
        """
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        # The output is computed from `combined`, not from the new hidden
        # state, so within a single step `i2h` has no influence on `output`.
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self, batch_size):
        """Return an all-zero initial hidden state.

        Args:
            batch_size: Number of rows in the returned tensor.

        Returns:
            A float32 tensor of shape ``(batch_size, hidden_size)`` created
            with ``requires_grad=True``.
        """
        return torch.zeros(
            batch_size,
            self.hidden_size,
            dtype=torch.float32,
            requires_grad=True,
        )
此模型再现了 RNN 单元内部发生的事情。
编码时,我遇到了模型内部梯度的问题。
重现问题的代码如下:
import torch
import torch.nn as nn
# Minimal batch for the reproduction: three 2-feature rows and one class
# index per row.
toy_data_batch = torch.tensor(
    [
        [0.0, 1.0],
        [1.0, 0.0],
        [1.0, 0.0],
    ],
    dtype=torch.float32,
)
toy_label_batch = torch.tensor((2, 0, 3), dtype=torch.int64)
class RNN(nn.Module):
    """Single-step recurrent cell built from two linear layers.

    On every call the input and the previous hidden state are concatenated
    and fed to two independent linear layers: ``i2h`` produces the next
    hidden state, and ``i2o`` followed by ``LogSoftmax`` produces the output.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers of the cell.

        Args:
            input_size: Number of features in each input row.
            hidden_size: Number of features in the hidden state.
            output_size: Number of entries in each output row.
        """
        super().__init__()
        self.hidden_size = hidden_size
        # Both layers consume the concatenation [input, hidden].
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step of the cell.

        Args:
            input: Tensor concatenated with ``hidden`` along dim 1; expected
                to be ``(batch, input_size)``.
            hidden: Previous hidden state, ``(batch, hidden_size)`` as
                produced by ``initHidden``.

        Returns:
            A tuple ``(output, hidden)``: the log-softmax (over dim 1) of
            ``i2o`` and the new hidden state computed by ``i2h``.
        """
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        # The output is computed from `combined`, not from the new hidden
        # state, so within a single step `i2h` has no influence on `output`.
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self, batch_size):
        """Return an all-zero initial hidden state.

        Args:
            batch_size: Number of rows in the returned tensor.

        Returns:
            A float32 tensor of shape ``(batch_size, hidden_size)`` created
            with ``requires_grad=True``.
        """
        return torch.zeros(
            batch_size,
            self.hidden_size,
            dtype=torch.float32,
            requires_grad=True,
        )
# Model initialization
input_size = 2
hidden_size = 2
output_size = 4  # Targets in {0, 1, 2, 3}
batch_size = 3  # 3 data points in the batch
learning_rate = 5e-3
rnn = RNN(input_size, hidden_size, output_size)
hidden = rnn.initHidden(batch_size)  # init hidden layer with zeros
# Negative log likelihood as it is classification
criterion = nn.NLLLoss()
# Forward pass (a single step). The loss below is built from `output` only,
# and `output` is produced by i2o, so i2h never enters the autograd graph
# unless a second step consumes the returned `hidden`.
output, hidden = rnn(toy_data_batch, hidden)
#output, hidden = rnn(toy_data_batch, hidden) ### BUG: if I remove the comment here, It works
# Loss computation
loss = criterion(output, toy_label_batch)
# Backward pass
loss.backward()
print(rnn.i2o.weight.grad)  # This one is fine
print(rnn.i2h.weight.grad)  # This one isn't (has type None)
# This will fail, because of the None gradient
# (kept unguarded on purpose: the loop reproduces the reported AttributeError
# when it reaches the i2h parameters, whose .grad is None)
for weight in rnn.parameters():
    weight.data.add_(weight.grad.data, alpha=-learning_rate)
输出为:
tensor([[-0.1892, 0.0462, 0.0000, 0.0000],
[ 0.1274, 0.1133, 0.0000, 0.0000],
[ 0.1455, -0.2525, 0.0000, 0.0000],
[-0.0837, 0.0930, 0.0000, 0.0000]])
None
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-3-5f02113fddf6> in <module>
54 # This will fail, because of the None gradient
55 for weight in rnn.parameters():
---> 56 weight.data.add_(weight.grad.data, alpha=-learning_rate)
AttributeError: 'NoneType' object has no attribute 'data'
我注意到,如果取消下面第二行的注释(即再执行一次前向传播):
output, hidden = rnn(toy_data_batch, hidden)
#output, hidden = rnn(toy_data_batch, hidden) ### BUG: if I remove the comment here, It works
一切就正常了。在我看来,问题出在隐藏状态的初始化上。可是我已经把参数 `requires_grad` 设为 True 了,所以不知道还能怎么做。
提前致谢,非常感谢任何帮助。
`self.i2h` 没有梯度,因为在模型的第一步中它没有参与输出的计算。反向传播时,第一步的输出只依赖 `self.i2o`,所以 `self.i2h` 对输出没有影响。然而,当你进入第二步时,模型使用的 `hidden` 是由 `self.i2h` 计算得到的,因此该层就有了可追踪的梯度。