PyTorch gradients have different shape for CUDA and CPU
I'm dealing with a weird issue where the gradients after the backward pass have different shapes depending on whether CUDA or the CPU is used. The model is fairly simple:
import torch
import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        self.relu3 = nn.ReLU()
        self.relu4 = nn.ReLU()

    def forward(self, x):
        x = self.pool1(self.relu1(self.conv1(x)))
        x = self.pool2(self.relu2(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = self.relu3(self.fc1(x))
        x = self.relu4(self.fc2(x))
        x = self.fc3(x)
        return x
The input tensor has shape (1, 3, 32, 32). The relevant code is below; the generate_gradients method is the important part:
class VanillaBackprop():
    """
    Produces gradients generated with vanilla back propagation from the image
    """
    def __init__(self, model):
        self.model = model
        self.gradients = None
        # Put model in evaluation mode
        self.model.eval()
        # Hook the first layer to get the gradient
        self.hook_layers()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def hook_layers(self):
        def hook_function(module, grad_in, grad_out):
            self.gradients = grad_in[0]
        # Register hook to the first layer
        try:
            first_layer = list(self.model.features._modules.items())[0][1]
        except:
            first_layer = list(self.model._modules.items())[0][1]
        first_layer.register_backward_hook(hook_function)

    def generate_gradients(self, input_image, target_class):
        # Forward
        model_output = self.model(input_image.to(self.device))
        # Zero grads
        self.model.zero_grad()
        # Target for backprop
        one_hot_output = torch.FloatTensor(1, model_output.size()[-1]).zero_()
        one_hot_output[0][target_class] = 1
        # Backward pass
        model_output.backward(gradient=one_hot_output.to(self.device))
        # Convert PyTorch tensor to numpy array
        gradients_as_arr = self.gradients.data.cpu().numpy()[0]
        return gradients_as_arr
On the CPU, self.gradients has shape (1, 3, 32, 32), while on CUDA it has shape (1, 6, 28, 28). How is that possible, and how can I fix it? Any help is much appreciated.
The issue seems to stem from the register_backward_hook() function. As pointed out on the PyTorch forums:
You might want to double check the register_backward_hook() doc. But it is known to be kind of broken at the moment and can have this behavior. I would recommend you use autograd.grad() for this though. That will make it simpler than backward + access to the .grad field.
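For reference, the autograd.grad() route the forum post recommends would look roughly like the sketch below. This is my own illustration, not code from the original post; the helper name is made up, and it assumes the input already lives on the same device as the model:

import torch

def input_gradient_via_autograd(model, input_image, target_class):
    # Hypothetical helper showing the suggested autograd.grad() approach.
    input_image = input_image.clone().detach().requires_grad_(True)
    output = model(input_image)
    # Gradient of the target-class score with respect to the input image
    grad, = torch.autograd.grad(output[0, target_class], input_image)
    return grad  # same shape as input_image, e.g. (1, 3, 32, 32)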
However, I opted to use register_hook() on the input tensor instead of register_backward_hook() (rather than the suggested autograd.grad()), which also seems to work:
class VanillaBackprop():
    """
    Produces gradients generated with vanilla back propagation from the image
    """
    def __init__(self, model):
        self.model = model
        self.gradients = None
        # Put model in evaluation mode
        self.model.eval()

    def hook_input(self, input_tensor):
        # Hook the input tensor to capture its gradient
        def hook_function(grad_in):
            self.gradients = grad_in
        input_tensor.register_hook(hook_function)

    def generate_gradients(self, input_image, target_class):
        # Register input hook (input_image must require gradients)
        self.hook_input(input_image)
        # Forward
        model_output = self.model(input_image)
        # Zero grads
        self.model.zero_grad()
        # Target for backprop, on the same device as the model
        device = next(self.model.parameters()).device
        one_hot_output = torch.FloatTensor(1, model_output.size()[-1]).zero_()
        one_hot_output[0][target_class] = 1
        one_hot_output = one_hot_output.to(device)
        # Backward pass
        model_output.backward(gradient=one_hot_output)
        # Convert the gradient to a numpy array; [0] drops the batch dimension
        gradients_as_arr = self.gradients.data.cpu().numpy()[0]
        return gradients_as_arr
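One caveat with this version (my own note, not part of the original answer): register_hook() can only be attached to a tensor that requires gradients, so the input has to be created accordingly. A hypothetical usage sketch with the CIFAR-sized input from the question:

model = Net()
device = next(model.parameters()).device
# The input must require gradients, otherwise register_hook() raises an error
input_image = torch.randn(1, 3, 32, 32, device=device, requires_grad=True)

vbp = VanillaBackprop(model)
grads = vbp.generate_gradients(input_image, target_class=3)
print(grads.shape)  # (3, 32, 32) on both CPU and CUDA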