PyTorch RuntimeError : Gradients are not CUDA tensors
PyTorch RuntimeError : Gradients are not CUDA tensors
在对字符执行 seq to seq 并馈送到 LSTM 并使用注意力解码单词时,我收到以下错误。前向传播很好,但在计算 loss.backward() 时出现以下错误。
RuntimeError: Gradients aren't CUDA tensors
我的train()函数如下。
def train(input_batch, input_batch_length, target_batch, target_batch_length, batch_size):
# Zero gradients of both optimizers
encoderchar_optimizer.zero_grad()
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()
encoder_input = Variable(torch.FloatTensor(len(input_batch), batch_size, 500))
for ix , w in enumerate(input_batch):
w = w.contiguous().view(15, batch_size)
reshaped_input_length = [x[ix] for x in input_batch_length] # [15 ,.. 30 times] * 128
if USE_CUDA:
w = w.cuda()
#reshaped_input_length = Variable(torch.LongTensor(reshaped_input_length)).cuda()
hidden_all , output = encoderchar(w, reshaped_input_length)
encoder_input[ix] = output.transpose(0,1).contiguous().view(batch_size, -1)
if USE_CUDA:
encoder_input = encoder_input.cuda()
temporary_target_batch_length = [15] * batch_size
encoder_hidden_all, encoder_output = encoder(encoder_input, target_batch_length)
decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))
decoder_hidden = encoder_output
max_target_length = max(temporary_target_batch_length)
all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, decoder.output_size))
# Move new Variables to CUDA
if USE_CUDA:
decoder_input = decoder_input.cuda()
all_decoder_outputs = all_decoder_outputs.cuda()
target_batch = target_batch.cuda()
# Run through decoder one time step at a time
for t in range(max_target_length):
decoder_output, decoder_hidden, decoder_attn = decoder(
decoder_input, decoder_hidden, encoder_hidden_all
)
all_decoder_outputs[t] = decoder_output
decoder_input = target_batch[t] # Next input is current target
if USE_CUDA:
decoder_input = decoder_input.cuda()
# Loss calculation and backpropagation
loss = masked_cross_entropy(
all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
target_batch.transpose(0, 1).contiguous(), # -> batch x seq
target_batch_length
)
loss.backward()
# Clip gradient norms
ecc = torch.nn.utils.clip_grad_norm(encoderchar.parameters(), clip)
ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
# Update parameters with optimizers
encoderchar_optimizer.step()
encoder_optimizer.step()
decoder_optimizer.step()
return loss.data[0], ec, dc
这里有完整的堆栈跟踪。
RuntimeError Traceback (most recent call last)
<ipython-input-10-9778e12ded02> in <module>()
11 data_target_batch_index= Variable(torch.LongTensor(data_target_batch_index)).transpose(0,1)
12 # Send the data for training
---> 13 loss, ar1, ar2 = train(data_input_batch_index, data_input_batch_length, data_target_batch_index, data_target_batch_length, batch_size)
14
15 # Keep track of loss
<ipython-input-8-9c71c385f8cd> in train(input_batch, input_batch_length, target_batch, target_batch_length, batch_size)
54 target_batch_length
55 )
---> 56 loss.backward()
57
58 # Clip gradient norms
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/autograd/variable.py in backward(self, gradient, retain_variables)
144 'or with gradient w.r.t. the variable')
145 gradient = self.data.new().resize_as_(self.data).fill_(1)
--> 146 self._execution_engine.run_backward((self,), (gradient,), retain_variables)
147
148 def register_hook(self, hook):
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/autograd/function.py in _do_backward(self, gradients, retain_variables)
207 def _do_backward(self, gradients, retain_variables):
208 self.retain_variables = retain_variables
--> 209 result = super(NestedIOFunction, self)._do_backward(gradients, retain_variables)
210 if not retain_variables:
211 del self._nested_output
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/autograd/function.py in backward(self, *gradients)
215 def backward(self, *gradients):
216 nested_gradients = _unflatten(gradients, self._nested_output)
--> 217 result = self.backward_extended(*nested_gradients)
218 return tuple(_iter_None_tensors(result))
219
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in backward_extended(self, grad_output, grad_hy)
314 grad_hy,
315 grad_input,
--> 316 grad_hx)
317
318 if any(self.needs_input_grad[1:]):
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_input, grad_hx)
371 hidden_size, dcy.size()))
372 if not dhy.is_cuda or not dy.is_cuda or (dcy is not None and not dcy.is_cuda):
--> 373 raise RuntimeError('Gradients aren\'t CUDA tensors')
374
375 check_error(cudnn.lib.cudnnRNNBackwardData(
RuntimeError: Gradients aren't CUDA tensors
关于我做错的原因有什么建议吗?
确保所有继承 nn.Module
的对象也调用它们的 .cuda()
。确保在将任何张量传递给他们之前先打电话。 (基本上在训练之前)
例如,(我猜你的 encoder
和 decoder
就是这样的对象),在调用 train()
.
之前执行此操作
encoder = encoder.cuda()
decoder = decoder.cuda()
这确保模型的所有参数都在 cuda 内存中初始化。
编辑
一般来说,每当你遇到这种错误时,
RuntimeError: Gradients aren't CUDA tensors
在某个地方,(从您的模型创建,到定义输入,再到最终将输出提供给损失函数)您错过了在 GPU 内存中指定一个 Variable
对象。您将完成模型中的每一步,验证所有 Variable
个对象是否在 GPU 内存中。
此外,您不必在输出上调用 .cuda()
。鉴于输入在 gpu 的内存中,所有操作也发生在 gpu 的内存中,您的输出也是如此。
在对字符执行 seq to seq 并馈送到 LSTM 并使用注意力解码单词时,我收到以下错误。前向传播很好,但在计算 loss.backward() 时出现以下错误。
RuntimeError: Gradients aren't CUDA tensors
我的train()函数如下。
def train(input_batch, input_batch_length, target_batch, target_batch_length, batch_size):
# Zero gradients of both optimizers
encoderchar_optimizer.zero_grad()
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()
encoder_input = Variable(torch.FloatTensor(len(input_batch), batch_size, 500))
for ix , w in enumerate(input_batch):
w = w.contiguous().view(15, batch_size)
reshaped_input_length = [x[ix] for x in input_batch_length] # [15 ,.. 30 times] * 128
if USE_CUDA:
w = w.cuda()
#reshaped_input_length = Variable(torch.LongTensor(reshaped_input_length)).cuda()
hidden_all , output = encoderchar(w, reshaped_input_length)
encoder_input[ix] = output.transpose(0,1).contiguous().view(batch_size, -1)
if USE_CUDA:
encoder_input = encoder_input.cuda()
temporary_target_batch_length = [15] * batch_size
encoder_hidden_all, encoder_output = encoder(encoder_input, target_batch_length)
decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))
decoder_hidden = encoder_output
max_target_length = max(temporary_target_batch_length)
all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, decoder.output_size))
# Move new Variables to CUDA
if USE_CUDA:
decoder_input = decoder_input.cuda()
all_decoder_outputs = all_decoder_outputs.cuda()
target_batch = target_batch.cuda()
# Run through decoder one time step at a time
for t in range(max_target_length):
decoder_output, decoder_hidden, decoder_attn = decoder(
decoder_input, decoder_hidden, encoder_hidden_all
)
all_decoder_outputs[t] = decoder_output
decoder_input = target_batch[t] # Next input is current target
if USE_CUDA:
decoder_input = decoder_input.cuda()
# Loss calculation and backpropagation
loss = masked_cross_entropy(
all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
target_batch.transpose(0, 1).contiguous(), # -> batch x seq
target_batch_length
)
loss.backward()
# Clip gradient norms
ecc = torch.nn.utils.clip_grad_norm(encoderchar.parameters(), clip)
ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
# Update parameters with optimizers
encoderchar_optimizer.step()
encoder_optimizer.step()
decoder_optimizer.step()
return loss.data[0], ec, dc
这里有完整的堆栈跟踪。
RuntimeError Traceback (most recent call last)
<ipython-input-10-9778e12ded02> in <module>()
11 data_target_batch_index= Variable(torch.LongTensor(data_target_batch_index)).transpose(0,1)
12 # Send the data for training
---> 13 loss, ar1, ar2 = train(data_input_batch_index, data_input_batch_length, data_target_batch_index, data_target_batch_length, batch_size)
14
15 # Keep track of loss
<ipython-input-8-9c71c385f8cd> in train(input_batch, input_batch_length, target_batch, target_batch_length, batch_size)
54 target_batch_length
55 )
---> 56 loss.backward()
57
58 # Clip gradient norms
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/autograd/variable.py in backward(self, gradient, retain_variables)
144 'or with gradient w.r.t. the variable')
145 gradient = self.data.new().resize_as_(self.data).fill_(1)
--> 146 self._execution_engine.run_backward((self,), (gradient,), retain_variables)
147
148 def register_hook(self, hook):
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/autograd/function.py in _do_backward(self, gradients, retain_variables)
207 def _do_backward(self, gradients, retain_variables):
208 self.retain_variables = retain_variables
--> 209 result = super(NestedIOFunction, self)._do_backward(gradients, retain_variables)
210 if not retain_variables:
211 del self._nested_output
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/autograd/function.py in backward(self, *gradients)
215 def backward(self, *gradients):
216 nested_gradients = _unflatten(gradients, self._nested_output)
--> 217 result = self.backward_extended(*nested_gradients)
218 return tuple(_iter_None_tensors(result))
219
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in backward_extended(self, grad_output, grad_hy)
314 grad_hy,
315 grad_input,
--> 316 grad_hx)
317
318 if any(self.needs_input_grad[1:]):
/home/ubuntu/anaconda3/envs/tensorflow/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_input, grad_hx)
371 hidden_size, dcy.size()))
372 if not dhy.is_cuda or not dy.is_cuda or (dcy is not None and not dcy.is_cuda):
--> 373 raise RuntimeError('Gradients aren\'t CUDA tensors')
374
375 check_error(cudnn.lib.cudnnRNNBackwardData(
RuntimeError: Gradients aren't CUDA tensors
关于我做错的原因有什么建议吗?
确保所有继承 nn.Module
的对象也调用它们的 .cuda()
。确保在将任何张量传递给他们之前先打电话。 (基本上在训练之前)
例如,(我猜你的 encoder
和 decoder
就是这样的对象),在调用 train()
.
encoder = encoder.cuda()
decoder = decoder.cuda()
这确保模型的所有参数都在 cuda 内存中初始化。
编辑
一般来说,每当你遇到这种错误时,
RuntimeError: Gradients aren't CUDA tensors
在某个地方,(从您的模型创建,到定义输入,再到最终将输出提供给损失函数)您错过了在 GPU 内存中指定一个 Variable
对象。您将完成模型中的每一步,验证所有 Variable
个对象是否在 GPU 内存中。
此外,您不必在输出上调用 .cuda()
。鉴于输入在 gpu 的内存中,所有操作也发生在 gpu 的内存中,您的输出也是如此。