tensorflow2.0 中没有为任何变量提供梯度

No gradients provided for any variable in tensorflow2.0

当我尝试使用 tensorflow2.0 根据 TensorFlow 发布的官方指南创建一个转换器时,我遇到了一个问题,当我添加一个完全连接的网络时,似乎这两个分类损失和翻译损失作为一些变量的梯度。

但是一旦我尝试将两个损失相加,所有变量的梯度就会消失。我不知道,我试图用数周的时间来解决这个问题。谁能给我一些建议?

@tf.function(input_signature=train_step_signature)
def train_step(group, inp, tar, label):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]  # sess=tf.compat.v1.Session()
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape(persistent=True) as tape:
        classfication, predictions, _ = transformer(inp, tar_inp,
                                                    True,
                                                    enc_padding_mask,
                                                    combined_mask,
                                                    dec_padding_mask)
        loss = loss_function(tar_real, predictions)
        loss2 = tf.nn.softmax_cross_entropy_with_logits(label, classfication)

    #print(loss,loss2)
    a=tape.gradient(loss,trainsformer.trainable_variable)
    gradients = tape.gradient(loss+loss2, transformer.trainable_variables)

    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    class_loss(loss2)
    train_loss(loss)
    train_accuracy(tar_real, predictions)

下面是我的错误信息

    ValueError                                Traceback (most recent call last)
<ipython-input-2-81054f0385cb> in <module>()
    999     # inp -> portuguese, tar -> english
   1000     for (batch, (group, inp, tar, label)) in enumerate(train_dataset):
-> 1001         train_step(group, inp, tar, label)
   1002         if batch % 50 == 0:
   1003             print(

8 frames
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/def_function.py in __call__(self, *args, **kwds)
    455 
    456     tracing_count = self._get_tracing_count()
--> 457     result = self._call(*args, **kwds)
    458     if tracing_count == self._get_tracing_count():
    459       self._call_counter.called_without_tracing()

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/def_function.py in _call(self, *args, **kwds)
    501       # This is the first call of __call__, so we have to initialize.
    502       initializer_map = object_identity.ObjectIdentityDictionary()
--> 503       self._initialize(args, kwds, add_initializers_to=initializer_map)
    504     finally:
    505       # At this point we know that the initialization is complete (or less

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
    406     self._concrete_stateful_fn = (
    407         self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access
--> 408             *args, **kwds))
    409 
    410     def invalid_creator_scope(*unused_args, **unused_kwds):

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
   1846     if self.input_signature:
   1847       args, kwargs = None, None
-> 1848     graph_function, _, _ = self._maybe_define_function(args, kwargs)
   1849     return graph_function
   1850 

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in _maybe_define_function(self, args, kwargs)
   2148         graph_function = self._function_cache.primary.get(cache_key, None)
   2149         if graph_function is None:
-> 2150           graph_function = self._create_graph_function(args, kwargs)
   2151           self._function_cache.primary[cache_key] = graph_function
   2152         return graph_function, args, kwargs

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
   2039             arg_names=arg_names,
   2040             override_flat_arg_shapes=override_flat_arg_shapes,
-> 2041             capture_by_value=self._capture_by_value),
   2042         self._function_attributes,
   2043         # Tell the ConcreteFunction to clean up its graph once it goes out of

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
    913                                           converted_func)
    914 
--> 915       func_outputs = python_func(*func_args, **func_kwargs)
    916 
    917       # invariant: `func_outputs` contains only Tensors, CompositeTensors,

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/def_function.py in wrapped_fn(*args, **kwds)
    356         # __wrapped__ allows AutoGraph to swap in a converted function. We give
    357         # the function a weak reference to itself to avoid a reference cycle.
--> 358         return weak_wrapped_fn().__wrapped__(*args, **kwds)
    359     weak_wrapped_fn = weakref.ref(wrapped_fn)
    360 

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/func_graph.py in wrapper(*args, **kwargs)
    903           except Exception as e:  # pylint:disable=broad-except
    904             if hasattr(e, "ag_error_metadata"):
--> 905               raise e.ag_error_metadata.to_exception(e)
    906             else:
    907               raise

ValueError: in converted code:

    <ipython-input-1-81054f0385cb>:856 train_step  *
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:427 apply_gradients
        grads_and_vars = _filter_grads(grads_and_vars)
    /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:1025 _filter_grads
        ([v.name for _, v in grads_and_vars],))

    ValueError: No gradients provided for any variable: ['transformer_1/encoder_1/embedding_2/embeddings:0', 'transformer_1/encoder_1/encoder_layer_6/multi_head_attention_18/dense_98/kernel:0', 'transformer_1/encoder_1/encoder_layer_6/multi_head_attention_18/dense_98/bias:0', 'transformer_1/encoder_1/encoder_layer_6/multi_head_attention_18/dense_99/kernel:0', 'transformer_1/encoder_1/encoder_layer_6/multi_head_attention_18/dense_99/bias:0', 'transformer_1/encoder_1/encoder_layer_6/multi_head_attention_18/dense_100/kernel:0', 'transformer_1/encoder_1/encoder_layer_6/multi_head_attention_18/dense_100/bias:0', 'transformer_1/encoder_1/encoder_layer_6/multi_head_attention_18/dense_101/kernel:0', 'transformer_1/encoder_1/encoder_layer_6/multi_head_attention_18/dense_101/bias:0', 'transformer_1/encoder_1/encoder_layer_6/sequential_12/dense_102/kernel:0', 'transformer_1/encoder_1/encoder_layer_6/sequential_12/dense_102/bias:0', 'transformer_1/encoder_1/encoder_layer_6/sequential_12/dense_103/kernel:0', 'transformer_1/encoder_1/encoder_layer_6/sequential_12/dense_103/bias:0', 'transformer_1/encoder_1/encoder_layer_6/layer_normalization_30/gamma:0', 'transformer_1/encoder_1/encoder_layer_6/layer_normalization_30/beta:0', 'transformer_1/encoder_1/encoder_layer_6/layer_normalization_31/gamma:0', 'transformer_1/encoder_1/encoder_layer_6/layer_normalization_31/beta:0', 'transformer_1/encoder_1/encoder_layer_7/multi_head_attention_19/dense_104/kernel:0', 'transformer_1/encoder_1/encoder...

是的,GradientTape 有点烦人。您不能对磁带上下文 (with...) 之外的张量进行 任何操作 ,否则磁带将 "lose track"。您可以通过简单地将添加移动到上下文中来修复它:

with tf.GradientTape(persistent=True) as tape:
    classfication, predictions, _ = transformer(inp, tar_inp,
                                                True,
                                                enc_padding_mask,
                                                combined_mask,
                                                dec_padding_mask)
    loss = loss_function(tar_real, predictions)
    loss2 = tf.nn.softmax_cross_entropy_with_logits(label, classfication)
    added_loss = loss + loss2

#print(loss,loss2)
a=tape.gradient(loss,trainsformer.trainable_variable)
gradients = tape.gradient(added_loss, transformer.trainable_variables)