运行 使用 tensorflow 2.0 的 RNN LSTM 模型时出现错误

I'm getting an error while running the RNN LSTM model with tensorflow 2.0

几个月前我安装了tensorflow 2.0。我成功地 运行 CNN、线性回归和其他 keras 模型。我最近在 tensorflow 2.0 RNN with keras tutorials 学习 RNN。我 运行 本教程中的以下代码:

import collections
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf

from tensorflow.keras import layers
batch_size = 64
input_dim = 28
units = 64
output_size = 10

def build_model(allow_cudnn_kernel=True):
  if allow_cudnn_kernel:
    lstm_layer = tf.keras.layers.LSTM(units, input_shape=(None, input_dim))
  else:
    lstm_layer = tf.keras.layers.RNN(
        tf.keras.layers.LSTMCell(units),
        input_shape=(None, input_dim))
  model = tf.keras.models.Sequential([
      lstm_layer,
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.Dense(output_size, activation='softmax')]
  )
  return model

mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
sample, sample_label = x_train[0], y_train[0]
model = build_model(allow_cudnn_kernel=True)

model.compile(loss='sparse_categorical_crossentropy', 
              optimizer='sgd',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          batch_size=batch_size,
          epochs=5)

这是我得到的输出:

Train on 60000 samples, validate on 10000 samples
Epoch 1/5
   64/60000 [..............................] - ETA: 1:17:13
---------------------------------------------------------------------------
UnknownError                              Traceback (most recent call last)
<ipython-input-8-6a1ac7233ae1> in <module>()
     31           validation_data=(x_test, y_test),
     32           batch_size=batch_size,
---> 33           epochs=5)

~\AppData\Roaming\Python\Python35\site-packages\tensorflow_core\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    726         max_queue_size=max_queue_size,
    727         workers=workers,
--> 728         use_multiprocessing=use_multiprocessing)
    729 
    730   def evaluate(self,

~\AppData\Roaming\Python\Python35\site-packages\tensorflow_core\python\keras\engine\training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
    322                 mode=ModeKeys.TRAIN,
    323                 training_context=training_context,
--> 324                 total_epochs=epochs)
    325             cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
    326 

~\AppData\Roaming\Python\Python35\site-packages\tensorflow_core\python\keras\engine\training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
    121         step=step, mode=mode, size=current_batch_size) as batch_logs:
    122       try:
--> 123         batch_outs = execution_function(iterator)
    124       except (StopIteration, errors.OutOfRangeError):
    125         # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?

~\AppData\Roaming\Python\Python35\site-packages\tensorflow_core\python\keras\engine\training_v2_utils.py in execution_function(input_fn)
     84     # `numpy` translates Tensors to values in Eager mode.
     85     return nest.map_structure(_non_none_constant_value,
---> 86                               distributed_function(input_fn))
     87 
     88   return execution_function

~\AppData\Roaming\Python\Python35\site-packages\tensorflow_core\python\eager\def_function.py in __call__(self, *args, **kwds)
    455 
    456     tracing_count = self._get_tracing_count()
--> 457     result = self._call(*args, **kwds)
    458     if tracing_count == self._get_tracing_count():
    459       self._call_counter.called_without_tracing()

~\AppData\Roaming\Python\Python35\site-packages\tensorflow_core\python\eager\def_function.py in _call(self, *args, **kwds)
    518         # Lifting succeeded, so variables are initialized and we can run the
    519         # stateless function.
--> 520         return self._stateless_fn(*args, **kwds)
    521     else:
    522       canon_args, canon_kwds = \

~\AppData\Roaming\Python\Python35\site-packages\tensorflow_core\python\eager\function.py in __call__(self, *args, **kwargs)
   1821     """Calls a graph function specialized to the inputs."""
   1822     graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 1823     return graph_function._filtered_call(args, kwargs)  # pylint: disable=protected-access
   1824 
   1825   @property

~\AppData\Roaming\Python\Python35\site-packages\tensorflow_core\python\eager\function.py in _filtered_call(self, args, kwargs)
   1139          if isinstance(t, (ops.Tensor,
   1140                            resource_variable_ops.BaseResourceVariable))),
-> 1141         self.captured_inputs)
   1142 
   1143   def _call_flat(self, args, captured_inputs, cancellation_manager=None):

~\AppData\Roaming\Python\Python35\site-packages\tensorflow_core\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
   1222     if executing_eagerly:
   1223       flat_outputs = forward_function.call(
-> 1224           ctx, args, cancellation_manager=cancellation_manager)
   1225     else:
   1226       gradient_name = self._delayed_rewrite_functions.register()

~\AppData\Roaming\Python\Python35\site-packages\tensorflow_core\python\eager\function.py in call(self, ctx, args, cancellation_manager)
    509               inputs=args,
    510               attrs=("executor_type", executor_type, "config_proto", config),
--> 511               ctx=ctx)
    512         else:
    513           outputs = execute.execute_with_cancellation(

~\AppData\Roaming\Python\Python35\site-packages\tensorflow_core\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
     65     else:
     66       message = e.message
---> 67     six.raise_from(core._status_to_exception(e.code, message), None)
     68   except TypeError as e:
     69     keras_symbolic_tensors = [

c:\users\gokul adethya\appdata\local\programs\python\python35\lib\site-packages\six.py in raise_from(value, from_value)

UnknownError:  [_Derived_]  Fail to find the dnn implementation.
     [[{{node CudnnRNN}}]]
     [[sequential_1/lstm_1/StatefulPartitionedCall]] [Op:__inference_distributed_function_6815]

Function call stack:
distributed_function -> distributed_function -> distributed_function

我研究了错误并发现 this. I tried setting growth to true, which I didn't actually understand how it works, but I still tried it by inserting code from https://www.tensorflow.org/guide/gpu,它仍然导致相同的错误。

配置:

tensorflow-gpu 版本 -- 2.0.0 CUDA 版本 - v10.0

我终于找到了 problem.The 问题是我的 Cudnn 低于 tensorflow 推荐的版本 >=7.4.1 。当我将 ot 升级到最新发布的版本时,它已修复