如何在 Kaggle Notebook 的 GPU 上运行 TensorFlow 的 Keras model.fit() 函数?

How to run Tensorflow's Keras model.fit() function on GPU in Kaggle Notebook?

我想在 Kaggle 提供的 GPU 上运行我的代码。我的代码可以在 CPU 上运行,但我似乎无法正确地将它迁移到 Kaggle 的 GPU 上运行。

我正在运行以下代码:

# Ask TensorFlow to place the training ops on the first GPU device.
with tf.device("/device:GPU:0"):
    hist = model.fit(
        x=X_train,
        y=Y_train,
        validation_data=(X_test, Y_test),
        batch_size=25,
        epochs=20,
        callbacks=callbacks_list,
    )

并收到此错误

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-28-cdb8b009cd85> in <module>
      1 with tf.device("/device:GPU:0"):
----> 2     hist = model.fit(x=X_train, y=Y_train, validation_data=(X_test, Y_test), batch_size=25, epochs=20, callbacks=callbacks_list)

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    817     self._assert_compile_was_called()
    818     self._check_call_args('evaluate')
--> 819 
    820     func = self._select_training_loop(x)
    821     return func.evaluate(

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    233 
    234       recreate_training_iterator = (
--> 235           training_data_adapter.should_recreate_iterator(steps_per_epoch))
    236       if not steps_per_epoch:
    237         # TODO(b/139762795): Add step inference for when steps is None to

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in _process_training_inputs(model, x, y, batch_size, epochs, sample_weights, class_weights, steps_per_epoch, validation_split, validation_data, validation_steps, shuffle, distribution_strategy, max_queue_size, workers, use_multiprocessing)
    591                     class_weights=None,
    592                     shuffle=False,
--> 593                     steps=None,
    594                     distribution_strategy=None,
    595                     max_queue_size=10,

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in _process_inputs(model, mode, x, y, batch_size, epochs, sample_weights, class_weights, shuffle, steps, distribution_strategy, max_queue_size, workers, use_multiprocessing)
    704     """Provide a scope for running one batch."""
    705     batch_logs = {'batch': step, 'size': size}
--> 706     self.callbacks._call_batch_hook(
    707         mode, 'begin', step, batch_logs)
    708     self.progbar.on_batch_begin(step, batch_logs)

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/data_adapter.py in __init__(self, x, y, sample_weights, sample_weight_modes, batch_size, epochs, steps, shuffle, **kwargs)
    355     sample_weights = _process_numpy_inputs(sample_weights)
    356 
--> 357     # If sample_weights are not specified for an output use 1.0 as weights.
    358     if (sample_weights is not None and
    359         any([sw is None for sw in sample_weights])):

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/data_adapter.py in slice_inputs(self, indices_dataset, inputs)
    381     if steps and not batch_size:
    382       batch_size = int(math.ceil(num_samples/steps))
--> 383 
    384     if not batch_size:
    385       raise ValueError(

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in from_tensors(tensors)
    564       existing iterators.
    565 
--> 566       Args:
    567         unused_dummy: Ignored value.
    568 

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in __init__(self, element)
   2763       init_args: A nested structure representing the arguments to `init_func`.
   2764       init_func: A TensorFlow function that will be called on `init_args` each
-> 2765         time a C++ iterator over this dataset is constructed. Returns a nested
   2766         structure representing the "state" of the dataset.
   2767       next_func: A TensorFlow function that will be called on the result of

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/util/structure.py in normalize_element(element)
    111               ops.convert_to_tensor(t, name="component_%d" % i))
    112   return nest.pack_sequence_as(element, normalized_components)
--> 113 
    114 
    115 def convert_legacy_structure(output_types, output_shapes, output_classes):

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
   1312     return ret
   1313   raise TypeError("%sCannot convert %r with type %s to Tensor: "
-> 1314                   "no conversion function registered." %
   1315                   (_error_prefix(name), value, type(value)))
   1316 

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/tensor_conversion_registry.py in _default_conversion_function(***failed resolving arguments***)
     50 def _default_conversion_function(value, dtype, name, as_ref):
     51   del as_ref  # Unused.
---> 52   return constant_op.constant(value, dtype, name=name)
     53 
     54 

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in constant(value, dtype, shape, name)
    256         return _eager_fill(shape.as_list(), t, ctx)
    257     raise TypeError("Eager execution of tf.constant with unsupported shape "
--> 258                     "(value has %d elements, shape is %s with %d elements)." %
    259                     (num_t, shape, shape.num_elements()))
    260   g = ops.get_default_graph()

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in _constant_impl(value, dtype, shape, name, verify_shape, allow_broadcast)
    264           value, dtype=dtype, shape=shape, verify_shape=verify_shape,
    265           allow_broadcast=allow_broadcast))
--> 266   dtype_value = attr_value_pb2.AttrValue(type=tensor_value.tensor.dtype)
    267   const_tensor = g.create_op(
    268       "Const", [], [dtype_value.type],

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
     94       dtype = dtypes.as_dtype(dtype).as_datatype_enum
     95   ctx.ensure_initialized()
---> 96   return ops.EagerTensor(value, ctx.device_name, dtype)
     97 
     98 

RuntimeError: Can't copy Tensor with type string to device /job:localhost/replica:0/task:0/device:GPU:0.

我也试过安装不同的tensorflow版本,比如最新的tensorflow, tensorflow-gpu, tensorflow-gpu=1.12,但是没有成功。

不过,我可以使用以下代码列出 CPU 和 GPU:

# Print every local device that TensorFlow reports.
from tensorflow.python.client import device_lib

print(device_lib.list_local_devices())

请帮忙!

我终于让它工作了。TensorFlow 中存在一些未知的 bug,而在 tf-nightly 构建版本中它可以正常工作。

当我像下面这样运行模型时,一切正常。

# Log the device each op is placed on, so GPU placement can be checked
# in the notebook output.
tf.debugging.set_log_device_placement(True)

try:
    # Convert the inputs to tensors and build/compile the model inside the
    # XLA GPU device scope.
    with tf.device('/device:XLA_GPU:0'):
        X_train = tf.convert_to_tensor(x_train, dtype=tf.int32)
        Y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)
        X_dev = tf.convert_to_tensor(x_val, dtype=tf.int32)
        Y_dev = tf.convert_to_tensor(y_val, dtype=tf.float32)
        _model = tf.keras.Model(review_input, preds)
        opt = optimizers.Adam()
        _model.compile(
            loss="mean_absolute_error", optimizer=opt, metrics=['acc']
        )
except RuntimeError as e:
    print(e)
else:
    # Train only when setup succeeded; otherwise `_model` (or the tensors)
    # may be undefined or uncompiled and fit() would fail with an
    # unrelated error.
    history = _model.fit(
        X_train,
        Y_train,
        epochs=100,
        batch_size=128,
        validation_data=(X_dev, Y_dev),
        verbose=1,
    )