如何在 Kaggle Notebook 的 GPU 上运行 Tensorflow 的 Keras model.fit() 函数？

Question

我想在 Kaggle 提供的 GPU 上运行我的代码。代码在 CPU 上可以正常运行，但我猜我没能把它正确地迁移到 Kaggle 的 GPU 上运行。

我运行的是下面这段代码：

with tf.device("/device:GPU:0"):
hist = model.fit(x=X_train, y=Y_train, validation_data=(X_test, Y_test), batch_size=25, epochs=20, callbacks=callbacks_list)

然后收到了如下错误：

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-28-cdb8b009cd85> in <module>
      1 with tf.device("/device:GPU:0"):
----> 2     hist = model.fit(x=X_train, y=Y_train, validation_data=(X_test, Y_test), batch_size=25, epochs=20, callbacks=callbacks_list)

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    817     self._assert_compile_was_called()
    818     self._check_call_args('evaluate')
--> 819 
    820     func = self._select_training_loop(x)
    821     return func.evaluate(

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    233 
    234       recreate_training_iterator = (
--> 235           training_data_adapter.should_recreate_iterator(steps_per_epoch))
    236       if not steps_per_epoch:
    237         # TODO(b/139762795): Add step inference for when steps is None to

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in _process_training_inputs(model, x, y, batch_size, epochs, sample_weights, class_weights, steps_per_epoch, validation_split, validation_data, validation_steps, shuffle, distribution_strategy, max_queue_size, workers, use_multiprocessing)
    591                     class_weights=None,
    592                     shuffle=False,
--> 593                     steps=None,
    594                     distribution_strategy=None,
    595                     max_queue_size=10,

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in _process_inputs(model, mode, x, y, batch_size, epochs, sample_weights, class_weights, shuffle, steps, distribution_strategy, max_queue_size, workers, use_multiprocessing)
    704     """Provide a scope for running one batch."""
    705     batch_logs = {'batch': step, 'size': size}
--> 706     self.callbacks._call_batch_hook(
    707         mode, 'begin', step, batch_logs)
    708     self.progbar.on_batch_begin(step, batch_logs)

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/data_adapter.py in __init__(self, x, y, sample_weights, sample_weight_modes, batch_size, epochs, steps, shuffle, **kwargs)
    355     sample_weights = _process_numpy_inputs(sample_weights)
    356 
--> 357     # If sample_weights are not specified for an output use 1.0 as weights.
    358     if (sample_weights is not None and
    359         any([sw is None for sw in sample_weights])):

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/data_adapter.py in slice_inputs(self, indices_dataset, inputs)
    381     if steps and not batch_size:
    382       batch_size = int(math.ceil(num_samples/steps))
--> 383 
    384     if not batch_size:
    385       raise ValueError(

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in from_tensors(tensors)
    564       existing iterators.
    565 
--> 566       Args:
    567         unused_dummy: Ignored value.
    568 

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in __init__(self, element)
   2763       init_args: A nested structure representing the arguments to `init_func`.
   2764       init_func: A TensorFlow function that will be called on `init_args` each
-> 2765         time a C++ iterator over this dataset is constructed. Returns a nested
   2766         structure representing the "state" of the dataset.
   2767       next_func: A TensorFlow function that will be called on the result of

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/util/structure.py in normalize_element(element)
    111               ops.convert_to_tensor(t, name="component_%d" % i))
    112   return nest.pack_sequence_as(element, normalized_components)
--> 113 
    114 
    115 def convert_legacy_structure(output_types, output_shapes, output_classes):

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
   1312     return ret
   1313   raise TypeError("%sCannot convert %r with type %s to Tensor: "
-> 1314                   "no conversion function registered." %
   1315                   (_error_prefix(name), value, type(value)))
   1316 

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/tensor_conversion_registry.py in _default_conversion_function(***failed resolving arguments***)
     50 def _default_conversion_function(value, dtype, name, as_ref):
     51   del as_ref  # Unused.
---> 52   return constant_op.constant(value, dtype, name=name)
     53 
     54 

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in constant(value, dtype, shape, name)
    256         return _eager_fill(shape.as_list(), t, ctx)
    257     raise TypeError("Eager execution of tf.constant with unsupported shape "
--> 258                     "(value has %d elements, shape is %s with %d elements)." %
    259                     (num_t, shape, shape.num_elements()))
    260   g = ops.get_default_graph()

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in _constant_impl(value, dtype, shape, name, verify_shape, allow_broadcast)
    264           value, dtype=dtype, shape=shape, verify_shape=verify_shape,
    265           allow_broadcast=allow_broadcast))
--> 266   dtype_value = attr_value_pb2.AttrValue(type=tensor_value.tensor.dtype)
    267   const_tensor = g.create_op(
    268       "Const", [], [dtype_value.type],

/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
     94       dtype = dtypes.as_dtype(dtype).as_datatype_enum
     95   ctx.ensure_initialized()
---> 96   return ops.EagerTensor(value, ctx.device_name, dtype)
     97 
     98 

RuntimeError: Can't copy Tensor with type string to device /job:localhost/replica:0/task:0/device:GPU:0.

我也试过安装不同的tensorflow版本，比如最新的tensorflow, tensorflow-gpu, tensorflow-gpu=1.12，但是没有成功。

不过，我可以用下面的代码列出 CPU 和 GPU：

from tensorflow.python.client import device_lib print(device_lib.list_local_devices())

请帮忙！

Answer 1

我终于让它工作了。TensorFlow 中存在某个未知的 bug，而在 tf-nightly 构建版本中它可以正常工作。

Answer 2

按下面这种方式运行模型时，对我来说是可行的。

tf.debugging.set_log_device_placement(True)

try:
    with tf.device('/device:XLA_GPU:0'):
        X_train = tf.convert_to_tensor(x_train, dtype=tf.int32)
        Y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)
        X_dev = tf.convert_to_tensor(x_val, dtype=tf.int32)
        Y_dev = tf.convert_to_tensor(y_val, dtype=tf.float32)
        _model = tf.keras.Model(review_input, preds)
        opt = optimizers.Adam()
        _model.compile(loss="mean_absolute_error", optimizer=opt, metrics=['acc'])
except RuntimeError as e:
  print(e)
history=_model.fit(X_train, Y_train, epochs=100, batch_size=128, validation_data=(X_dev, Y_dev), verbose=1)

如何在 Kaggle Notebook 的 GPU 上运行 Tensorflow 的 Keras model.fit() 函数？

How to run Tensorflow's Keras model.fit() function on GPU in Kaggle Notebook?

gpu

keras

kaggle

如何在 Kaggle Notebook 的 GPU 上运行 Tensorflow 的 Keras model.fit() 函数？

How to run Tensorflow's Keras model.fit() function on GPU in Kaggle Notebook?

gpu

keras

kaggle

如何在 Kaggle Notebook 的 GPU 上运行 Tensorflow 的 Keras model.fit() 函数？