如何在 Kaggle Notebook 的 GPU 上 运行 Tensorflow 的 Keras model.fit() 函数?
How to run Tensorflow's Keras model.fit() function on GPU in Kaggle Notebook?
我想 运行 我的代码在 Kaggle 提供的 GPU 上。我可以 运行 我的代码在 CPU 上,但我猜无法将它正确迁移到 Kaggle GPU 上的 运行。
在运行宁此
with tf.device("/device:GPU:0"):
hist = model.fit(x=X_train, y=Y_train, validation_data=(X_test, Y_test), batch_size=25, epochs=20, callbacks=callbacks_list)
并收到此错误
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-28-cdb8b009cd85> in <module>
1 with tf.device("/device:GPU:0"):
----> 2 hist = model.fit(x=X_train, y=Y_train, validation_data=(X_test, Y_test), batch_size=25, epochs=20, callbacks=callbacks_list)
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
817 self._assert_compile_was_called()
818 self._check_call_args('evaluate')
--> 819
820 func = self._select_training_loop(x)
821 return func.evaluate(
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
233
234 recreate_training_iterator = (
--> 235 training_data_adapter.should_recreate_iterator(steps_per_epoch))
236 if not steps_per_epoch:
237 # TODO(b/139762795): Add step inference for when steps is None to
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in _process_training_inputs(model, x, y, batch_size, epochs, sample_weights, class_weights, steps_per_epoch, validation_split, validation_data, validation_steps, shuffle, distribution_strategy, max_queue_size, workers, use_multiprocessing)
591 class_weights=None,
592 shuffle=False,
--> 593 steps=None,
594 distribution_strategy=None,
595 max_queue_size=10,
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in _process_inputs(model, mode, x, y, batch_size, epochs, sample_weights, class_weights, shuffle, steps, distribution_strategy, max_queue_size, workers, use_multiprocessing)
704 """Provide a scope for running one batch."""
705 batch_logs = {'batch': step, 'size': size}
--> 706 self.callbacks._call_batch_hook(
707 mode, 'begin', step, batch_logs)
708 self.progbar.on_batch_begin(step, batch_logs)
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/data_adapter.py in __init__(self, x, y, sample_weights, sample_weight_modes, batch_size, epochs, steps, shuffle, **kwargs)
355 sample_weights = _process_numpy_inputs(sample_weights)
356
--> 357 # If sample_weights are not specified for an output use 1.0 as weights.
358 if (sample_weights is not None and
359 any([sw is None for sw in sample_weights])):
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/data_adapter.py in slice_inputs(self, indices_dataset, inputs)
381 if steps and not batch_size:
382 batch_size = int(math.ceil(num_samples/steps))
--> 383
384 if not batch_size:
385 raise ValueError(
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in from_tensors(tensors)
564 existing iterators.
565
--> 566 Args:
567 unused_dummy: Ignored value.
568
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in __init__(self, element)
2763 init_args: A nested structure representing the arguments to `init_func`.
2764 init_func: A TensorFlow function that will be called on `init_args` each
-> 2765 time a C++ iterator over this dataset is constructed. Returns a nested
2766 structure representing the "state" of the dataset.
2767 next_func: A TensorFlow function that will be called on the result of
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/util/structure.py in normalize_element(element)
111 ops.convert_to_tensor(t, name="component_%d" % i))
112 return nest.pack_sequence_as(element, normalized_components)
--> 113
114
115 def convert_legacy_structure(output_types, output_shapes, output_classes):
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
1312 return ret
1313 raise TypeError("%sCannot convert %r with type %s to Tensor: "
-> 1314 "no conversion function registered." %
1315 (_error_prefix(name), value, type(value)))
1316
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/tensor_conversion_registry.py in _default_conversion_function(***failed resolving arguments***)
50 def _default_conversion_function(value, dtype, name, as_ref):
51 del as_ref # Unused.
---> 52 return constant_op.constant(value, dtype, name=name)
53
54
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in constant(value, dtype, shape, name)
256 return _eager_fill(shape.as_list(), t, ctx)
257 raise TypeError("Eager execution of tf.constant with unsupported shape "
--> 258 "(value has %d elements, shape is %s with %d elements)." %
259 (num_t, shape, shape.num_elements()))
260 g = ops.get_default_graph()
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in _constant_impl(value, dtype, shape, name, verify_shape, allow_broadcast)
264 value, dtype=dtype, shape=shape, verify_shape=verify_shape,
265 allow_broadcast=allow_broadcast))
--> 266 dtype_value = attr_value_pb2.AttrValue(type=tensor_value.tensor.dtype)
267 const_tensor = g.create_op(
268 "Const", [], [dtype_value.type],
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
94 dtype = dtypes.as_dtype(dtype).as_datatype_enum
95 ctx.ensure_initialized()
---> 96 return ops.EagerTensor(value, ctx.device_name, dtype)
97
98
RuntimeError: Can't copy Tensor with type string to device /job:localhost/replica:0/task:0/device:GPU:0.
我也试过安装不同的tensorflow版本,比如最新的tensorflow, tensorflow-gpu, tensorflow-gpu=1.12,但是没有成功。
虽然我可以使用
列出 CPUs 和 GPU
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
请帮忙!
我终于让它工作了。张量流中存在一些未知错误。它在 tf-nightly build 中正常工作。
当我 运行 这样的模型时,它对我来说效果很好。
tf.debugging.set_log_device_placement(True)
try:
with tf.device('/device:XLA_GPU:0'):
X_train = tf.convert_to_tensor(x_train, dtype=tf.int32)
Y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)
X_dev = tf.convert_to_tensor(x_val, dtype=tf.int32)
Y_dev = tf.convert_to_tensor(y_val, dtype=tf.float32)
_model = tf.keras.Model(review_input, preds)
opt = optimizers.Adam()
_model.compile(loss="mean_absolute_error", optimizer=opt, metrics=['acc'])
except RuntimeError as e:
print(e)
history=_model.fit(X_train, Y_train, epochs=100, batch_size=128, validation_data=(X_dev, Y_dev), verbose=1)
我想 运行 我的代码在 Kaggle 提供的 GPU 上。我可以 运行 我的代码在 CPU 上,但我猜无法将它正确迁移到 Kaggle GPU 上的 运行。
在运行宁此
with tf.device("/device:GPU:0"):
hist = model.fit(x=X_train, y=Y_train, validation_data=(X_test, Y_test), batch_size=25, epochs=20, callbacks=callbacks_list)
并收到此错误
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-28-cdb8b009cd85> in <module>
1 with tf.device("/device:GPU:0"):
----> 2 hist = model.fit(x=X_train, y=Y_train, validation_data=(X_test, Y_test), batch_size=25, epochs=20, callbacks=callbacks_list)
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
817 self._assert_compile_was_called()
818 self._check_call_args('evaluate')
--> 819
820 func = self._select_training_loop(x)
821 return func.evaluate(
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
233
234 recreate_training_iterator = (
--> 235 training_data_adapter.should_recreate_iterator(steps_per_epoch))
236 if not steps_per_epoch:
237 # TODO(b/139762795): Add step inference for when steps is None to
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in _process_training_inputs(model, x, y, batch_size, epochs, sample_weights, class_weights, steps_per_epoch, validation_split, validation_data, validation_steps, shuffle, distribution_strategy, max_queue_size, workers, use_multiprocessing)
591 class_weights=None,
592 shuffle=False,
--> 593 steps=None,
594 distribution_strategy=None,
595 max_queue_size=10,
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in _process_inputs(model, mode, x, y, batch_size, epochs, sample_weights, class_weights, shuffle, steps, distribution_strategy, max_queue_size, workers, use_multiprocessing)
704 """Provide a scope for running one batch."""
705 batch_logs = {'batch': step, 'size': size}
--> 706 self.callbacks._call_batch_hook(
707 mode, 'begin', step, batch_logs)
708 self.progbar.on_batch_begin(step, batch_logs)
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/data_adapter.py in __init__(self, x, y, sample_weights, sample_weight_modes, batch_size, epochs, steps, shuffle, **kwargs)
355 sample_weights = _process_numpy_inputs(sample_weights)
356
--> 357 # If sample_weights are not specified for an output use 1.0 as weights.
358 if (sample_weights is not None and
359 any([sw is None for sw in sample_weights])):
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/data_adapter.py in slice_inputs(self, indices_dataset, inputs)
381 if steps and not batch_size:
382 batch_size = int(math.ceil(num_samples/steps))
--> 383
384 if not batch_size:
385 raise ValueError(
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in from_tensors(tensors)
564 existing iterators.
565
--> 566 Args:
567 unused_dummy: Ignored value.
568
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in __init__(self, element)
2763 init_args: A nested structure representing the arguments to `init_func`.
2764 init_func: A TensorFlow function that will be called on `init_args` each
-> 2765 time a C++ iterator over this dataset is constructed. Returns a nested
2766 structure representing the "state" of the dataset.
2767 next_func: A TensorFlow function that will be called on the result of
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/util/structure.py in normalize_element(element)
111 ops.convert_to_tensor(t, name="component_%d" % i))
112 return nest.pack_sequence_as(element, normalized_components)
--> 113
114
115 def convert_legacy_structure(output_types, output_shapes, output_classes):
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
1312 return ret
1313 raise TypeError("%sCannot convert %r with type %s to Tensor: "
-> 1314 "no conversion function registered." %
1315 (_error_prefix(name), value, type(value)))
1316
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/tensor_conversion_registry.py in _default_conversion_function(***failed resolving arguments***)
50 def _default_conversion_function(value, dtype, name, as_ref):
51 del as_ref # Unused.
---> 52 return constant_op.constant(value, dtype, name=name)
53
54
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in constant(value, dtype, shape, name)
256 return _eager_fill(shape.as_list(), t, ctx)
257 raise TypeError("Eager execution of tf.constant with unsupported shape "
--> 258 "(value has %d elements, shape is %s with %d elements)." %
259 (num_t, shape, shape.num_elements()))
260 g = ops.get_default_graph()
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in _constant_impl(value, dtype, shape, name, verify_shape, allow_broadcast)
264 value, dtype=dtype, shape=shape, verify_shape=verify_shape,
265 allow_broadcast=allow_broadcast))
--> 266 dtype_value = attr_value_pb2.AttrValue(type=tensor_value.tensor.dtype)
267 const_tensor = g.create_op(
268 "Const", [], [dtype_value.type],
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
94 dtype = dtypes.as_dtype(dtype).as_datatype_enum
95 ctx.ensure_initialized()
---> 96 return ops.EagerTensor(value, ctx.device_name, dtype)
97
98
RuntimeError: Can't copy Tensor with type string to device /job:localhost/replica:0/task:0/device:GPU:0.
我也试过安装不同的tensorflow版本,比如最新的tensorflow, tensorflow-gpu, tensorflow-gpu=1.12,但是没有成功。
虽然我可以使用
列出 CPUs 和 GPUfrom tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
请帮忙!
我终于让它工作了。张量流中存在一些未知错误。它在 tf-nightly build 中正常工作。
当我 运行 这样的模型时,它对我来说效果很好。
tf.debugging.set_log_device_placement(True)
try:
with tf.device('/device:XLA_GPU:0'):
X_train = tf.convert_to_tensor(x_train, dtype=tf.int32)
Y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)
X_dev = tf.convert_to_tensor(x_val, dtype=tf.int32)
Y_dev = tf.convert_to_tensor(y_val, dtype=tf.float32)
_model = tf.keras.Model(review_input, preds)
opt = optimizers.Adam()
_model.compile(loss="mean_absolute_error", optimizer=opt, metrics=['acc'])
except RuntimeError as e:
print(e)
history=_model.fit(X_train, Y_train, epochs=100, batch_size=128, validation_data=(X_dev, Y_dev), verbose=1)