使用 TPU 模式时如何从 Google Colaboratory 保存 Tensorflow Checkpoint 文件?
How to save a Tensorflow Checkpoint file from Google Colaboratory in when using TPU mode?
当我使用 saver = tf.train.Saver()
和 save_path = saver.save(session, "checkpointsFolder/checkpoint.ckpt")
我收到 UnimplementedError (see above for traceback): File system scheme '[local]' not implemented
错误
这是完整的错误
---------------------------------------------------------------------------
UnimplementedError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1333 try:
-> 1334 return fn(*args)
1335 except errors.OpError as e:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1318 return self._call_tf_sessionrun(
-> 1319 options, feed_dict, fetch_list, target_list, run_metadata)
1320
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1406 self._session, options, feed_dict, fetch_list, target_list,
-> 1407 run_metadata)
1408
UnimplementedError: File system scheme '[local]' not implemented (file: 'checkpointsBook2Vec5Inputs')
[[{{node save/SaveV2}} = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_INT32, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:tpu_worker/replica:0/task:0/device:CPU:0"](_recv_save/Const_0, save/SaveV2/tensor_names, save/SaveV2/shape_and_slices, embeddings, embeddings/Shampoo, embeddings/Shampoo_1, embeddings/Shampoo_2, epochCount, softmax_biases, softmax_weights, softmax_weights/Shampoo, softmax_weights/Shampoo_1, softmax_weights/Shampoo_2)]]
During handling of the above exception, another exception occurred:
UnimplementedError Traceback (most recent call last)
<ipython-input-22-ca87cd5e5739> in <module>()
48 print('recEpoch_indexA is', recEpoch_indexA)
49
---> 50 save_path = saver.save(session, "checkpointsBook2Vec5Inputs/Research2VecCS4.ckpt") #Save checkpoint
51 print( 'epochCount.eval() is ', epochCount.eval() )
52
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py in save(self, sess, save_path, global_step, latest_filename, meta_graph_suffix, write_meta_graph, write_state, strip_default_attrs)
1439 model_checkpoint_path = sess.run(
1440 self.saver_def.save_tensor_name,
-> 1441 {self.saver_def.filename_tensor_name: checkpoint_file})
1442
1443 model_checkpoint_path = compat.as_str(model_checkpoint_path)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
927 try:
928 result = self._run(None, fetches, feed_dict, options_ptr,
--> 929 run_metadata_ptr)
930 if run_metadata:
931 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1150 if final_fetches or final_targets or (handle and feed_dict_tensor):
1151 results = self._do_run(handle, final_targets, final_fetches,
-> 1152 feed_dict_tensor, options, run_metadata)
1153 else:
1154 results = []
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1326 if handle is None:
1327 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1328 run_metadata)
1329 else:
1330 return self._do_call(_prun_fn, handle, feeds, fetches)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1346 pass
1347 message = error_interpolation.interpolate(message, self._graph)
-> 1348 raise type(e)(node_def, op, message)
1349
1350 def _extend_graph(self):
UnimplementedError: File system scheme '[local]' not implemented (file: 'checkpointsBook2Vec5Inputs')
[[node save/SaveV2 (defined at <ipython-input-15-c14caac2081d>:45) = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_INT32, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:tpu_worker/replica:0/task:0/device:CPU:0"](_recv_save/Const_0, save/SaveV2/tensor_names, save/SaveV2/shape_and_slices, embeddings, embeddings/Shampoo, embeddings/Shampoo_1, embeddings/Shampoo_2, epochCount, softmax_biases, softmax_weights, softmax_weights/Shampoo, softmax_weights/Shampoo_1, softmax_weights/Shampoo_2)]]
Caused by op 'save/SaveV2', defined at:
File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py", line 477, in start
ioloop.IOLoop.instance().start()
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "/usr/local/lib/python3.6/dist-packages/tornado/ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
self._handle_recv()
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
handler(stream, idents, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2718, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2822, in run_ast_nodes
if self.run_code(code, result):
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-15-c14caac2081d>", line 45, in <module>
saver = tf.train.Saver()
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1102, in __init__
self.build()
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1114, in build
self._build(self._filename, build_save=True, build_restore=True)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1151, in _build
build_save=build_save, build_restore=build_restore)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 792, in _build_internal
save_tensor = self._AddSaveOps(filename_tensor, saveables)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 284, in _AddSaveOps
save = self.save_op(filename_tensor, saveables)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 202, in save_op
tensors)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_io_ops.py", line 1690, in save_v2
shape_and_slices=shape_and_slices, tensors=tensors, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
self._traceback = tf_stack.extract_stack()
UnimplementedError (see above for traceback): File system scheme '[local]' not implemented (file: 'checkpointsBook2Vec5Inputs')
[[node save/SaveV2 (defined at <ipython-input-15-c14caac2081d>:45) = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_INT32, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:tpu_worker/replica:0/task:0/device:CPU:0"](_recv_save/Const_0, save/SaveV2/tensor_names, save/SaveV2/shape_and_slices, embeddings, embeddings/Shampoo, embeddings/Shampoo_1, embeddings/Shampoo_2, epochCount, softmax_biases, softmax_weights, softmax_weights/Shampoo, softmax_weights/Shampoo_1, softmax_weights/Shampoo_2)]]
查找此错误,我发现以下内容:
来自Google官方TPU调试指南
https://cloud.google.com/tpu/docs/troubleshooting
Error Message
InvalidArgumentError: Unimplemented: File system scheme '[local]' not
implemented
Details
All input files and the model directory must use a cloud storage
bucket path (gs://bucket-name/...), and this bucket must be accessible
from the TPU server. Note that all data processing and model
checkpointing is performed on the TPU server, not the local machine.
For information on how to properly configure cloud storage for use
with the TPU, see the guide Connecting to Cloud Storage Buckets.
其他人有类似的问题
TPU local Filesystem doesn't exist?
The local filesystem is not available on Cloud TPU's. Model
directories (checkpoints etc) and input data should be stored in
Google Cloud Storage (and prefixed with "gs://").
More details here
但是,我没有 Google 云服务,我只是在使用 Google Colab。有没有办法在 TPU 模式下保存 Tensorflow 检查点?
您可以在 free tier and then create a GCS bucket 下创建一个 Google 云帐户。完成此操作后,您可以在 Colab 中对自己进行身份验证,以通过执行以下操作从 Colab 获得对您的 GCS 存储桶的写入权限:
from google.colab import auth
auth.authenticate_user()
这是一个使用云 TPU 和 GCS 的 sample Colab notebook。
另一种方法是使用 Keras 重写模型并使用 tf.contrib.tpu.keras_to_tpu_model(..) 和 tf.contrib.tpu.TPUDistributionStrategy(...)。这是一个小代码片段:
def get_model():
return keras.Sequential([
keras.layers.Dense(10, input_shape=(4,), activation=tf.nn.relu, name = "Dense_1"),
keras.layers.Dense(10, activation=tf.nn.relu, name = "Dense_2"),
keras.layers.Dense(3, activation=None, name = "logits"),
keras.layers.Dense(3, activation=tf.nn.softmax, name = "softmax")
])
dnn_model = get_model()
dnn_model.compile(optimizer=tf.train.AdagradOptimizer(learning_rate=0.1),
loss='sparse_categorical_crossentropy',
metrics=['sparse_categorical_crossentropy'])
tpu_model = tf.contrib.tpu.keras_to_tpu_model(
dnn_model,
strategy=tf.contrib.tpu.TPUDistributionStrategy(
tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)))
# Train the model
tpu_model.fit(
train_x, train_y,
steps_per_epoch = steps_per_epoch,
epochs=epochs,
)
tpu_model.save_weights('./saved_weights.h5', overwrite=True)
当我使用 saver = tf.train.Saver()
和 save_path = saver.save(session, "checkpointsFolder/checkpoint.ckpt")
我收到 UnimplementedError (see above for traceback): File system scheme '[local]' not implemented
错误
这是完整的错误
---------------------------------------------------------------------------
UnimplementedError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1333 try:
-> 1334 return fn(*args)
1335 except errors.OpError as e:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1318 return self._call_tf_sessionrun(
-> 1319 options, feed_dict, fetch_list, target_list, run_metadata)
1320
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1406 self._session, options, feed_dict, fetch_list, target_list,
-> 1407 run_metadata)
1408
UnimplementedError: File system scheme '[local]' not implemented (file: 'checkpointsBook2Vec5Inputs')
[[{{node save/SaveV2}} = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_INT32, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:tpu_worker/replica:0/task:0/device:CPU:0"](_recv_save/Const_0, save/SaveV2/tensor_names, save/SaveV2/shape_and_slices, embeddings, embeddings/Shampoo, embeddings/Shampoo_1, embeddings/Shampoo_2, epochCount, softmax_biases, softmax_weights, softmax_weights/Shampoo, softmax_weights/Shampoo_1, softmax_weights/Shampoo_2)]]
During handling of the above exception, another exception occurred:
UnimplementedError Traceback (most recent call last)
<ipython-input-22-ca87cd5e5739> in <module>()
48 print('recEpoch_indexA is', recEpoch_indexA)
49
---> 50 save_path = saver.save(session, "checkpointsBook2Vec5Inputs/Research2VecCS4.ckpt") #Save checkpoint
51 print( 'epochCount.eval() is ', epochCount.eval() )
52
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py in save(self, sess, save_path, global_step, latest_filename, meta_graph_suffix, write_meta_graph, write_state, strip_default_attrs)
1439 model_checkpoint_path = sess.run(
1440 self.saver_def.save_tensor_name,
-> 1441 {self.saver_def.filename_tensor_name: checkpoint_file})
1442
1443 model_checkpoint_path = compat.as_str(model_checkpoint_path)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
927 try:
928 result = self._run(None, fetches, feed_dict, options_ptr,
--> 929 run_metadata_ptr)
930 if run_metadata:
931 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1150 if final_fetches or final_targets or (handle and feed_dict_tensor):
1151 results = self._do_run(handle, final_targets, final_fetches,
-> 1152 feed_dict_tensor, options, run_metadata)
1153 else:
1154 results = []
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1326 if handle is None:
1327 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1328 run_metadata)
1329 else:
1330 return self._do_call(_prun_fn, handle, feeds, fetches)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1346 pass
1347 message = error_interpolation.interpolate(message, self._graph)
-> 1348 raise type(e)(node_def, op, message)
1349
1350 def _extend_graph(self):
UnimplementedError: File system scheme '[local]' not implemented (file: 'checkpointsBook2Vec5Inputs')
[[node save/SaveV2 (defined at <ipython-input-15-c14caac2081d>:45) = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_INT32, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:tpu_worker/replica:0/task:0/device:CPU:0"](_recv_save/Const_0, save/SaveV2/tensor_names, save/SaveV2/shape_and_slices, embeddings, embeddings/Shampoo, embeddings/Shampoo_1, embeddings/Shampoo_2, epochCount, softmax_biases, softmax_weights, softmax_weights/Shampoo, softmax_weights/Shampoo_1, softmax_weights/Shampoo_2)]]
Caused by op 'save/SaveV2', defined at:
File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py", line 477, in start
ioloop.IOLoop.instance().start()
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "/usr/local/lib/python3.6/dist-packages/tornado/ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
self._handle_recv()
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
handler(stream, idents, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2718, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2822, in run_ast_nodes
if self.run_code(code, result):
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-15-c14caac2081d>", line 45, in <module>
saver = tf.train.Saver()
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1102, in __init__
self.build()
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1114, in build
self._build(self._filename, build_save=True, build_restore=True)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1151, in _build
build_save=build_save, build_restore=build_restore)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 792, in _build_internal
save_tensor = self._AddSaveOps(filename_tensor, saveables)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 284, in _AddSaveOps
save = self.save_op(filename_tensor, saveables)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 202, in save_op
tensors)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_io_ops.py", line 1690, in save_v2
shape_and_slices=shape_and_slices, tensors=tensors, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
self._traceback = tf_stack.extract_stack()
UnimplementedError (see above for traceback): File system scheme '[local]' not implemented (file: 'checkpointsBook2Vec5Inputs')
[[node save/SaveV2 (defined at <ipython-input-15-c14caac2081d>:45) = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_INT32, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:tpu_worker/replica:0/task:0/device:CPU:0"](_recv_save/Const_0, save/SaveV2/tensor_names, save/SaveV2/shape_and_slices, embeddings, embeddings/Shampoo, embeddings/Shampoo_1, embeddings/Shampoo_2, epochCount, softmax_biases, softmax_weights, softmax_weights/Shampoo, softmax_weights/Shampoo_1, softmax_weights/Shampoo_2)]]
查找此错误,我发现以下内容:
来自Google官方TPU调试指南
https://cloud.google.com/tpu/docs/troubleshooting
Error Message
InvalidArgumentError: Unimplemented: File system scheme '[local]' not implemented
Details
All input files and the model directory must use a cloud storage bucket path (gs://bucket-name/...), and this bucket must be accessible from the TPU server. Note that all data processing and model checkpointing is performed on the TPU server, not the local machine. For information on how to properly configure cloud storage for use with the TPU, see the guide Connecting to Cloud Storage Buckets.
其他人有类似的问题
TPU local Filesystem doesn't exist?
The local filesystem is not available on Cloud TPU's. Model directories (checkpoints etc) and input data should be stored in Google Cloud Storage (and prefixed with "gs://").
More details here
但是,我没有 Google 云服务,我只是在使用 Google Colab。有没有办法在 TPU 模式下保存 Tensorflow 检查点?
您可以在 free tier and then create a GCS bucket 下创建一个 Google 云帐户。完成此操作后,您可以在 Colab 中对自己进行身份验证,以通过执行以下操作从 Colab 获得对您的 GCS 存储桶的写入权限:
from google.colab import auth
auth.authenticate_user()
这是一个使用云 TPU 和 GCS 的 sample Colab notebook。
另一种方法是使用 Keras 重写模型并使用 tf.contrib.tpu.keras_to_tpu_model(..) 和 tf.contrib.tpu.TPUDistributionStrategy(...)。这是一个小代码片段:
def get_model():
return keras.Sequential([
keras.layers.Dense(10, input_shape=(4,), activation=tf.nn.relu, name = "Dense_1"),
keras.layers.Dense(10, activation=tf.nn.relu, name = "Dense_2"),
keras.layers.Dense(3, activation=None, name = "logits"),
keras.layers.Dense(3, activation=tf.nn.softmax, name = "softmax")
])
dnn_model = get_model()
dnn_model.compile(optimizer=tf.train.AdagradOptimizer(learning_rate=0.1),
loss='sparse_categorical_crossentropy',
metrics=['sparse_categorical_crossentropy'])
tpu_model = tf.contrib.tpu.keras_to_tpu_model(
dnn_model,
strategy=tf.contrib.tpu.TPUDistributionStrategy(
tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)))
# Train the model
tpu_model.fit(
train_x, train_y,
steps_per_epoch = steps_per_epoch,
epochs=epochs,
)
tpu_model.save_weights('./saved_weights.h5', overwrite=True)