使用 GPU 时未找到 Tensorflow _tpu_ops.so

Tensorflow _tpu_ops.so not found while using GPU

我将这个 BERT NER 的 GitHub 代码 (https://github.com/kyzhouhzau/BERT-NER) 移植到了 Google Colab,并在那里手动设置了各个标志(flags)来运行它。

我把use_tpu设为False,所以应该是用GPU。

flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")

Colab 上使用的 TF 版本是 1.13.1,命令 tf.test.gpu_device_name() 返回 '/device:GPU:0'。

这是我在运行 tf.app.run() 时收到的错误消息。失败是因为它正在寻找 TPU 吗?我该如何解决?感谢您的帮助!

---------------------------------------------------------------------------
NotFoundError                             Traceback (most recent call last)
<ipython-input-53-d10a9cf14e41> in <module>()
----> 1 tf.app.run()

/usr/local/lib/python3.6/dist-packages/tensorflow/python/platform/app.py in run(main, argv)
    123   # Call the main function, passing through any arguments
    124   # to the final program.
--> 125   _sys.exit(main(argv))
    126 

<ipython-input-26-fed5e3d99ff6> in main(_)
     29             FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
     30 
---> 31     is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
     32 
     33     run_config = tf.contrib.tpu.RunConfig(

/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/lazy_loader.py in __getattr__(self, item)
     59 
     60   def __getattr__(self, item):
---> 61     module = self._load()
     62     return getattr(module, item)
     63 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/lazy_loader.py in _load(self)
     42     """Load the module and insert it into the parent's globals."""
     43     # Import the target module and insert it into the parent's namespace
---> 44     module = importlib.import_module(self.__name__)
     45     self._parent_module_globals[self._local_name] = module
     46 

/usr/lib/python3.6/importlib/__init__.py in import_module(name, package)
    124                 break
    125             level += 1
--> 126     return _bootstrap._gcd_import(name[level:], package, level)
    127 
    128 

/usr/lib/python3.6/importlib/_bootstrap.py in _gcd_import(name, package, level)

/usr/lib/python3.6/importlib/_bootstrap.py in _find_and_load(name, import_)

/usr/lib/python3.6/importlib/_bootstrap.py in _find_and_load_unlocked(name, import_)

/usr/lib/python3.6/importlib/_bootstrap.py in _load_unlocked(spec)

/usr/lib/python3.6/importlib/_bootstrap_external.py in exec_module(self, module)

/usr/lib/python3.6/importlib/_bootstrap.py in _call_with_frames_removed(f, *args, **kwds)

/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/__init__.py in <module>()
     38 from tensorflow.contrib import data
     39 from tensorflow.contrib import deprecated
---> 40 from tensorflow.contrib import distribute
     41 from tensorflow.contrib import distributions
     42 from tensorflow.contrib import estimator

/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/distribute/__init__.py in <module>()
     31 from tensorflow.contrib.distribute.python.parameter_server_strategy import ParameterServerStrategy
     32 from tensorflow.contrib.distribute.python.step_fn import *
---> 33 from tensorflow.contrib.distribute.python.tpu_strategy import TPUStrategy
     34 from tensorflow.python.distribute.cross_device_ops import *
     35 from tensorflow.python.distribute.distribute_config import DistributeConfig

/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/distribute/python/tpu_strategy.py in <module>()
     25 import functools
     26 
---> 27 from tensorflow.contrib.tpu.python.ops import tpu_ops
     28 from tensorflow.contrib.tpu.python.tpu import tpu
     29 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib

/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/tpu/__init__.py in <module>()
     67 # pylint: disable=wildcard-import,unused-import
     68 from tensorflow.contrib.tpu.python import profiler
---> 69 from tensorflow.contrib.tpu.python.ops.tpu_ops import *
     70 from tensorflow.contrib.tpu.python.tpu.async_checkpoint import *
     71 from tensorflow.contrib.tpu.python.tpu.bfloat16 import *

/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/tpu/python/ops/tpu_ops.py in <module>()
     37 
     38   _tpu_ops = loader.load_op_library(
---> 39       resource_loader.get_path_to_datafile("_tpu_ops.so"))
     40 
     41   def _create_default_group_assignment():

/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/util/loader.py in load_op_library(path)
     54       return None
     55   path = resource_loader.get_path_to_datafile(path)
---> 56   ret = load_library.load_op_library(path)
     57   assert ret, 'Could not load %s' % path
     58   return ret

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/load_library.py in load_op_library(library_filename)
     59     RuntimeError: when unable to load the library or get the python wrappers.
     60   """
---> 61   lib_handle = py_tf.TF_LoadLibrary(library_filename)
     62 
     63   op_list_str = py_tf.TF_GetOpList(lib_handle)

NotFoundError: /usr/local/lib/python3.6/dist-packages/tensorflow/contrib/tpu/python/ops/_tpu_ops.so: undefined symbol: _ZN6google8protobuf5Arena18CreateMaybeMessageIN10tensorflow9AttrValueEIEEEPT_PS1_DpOT0_

我找到原因了。当我使用 !pip install git+https://github.com/guillaumegenthial/tf_metrics.git 从 https://github.com/guillaumegenthial/tf_metrics.git 安装 tf_metrics 库时,它以某种方式重新安装了 tensorflow-gpu,我猜这破坏了原有的 TensorFlow 安装。

我改为单独下载 tf_metrics.py,现在代码可以在 Google Colab 上正常运行了。