HuggingFace AutoTokenizer | ValueError: Couldn't instantiate the backend tokenizer
HuggingFace AutoTokenizer | ValueError: Couldn't instantiate the backend tokenizer
目标:修改此 Notebook 以使用 albert-base-v2 模型
第 1.3 节出现错误。
内核:conda_pytorch_p36。
我执行了 Restart & Run All,并刷新了工作目录中的文件视图。
错误信息列出了 3 种可能导致此错误的原因,我不确定我的情况属于哪一种。
第 1.3 节:
# define the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
configs.output_dir, do_lower_case=configs.do_lower_case)
回溯:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-25-1f864e3046eb> in <module>
140 # define the tokenizer
141 tokenizer = AutoTokenizer.from_pretrained(
--> 142 configs.output_dir, do_lower_case=configs.do_lower_case)
143
144 # Evaluate the original FP32 BERT model
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
548 tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
549 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 550 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
551 else:
552 if tokenizer_class_py is not None:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1752 use_auth_token=use_auth_token,
1753 cache_dir=cache_dir,
-> 1754 **kwargs,
1755 )
1756
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)
1880 # Instantiate tokenizer.
1881 try:
-> 1882 tokenizer = cls(*init_inputs, **init_kwargs)
1883 except OSError:
1884 raise OSError(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/albert/tokenization_albert_fast.py in __init__(self, vocab_file, tokenizer_file, do_lower_case, remove_space, keep_accents, bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token, **kwargs)
159 cls_token=cls_token,
160 mask_token=mask_token,
--> 161 **kwargs,
162 )
163
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_fast.py in __init__(self, *args, **kwargs)
116 else:
117 raise ValueError(
--> 118 "Couldn't instantiate the backend tokenizer from one of: \n"
119 "(1) a `tokenizers` library serialization file, \n"
120 "(2) a slow tokenizer instance to convert or \n"
ValueError: Couldn't instantiate the backend tokenizer from one of:
(1) a `tokenizers` library serialization file,
(2) a slow tokenizer instance to convert or
(3) an equivalent slow tokenizer class to instantiate and convert.
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.
如果还有什么需要我补充到帖子中的内容,请告诉我。
首先,我必须执行 pip install sentencepiece。
但是,在同一行代码上,我又收到了来自 sentencepiece 的错误。
用 str() 包装这两个参数后,仍然产生相同的回溯。
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-12-1f864e3046eb> in <module>
140 # define the tokenizer
141 tokenizer = AutoTokenizer.from_pretrained(
--> 142 configs.output_dir, do_lower_case=configs.do_lower_case)
143
144 # Evaluate the original FP32 BERT model
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
548 tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
549 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 550 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
551 else:
552 if tokenizer_class_py is not None:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1752 use_auth_token=use_auth_token,
1753 cache_dir=cache_dir,
-> 1754 **kwargs,
1755 )
1756
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)
1776 copy.deepcopy(init_configuration),
1777 *init_inputs,
-> 1778 **(copy.deepcopy(kwargs)),
1779 )
1780 else:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)
1880 # Instantiate tokenizer.
1881 try:
-> 1882 tokenizer = cls(*init_inputs, **init_kwargs)
1883 except OSError:
1884 raise OSError(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/albert/tokenization_albert.py in __init__(self, vocab_file, do_lower_case, remove_space, keep_accents, bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token, sp_model_kwargs, **kwargs)
179
180 self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
--> 181 self.sp_model.Load(vocab_file)
182
183 @property
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sentencepiece/__init__.py in Load(self, model_file, model_proto)
365 if model_proto:
366 return self.LoadFromSerializedProto(model_proto)
--> 367 return self.LoadFromFile(model_file)
368
369
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sentencepiece/__init__.py in LoadFromFile(self, arg)
169
170 def LoadFromFile(self, arg):
--> 171 return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
172
173 def DecodeIdsWithCheck(self, ids):
TypeError: not a string
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-12-1f864e3046eb> in <module>
140 # define the tokenizer
141 tokenizer = AutoTokenizer.from_pretrained(
--> 142 configs.output_dir, do_lower_case=configs.do_lower_case)
143
144 # Evaluate the original FP32 BERT model
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
548 tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
549 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 550 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
551 else:
552 if tokenizer_class_py is not None:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1752 use_auth_token=use_auth_token,
1753 cache_dir=cache_dir,
-> 1754 **kwargs,
1755 )
1756
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)
1776 copy.deepcopy(init_configuration),
1777 *init_inputs,
-> 1778 **(copy.deepcopy(kwargs)),
1779 )
1780 else:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)
1880 # Instantiate tokenizer.
1881 try:
-> 1882 tokenizer = cls(*init_inputs, **init_kwargs)
1883 except OSError:
1884 raise OSError(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/albert/tokenization_albert.py in __init__(self, vocab_file, do_lower_case, remove_space, keep_accents, bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token, sp_model_kwargs, **kwargs)
179
180 self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
--> 181 self.sp_model.Load(vocab_file)
182
183 @property
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sentencepiece/__init__.py in Load(self, model_file, model_proto)
365 if model_proto:
366 return self.LoadFromSerializedProto(model_proto)
--> 367 return self.LoadFromFile(model_file)
368
369
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sentencepiece/__init__.py in LoadFromFile(self, arg)
169
170 def LoadFromFile(self, arg):
--> 171 return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
172
173 def DecodeIdsWithCheck(self, ids):
TypeError: not a string
然后我不得不把这些参数替换为模型名称:
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
第二部分详细介绍了这个。
目标:修改此 Notebook 以使用 albert-base-v2 模型
第 1.3 节出现错误。
内核:conda_pytorch_p36。
我执行了 Restart & Run All,并刷新了工作目录中的文件视图。
错误信息列出了 3 种可能导致此错误的原因,我不确定我的情况属于哪一种。
第 1.3 节:
# define the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
configs.output_dir, do_lower_case=configs.do_lower_case)
回溯:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-25-1f864e3046eb> in <module>
140 # define the tokenizer
141 tokenizer = AutoTokenizer.from_pretrained(
--> 142 configs.output_dir, do_lower_case=configs.do_lower_case)
143
144 # Evaluate the original FP32 BERT model
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
548 tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
549 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 550 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
551 else:
552 if tokenizer_class_py is not None:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1752 use_auth_token=use_auth_token,
1753 cache_dir=cache_dir,
-> 1754 **kwargs,
1755 )
1756
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)
1880 # Instantiate tokenizer.
1881 try:
-> 1882 tokenizer = cls(*init_inputs, **init_kwargs)
1883 except OSError:
1884 raise OSError(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/albert/tokenization_albert_fast.py in __init__(self, vocab_file, tokenizer_file, do_lower_case, remove_space, keep_accents, bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token, **kwargs)
159 cls_token=cls_token,
160 mask_token=mask_token,
--> 161 **kwargs,
162 )
163
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_fast.py in __init__(self, *args, **kwargs)
116 else:
117 raise ValueError(
--> 118 "Couldn't instantiate the backend tokenizer from one of: \n"
119 "(1) a `tokenizers` library serialization file, \n"
120 "(2) a slow tokenizer instance to convert or \n"
ValueError: Couldn't instantiate the backend tokenizer from one of:
(1) a `tokenizers` library serialization file,
(2) a slow tokenizer instance to convert or
(3) an equivalent slow tokenizer class to instantiate and convert.
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.
如果还有什么需要我补充到帖子中的内容,请告诉我。
首先,我必须执行 pip install sentencepiece。
但是,在同一行代码上,我又收到了来自 sentencepiece 的错误。
用 str() 包装这两个参数后,仍然产生相同的回溯。
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-12-1f864e3046eb> in <module>
140 # define the tokenizer
141 tokenizer = AutoTokenizer.from_pretrained(
--> 142 configs.output_dir, do_lower_case=configs.do_lower_case)
143
144 # Evaluate the original FP32 BERT model
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
548 tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
549 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 550 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
551 else:
552 if tokenizer_class_py is not None:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1752 use_auth_token=use_auth_token,
1753 cache_dir=cache_dir,
-> 1754 **kwargs,
1755 )
1756
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)
1776 copy.deepcopy(init_configuration),
1777 *init_inputs,
-> 1778 **(copy.deepcopy(kwargs)),
1779 )
1780 else:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)
1880 # Instantiate tokenizer.
1881 try:
-> 1882 tokenizer = cls(*init_inputs, **init_kwargs)
1883 except OSError:
1884 raise OSError(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/albert/tokenization_albert.py in __init__(self, vocab_file, do_lower_case, remove_space, keep_accents, bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token, sp_model_kwargs, **kwargs)
179
180 self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
--> 181 self.sp_model.Load(vocab_file)
182
183 @property
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sentencepiece/__init__.py in Load(self, model_file, model_proto)
365 if model_proto:
366 return self.LoadFromSerializedProto(model_proto)
--> 367 return self.LoadFromFile(model_file)
368
369
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sentencepiece/__init__.py in LoadFromFile(self, arg)
169
170 def LoadFromFile(self, arg):
--> 171 return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
172
173 def DecodeIdsWithCheck(self, ids):
TypeError: not a string
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-12-1f864e3046eb> in <module>
140 # define the tokenizer
141 tokenizer = AutoTokenizer.from_pretrained(
--> 142 configs.output_dir, do_lower_case=configs.do_lower_case)
143
144 # Evaluate the original FP32 BERT model
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
548 tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
549 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 550 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
551 else:
552 if tokenizer_class_py is not None:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1752 use_auth_token=use_auth_token,
1753 cache_dir=cache_dir,
-> 1754 **kwargs,
1755 )
1756
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)
1776 copy.deepcopy(init_configuration),
1777 *init_inputs,
-> 1778 **(copy.deepcopy(kwargs)),
1779 )
1780 else:
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/tokenization_utils_base.py in _from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)
1880 # Instantiate tokenizer.
1881 try:
-> 1882 tokenizer = cls(*init_inputs, **init_kwargs)
1883 except OSError:
1884 raise OSError(
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/transformers/models/albert/tokenization_albert.py in __init__(self, vocab_file, do_lower_case, remove_space, keep_accents, bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token, sp_model_kwargs, **kwargs)
179
180 self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
--> 181 self.sp_model.Load(vocab_file)
182
183 @property
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sentencepiece/__init__.py in Load(self, model_file, model_proto)
365 if model_proto:
366 return self.LoadFromSerializedProto(model_proto)
--> 367 return self.LoadFromFile(model_file)
368
369
~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/sentencepiece/__init__.py in LoadFromFile(self, arg)
169
170 def LoadFromFile(self, arg):
--> 171 return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
172
173 def DecodeIdsWithCheck(self, ids):
TypeError: not a string
然后我不得不把这些参数替换为模型名称:
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
第二部分详细介绍了这个