How to resolve Transformer model DistilBERT error: got an unexpected keyword argument 'special_tokens_mask'
I am using:
Apple Mac M1
OS: macOS Monterey
Python 3.10.4
I am trying to follow this tutorial to implement vector search with DistilBERT and Weaviate. Below is the code setup:
import nltk
import os
import random
import time
import torch
import weaviate
from transformers import AutoModel, AutoTokenizer
from nltk.tokenize import sent_tokenize
torch.set_grad_enabled(False)
# updated to use a different model if desired
MODEL_NAME = "distilbert-base-uncased"
model = AutoModel.from_pretrained(MODEL_NAME)
model.to('cuda') # remove if working without GPUs
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# initialize nltk (for tokenizing sentences)
nltk.download('punkt')
# initialize weaviate client for importing and searching
client = weaviate.Client("http://localhost:8080")
def get_post_filenames(limit_objects=100):
    file_names = []
    i = 0
    for root, dirs, files in os.walk("./data/20news-bydate-test"):
        for filename in files:
            path = os.path.join(root, filename)
            file_names += [path]
    random.shuffle(file_names)
    limit_objects = min(len(file_names), limit_objects)
    file_names = file_names[:limit_objects]
    return file_names

def read_posts(filenames=[]):
    posts = []
    for filename in filenames:
        f = open(filename, encoding="utf-8", errors='ignore')
        post = f.read()
        # strip the headers (the first occurrence of two newlines)
        post = post[post.find('\n\n'):]
        # remove posts with less than 10 words to remove some of the noise
        if len(post.split(' ')) < 10:
            continue
        post = post.replace('\n', ' ').replace('\t', ' ').strip()
        if len(post) > 1000:
            post = post[:1000]
        posts += [post]
    return posts

def text2vec(text):
    tokens_pt = tokenizer(text, padding=True, truncation=True, max_length=500, add_special_tokens=True, return_tensors="pt")
    tokens_pt.to('cuda') # remove if working without GPUs
    outputs = model(**tokens_pt)
    return outputs[0].mean(0).mean(0).detach()

def vectorize_posts(posts=[]):
    post_vectors = []
    before = time.time()
    for i, post in enumerate(posts):
        vec = text2vec(sent_tokenize(post))
        post_vectors += [vec]
        if i % 100 == 0 and i != 0:
            print("So far {} objects vectorized in {}s".format(i, time.time()-before))
    after = time.time()
    print("Vectorized {} items in {}s".format(len(posts), after-before))
    return post_vectors

def init_weaviate_schema():
    # a simple schema containing just a single class for our posts
    schema = {
        "classes": [{
            "class": "Post",
            "vectorizer": "none", # explicitly tell Weaviate not to vectorize anything, we are providing the vectors ourselves through our BERT model
            "properties": [{
                "name": "content",
                "dataType": ["text"],
            }]
        }]
    }
    # cleanup from previous runs
    client.schema.delete_all()
    client.schema.create(schema)

def import_posts_with_vectors(posts, vectors, batchsize=256):
    batch = weaviate.ObjectsBatchRequest()
    for i, post in enumerate(posts):
        props = {
            "content": post,
        }
        batch.add(props, "Post", vector=vectors[i])
        # when either batch size is reached or we are at the last object
        if (i != 0 and i % batchsize == 0) or i == len(posts) - 1:
            # send off the batch
            client.batch.create(batch)
            # and reset for the next batch
            batch = weaviate.ObjectsBatchRequest()

def search(query="", limit=3):
    before = time.time()
    vec = text2vec(query)
    vec_took = time.time() - before
    before = time.time()
    near_vec = {"vector": vec.tolist()}
    res = client \
        .query.get("Post", ["content", "_additional {certainty}"]) \
        .with_near_vector(near_vec) \
        .with_limit(limit) \
        .do()
    search_took = time.time() - before
    print("\nQuery \"{}\" with {} results took {:.3f}s ({:.3f}s to vectorize and {:.3f}s to search)" \
        .format(query, limit, vec_took+search_took, vec_took, search_took))
    for post in res["data"]["Get"]["Post"]:
        print("{:.4f}: {}".format(post["_additional"]["certainty"], post["content"]))
        print('---')
# run everything
init_weaviate_schema()
posts = read_posts(get_post_filenames(4000))
vectors = vectorize_posts(posts)
import_posts_with_vectors(posts, vectors)
search("the best camera lens", 1)
search("which software do i need to view jpeg files", 1)
search("windows vs mac", 1)
The following function triggers the error:
def text2vec(text):
    # tokens_pt = tokenizer(text, padding=True, truncation=True, max_length=500, add_special_tokens=True, return_tensors="pt")
    tokens_pt = tokenizer.encode_plus(text, add_special_tokens=True, truncation=True, padding="max_length", return_attention_mask=True, return_tensors="pt")
    tokens_pt.to('cuda') # remove if working without GPUs
    outputs = model(**tokens_pt)
    return outputs[0].mean(0).mean(0).detach()
Error 1:
tokens_pt.to('cuda') # remove if working without GPUs
AttributeError: 'dict' object has no attribute 'to'
When I comment out the GPU line
#tokens_pt.to('cuda')
and run the code, I get this error:
Error 2:
outputs = model(**tokens_pt)
File "/opt/homebrew/Caskroom/miniforge/base/envs/py310a/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
TypeError: DistilBertModel.forward() got an unexpected keyword argument 'special_tokens_mask'
What is causing this error, and how can I fix it?
I cannot reproduce your error in my environment (Ubuntu), but as far as I can tell, I would suggest trying to add the return_special_tokens_mask=False argument:
tokens_pt = tokenizer.encode_plus(
    text,
    add_special_tokens=True,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt",
    return_special_tokens_mask=False
)
If that fails, try removing it explicitly:
tokens_pt.pop("special_tokens_mask")
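If that still does not help, here is a minimal sketch of how text2vec could look with the extra key filtered out before unpacking the tokenizer output into the model. This is only a sketch under the assumption that special_tokens_mask (or some other unexpected key) is what DistilBertModel.forward() is rejecting; the key whitelist and the per-tensor device transfer are my additions, not part of the tutorial, and the CUDA line stays commented out since an Apple M1 has no CUDA device:

def text2vec(text):
    tokens_pt = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="pt",
    )
    # keep only the keys the model's forward() accepts;
    # this drops special_tokens_mask or any other extra entry
    tokens_pt = {k: v for k, v in tokens_pt.items()
                 if k in ("input_ids", "attention_mask")}
    # a plain dict has no .to() method (hence error 1), so if you do have
    # a CUDA GPU, move each tensor individually instead:
    # tokens_pt = {k: v.to("cuda") for k, v in tokens_pt.items()}
    outputs = model(**tokens_pt)
    return outputs[0].mean(0).mean(0).detach()

Filtering the dict this way also sidesteps error 1, because the per-tensor comprehension replaces the .to() call that a plain dict does not provide.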