Python Spacy KeyError: "[E018] Can't retrieve string for hash
Python Spacy KeyError: "[E018] Can't retrieve string for hash
我正在尝试在 Raspberry Pi 4 上编写我的代码 运行 并且在这个错误上停留了几个小时。此代码段会引发错误,但 运行 在 windows 上与同一个项目
完全一致
def create_lem_texts(data): # 作为列表
def sent_to_words(句子):
对于句子中的句子:
yield (gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True 去除标点符号
data_words = list(sent_to_words(data))
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
def remove_stopwords(texts):
return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def make_bigrams(texts):
return [bigram_mod[doc] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
"""https://spacy.io/api/annotation"""
texts_out = []
print(os.getcwd())
for sent in texts:
doc = nlp(" ".join(sent))
texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
return texts_out
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
print(os.getcwd())
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
return data_lemmatized
这段代码被这个函数依次调用:
def assign_topics_tweet(tweets):
owd = os.getcwd()
print(owd)
os.chdir('/home/pi/Documents/pycharm_project_twitter/topic_model/')
print(os.getcwd())
lda = LdaModel.load("LDA26")
print(lda)
id2word = Dictionary.load('Id2Word')
print(id2word)
os.chdir(owd)
data = create_lem_texts(tweets)
corpus = [id2word.doc2bow(text) for text in data]
topics = []
for tweet in corpus:
topics_dist = lda.get_document_topics(tweet)
topics.append(topics_dist)
return topics
这里是错误信息
Traceback (most recent call last):
File "/home/pi/Documents/pycharm_project_twitter/Twitter_Import.py", line 193, in <module>
main()
File "/home/pi/Documents/pycharm_project_twitter/Twitter_Import.py", line 169, in main
topics = assign_topics_tweet(data)
File "/home/pi/Documents/pycharm_project_twitter/TopicModel.py", line 238, in assign_topics_tweet
data = create_lem_texts(tweets)
File "/home/pi/Documents/pycharm_project_twitter/TopicModel.py", line 76, in create_lem_texts
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
File "/home/pi/Documents/pycharm_project_twitter/TopicModel.py", line 67, in lemmatization
texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
File "/home/pi/Documents/pycharm_project_twitter/TopicModel.py", line 67, in <listcomp>
texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
File "token.pyx", line 871, in spacy.tokens.token.Token.lemma_.__get__
File "strings.pyx", line 136, in spacy.strings.StringStore.__getitem__
KeyError: "[E018] Can't retrieve string for hash '18446744073541552667'. This usually refers to an issue with the `Vocab` or `StringStore`."
Process finished with exit code 1
我尝试重新安装 spacy 和 en 模型,运行直接在 pi 上安装它,spacy 版本在我的 windows 机器和 Pi 上都是相同的。而且网上基本没有关于这个错误的资料
经过三天的测试,只需安装旧版本的 Spacy 2.0.1 即可解决问题
我正在尝试在 Raspberry Pi 4 上编写我的代码 运行 并且在这个错误上停留了几个小时。此代码段会引发错误,但 运行 在 windows 上与同一个项目
完全一致def create_lem_texts(data): # 作为列表 def sent_to_words(句子): 对于句子中的句子: yield (gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True 去除标点符号
data_words = list(sent_to_words(data))
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
def remove_stopwords(texts):
return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def make_bigrams(texts):
return [bigram_mod[doc] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
"""https://spacy.io/api/annotation"""
texts_out = []
print(os.getcwd())
for sent in texts:
doc = nlp(" ".join(sent))
texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
return texts_out
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
print(os.getcwd())
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
return data_lemmatized
这段代码被这个函数依次调用:
def assign_topics_tweet(tweets):
owd = os.getcwd()
print(owd)
os.chdir('/home/pi/Documents/pycharm_project_twitter/topic_model/')
print(os.getcwd())
lda = LdaModel.load("LDA26")
print(lda)
id2word = Dictionary.load('Id2Word')
print(id2word)
os.chdir(owd)
data = create_lem_texts(tweets)
corpus = [id2word.doc2bow(text) for text in data]
topics = []
for tweet in corpus:
topics_dist = lda.get_document_topics(tweet)
topics.append(topics_dist)
return topics
这里是错误信息
Traceback (most recent call last):
File "/home/pi/Documents/pycharm_project_twitter/Twitter_Import.py", line 193, in <module>
main()
File "/home/pi/Documents/pycharm_project_twitter/Twitter_Import.py", line 169, in main
topics = assign_topics_tweet(data)
File "/home/pi/Documents/pycharm_project_twitter/TopicModel.py", line 238, in assign_topics_tweet
data = create_lem_texts(tweets)
File "/home/pi/Documents/pycharm_project_twitter/TopicModel.py", line 76, in create_lem_texts
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
File "/home/pi/Documents/pycharm_project_twitter/TopicModel.py", line 67, in lemmatization
texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
File "/home/pi/Documents/pycharm_project_twitter/TopicModel.py", line 67, in <listcomp>
texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
File "token.pyx", line 871, in spacy.tokens.token.Token.lemma_.__get__
File "strings.pyx", line 136, in spacy.strings.StringStore.__getitem__
KeyError: "[E018] Can't retrieve string for hash '18446744073541552667'. This usually refers to an issue with the `Vocab` or `StringStore`."
Process finished with exit code 1
我尝试重新安装 spacy 和 en 模型,运行直接在 pi 上安装它,spacy 版本在我的 windows 机器和 Pi 上都是相同的。而且网上基本没有关于这个错误的资料
经过三天的测试,只需安装旧版本的 Spacy 2.0.1 即可解决问题