Getting similar words no longer working in spacy
I have a Google Colab notebook from a while back that uses spacy 2.2.4, where I successfully get the most similar words for a list of words:
import spacy
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()
import numpy as np
import pandas as pd
print(spacy.__version__)

all_search_terms = ["technology", "internet", "smartphone"]

# define a function to get the x most similar words to a word
def most_similar(word, topn=2):
    print(word)
    word = nlp.vocab[str(word)]
    print(word.prob)
    queries = [
        w for w in word.vocab
        if w.is_lower == word.is_lower and w.prob >= -15 and np.count_nonzero(w.vector)
    ]
    by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
    return [(w.lower_, w.similarity(word)) for w in by_similarity[:topn + 1] if w.lower_ != word.lower_]

# create a function to receive a list of words and return the
# top 2 similar words for each word in the list
def get_similar_words(list_of_words):
    all_similar_words = []
    for word in list_of_words:
        spacy_word = nlp.vocab[str(word)]
        if spacy_word.has_vector:
            # find similar words to the word, and store them in a dataframe along with their scores
            similar_words = pd.DataFrame(most_similar(word, topn=2), columns=["word", "similarity_score"])
            # save the list of similar words
            similar_words_list = list(similar_words["word"])
            # append the list of similar words to the list to be returned
            all_similar_words.append(similar_words_list)
    # flatten the list of lists to one list
    all_similar_words = [item for sublist in all_similar_words for item in sublist]
    # remove duplicates from the list
    all_similar_words = list(dict.fromkeys(all_similar_words))
    # sort the list in alphabetical order
    all_similar_words.sort()
    return all_similar_words

# run the function on the search terms entered by the user
new_search_terms = get_similar_words(all_search_terms)
new_search_terms
The output is:
technology
-10.063644409179688
internet
-8.897857666015625
smartphone
-12.11159896850586
['handset', 'online', 'smartphones', 'technological', 'technologies', 'web']
Problem: I just tried running the same code in a different environment in RStudio (i.e. not using Google Colab), where the spacy version is 3.0.6, and the list of similar words (new_search_terms) comes back empty. I also noticed that the probabilities of the words are all the same (-20).
The output with spacy 3.0.6:
technology
-20.0
internet
-20.0
smartphone
-20.0
[]
What do I need to do differently in this new version of spacy to get the same output as before?
Token probabilities are not loaded by default in v3, so you have to do some extra work to load them.
import spacy
from spacy.lookups import load_lookups
nlp = spacy.load("en_core_web_sm")
lookups = load_lookups("en", ["lexeme_prob"])
nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))
After this your code should work, though I'm not sure why you're using .prob here.
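For example, here is a minimal sketch of the same fix applied to the en_core_web_lg pipeline from the question (it assumes both en_core_web_lg and the spacy-lookups-data package, which provides the lexeme_prob table, are installed):

import spacy
from spacy.lookups import load_lookups

nlp = spacy.load("en_core_web_lg")

# load the lexeme_prob table from the spacy-lookups-data package
# (pip install spacy-lookups-data) and attach it to the vocab
lookups = load_lookups("en", ["lexeme_prob"])
nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))

# .prob now returns real log-probability estimates instead of the
# -20.0 default, so the w.prob >= -15 filter in most_similar works again
print(nlp.vocab["internet"].prob)  # printed about -8.9 in the v2 run above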