使用 Word Mover Distance 和 Bert-Embedding 计算文档相似度
Document similarity with Word Mover Distance and Bert-Embedding
我正在尝试使用基于 Google's BERT 的词嵌入来计算两个任意文档的文档相似度(最近邻)。
为了从 Bert 获得词嵌入,我使用 bert-as-service。
文档相似度应基于 python wmd-relax 包的 Word-Mover-Distance。
我之前的尝试是根据 wmd-relax GitHub 存储库中的这个教程进行的:https://github.com/src-d/wmd-relax/blob/master/spacy_example.py
import numpy as np
import spacy
import requests
from wmd import WMD
from collections import Counter
from bert_serving.client import BertClient
# Wikipedia titles
titles = ["Germany", "Spain", "Google", "Apple"]

# Standard model from spaCy
nlp = spacy.load("en_vectors_web_lg")

# Fetch the Wikipedia articles and prepare them as spaCy documents.
# Each entry maps a title to a (title, word ids, word counts) tuple.
documents_spacy = {}
print('Create spacy document')
for title in titles:
    print("... fetching", title)
    # Hand the query to requests via `params` so the title is URL-encoded,
    # and set a timeout so a stalled connection cannot hang the script.
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": "1",
        },
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    pages = response.json()["query"]["pages"]
    # The result is keyed by page id; only one title was requested, so take
    # the first (only) page.
    text = nlp(next(iter(pages.values()))["extract"])
    # Keep alphabetic, non-stop-word tokens only.
    tokens = [t for t in text if t.is_alpha and not t.is_stop]
    words = Counter(t.text for t in tokens)
    orths = {t.text: t.orth for t in tokens}
    # Sort once so the id list and the count array share the same order.
    sorted_words = sorted(words)
    documents_spacy[title] = (
        title,
        [orths[t] for t in sorted_words],
        np.array([words[t] for t in sorted_words], dtype=np.float32),
    )
# Reference embedding provider backed by the spaCy model
class SpacyEmbeddings(object):
    """Expose spaCy vocabulary vectors through ``[]`` indexing."""

    def __getitem__(self, item):
        """Return the ``vector`` attribute of ``nlp.vocab[item]``."""
        lexeme = nlp.vocab[item]
        return lexeme.vector
# Bert embeddings using bert-as-service
class BertEmbeddings:
    """Serve BERT vectors for spaCy vocabulary entries via bert-as-service."""

    def __init__(self, ip='localhost', port=5555, port_out=5556):
        """Create the client used to query the bert-as-service server.

        Args:
            ip: Forwarded to ``BertClient`` as ``ip``.
            port: Forwarded to ``BertClient`` as ``port``.
            port_out: Forwarded to ``BertClient`` as ``port_out``.
        """
        self.server = BertClient(ip=ip, port=port, port_out=port_out)

    def __getitem__(self, item):
        """Return the BERT embedding of vocabulary entry ``item`` as one vector."""
        text = nlp.vocab[item].text
        emb = self.server.encode([text])
        # encode() receives a one-element batch, so its result carries a
        # leading axis of length 1. Returning it unchanged is what produced
        # the stacked (812, 1, 768) array and the "shapes not aligned"
        # error; drop that axis so each lookup yields a single vector.
        return emb[0]
# Get the nearest neighbors of one of the articles (the first title),
# using the BERT-backed embeddings.
calc_bert = WMD(BertEmbeddings(), documents_spacy)
calc_bert.nearest_neighbors(titles[0])
不幸的是,计算失败,原因是距离计算中的维度不匹配:
ValueError: shapes (812,1,768) and (768,1,812) not aligned: 768 (dim 2) != 1 (dim 1)
bert-as-service 输出的形状是 (batch_size, sequence_len, embedding_dimension)。在您的情况下,sequence_len 是 1,因为您正在对结果进行池化(pooling)。
现在,您可以使用 numpy.ndarray 的 transpose 方法转置另一个数组,使其形状与之匹配。
我正在尝试使用基于 Google's BERT 的词嵌入来计算两个任意文档的文档相似度(最近邻)。 为了从 Bert 获得词嵌入,我使用 bert-as-service。 文档相似度应基于 python wmd-relax 包的 Word-Mover-Distance。
我之前的尝试是根据 wmd-relax GitHub 存储库中的这个教程进行的:https://github.com/src-d/wmd-relax/blob/master/spacy_example.py
import numpy as np
import spacy
import requests
from wmd import WMD
from collections import Counter
from bert_serving.client import BertClient
# Wikipedia titles
titles = ["Germany", "Spain", "Google", "Apple"]

# Standard model from spaCy
nlp = spacy.load("en_vectors_web_lg")

# Fetch the Wikipedia articles and prepare them as spaCy documents.
# Each entry maps a title to a (title, word ids, word counts) tuple.
documents_spacy = {}
print('Create spacy document')
for title in titles:
    print("... fetching", title)
    # Hand the query to requests via `params` so the title is URL-encoded,
    # and set a timeout so a stalled connection cannot hang the script.
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": "1",
        },
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    pages = response.json()["query"]["pages"]
    # The result is keyed by page id; only one title was requested, so take
    # the first (only) page.
    text = nlp(next(iter(pages.values()))["extract"])
    # Keep alphabetic, non-stop-word tokens only.
    tokens = [t for t in text if t.is_alpha and not t.is_stop]
    words = Counter(t.text for t in tokens)
    orths = {t.text: t.orth for t in tokens}
    # Sort once so the id list and the count array share the same order.
    sorted_words = sorted(words)
    documents_spacy[title] = (
        title,
        [orths[t] for t in sorted_words],
        np.array([words[t] for t in sorted_words], dtype=np.float32),
    )
# Reference embedding provider backed by the spaCy model
class SpacyEmbeddings(object):
    """Expose spaCy vocabulary vectors through ``[]`` indexing."""

    def __getitem__(self, item):
        """Return the ``vector`` attribute of ``nlp.vocab[item]``."""
        lexeme = nlp.vocab[item]
        return lexeme.vector
# Bert embeddings using bert-as-service
class BertEmbeddings:
    """Serve BERT vectors for spaCy vocabulary entries via bert-as-service."""

    def __init__(self, ip='localhost', port=5555, port_out=5556):
        """Create the client used to query the bert-as-service server.

        Args:
            ip: Forwarded to ``BertClient`` as ``ip``.
            port: Forwarded to ``BertClient`` as ``port``.
            port_out: Forwarded to ``BertClient`` as ``port_out``.
        """
        self.server = BertClient(ip=ip, port=port, port_out=port_out)

    def __getitem__(self, item):
        """Return the BERT embedding of vocabulary entry ``item`` as one vector."""
        text = nlp.vocab[item].text
        emb = self.server.encode([text])
        # encode() receives a one-element batch, so its result carries a
        # leading axis of length 1. Returning it unchanged is what produced
        # the stacked (812, 1, 768) array and the "shapes not aligned"
        # error; drop that axis so each lookup yields a single vector.
        return emb[0]
# Get the nearest neighbors of one of the articles (the first title),
# using the BERT-backed embeddings.
calc_bert = WMD(BertEmbeddings(), documents_spacy)
calc_bert.nearest_neighbors(titles[0])
不幸的是,计算失败,原因是距离计算中的维度不匹配:
ValueError: shapes (812,1,768) and (768,1,812) not aligned: 768 (dim 2) != 1 (dim 1)
bert-as-service 输出的形状是 (batch_size, sequence_len, embedding_dimension)。在您的情况下,sequence_len 是 1,因为您正在对结果进行池化(pooling)。
现在,您可以使用 numpy.ndarray 的 transpose 方法转置另一个数组,使其形状与之匹配。