创建余弦相似度矩阵 numpy
create cosine similarity matrix numpy
假设我有一个如下所示的 numpy 矩阵:
array([array([ 0.0072427 , 0.00669255, 0.00785213, 0.00845336, 0.01042869]),
array([ 0.00710799, 0.00668831, 0.00772334, 0.00777796, 0.01049965]),
array([ 0.00741872, 0.00650899, 0.00772273, 0.00729002, 0.00919407]),
array([ 0.00717589, 0.00627021, 0.0069514 , 0.0079332 , 0.01069545]),
array([ 0.00617369, 0.00590539, 0.00738468, 0.00761699, 0.00886915])], dtype=object)
如何生成一个 5 x 5 矩阵,其中矩阵的每个索引都是原始矩阵中两个对应行的余弦相似度?
例如第 0 行第 2 列的值将是原始矩阵中第 1 行和第 3 行之间的余弦相似度。
这是我尝试过的方法:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
import numpy as np
# `features` is a column in the artist_meta data frame where each value is a
# numpy array of 5 floating point values, similar in form to the matrix shown
# above but larger in volume.
# NOTE(review): np.array() over a column of arrays presumably yields a 1-D
# object array (like the dtype=object dump above) rather than a 2-D float
# matrix, which is likely what triggers the ValueError quoted below - confirm.
items_mat = np.array(artist_meta['features'].values)
# Cosine similarity is obtained here as one minus the cosine distance.
dist_out = 1-pairwise_distances(items_mat, metric="cosine")
上面的代码给我以下错误:
ValueError: setting an array element with a sequence.(含义:试图用一个序列为数组的单个元素赋值。)
不确定为什么我会得到这个,因为每个数组的长度相同 (5),我已经验证过。
设 x 为你的数组:
import numpy as np
from scipy.spatial.distance import cosine

# Pairwise cosine distance between the ROWS of x.
# NOTE(review): assumes x is a 2-D numeric array with one vector per row; an
# object array of arrays would first need np.vstack(x) - confirm with caller.
m, n = x.shape
# One entry per pair of rows, so the result is (m, m) rather than (m, n).
distances = np.zeros((m, m))
for i in range(m):
    for j in range(m):
        # Both operands must be rows; x[:, j] would select a column instead.
        distances[i, j] = cosine(x[i, :], x[j, :])
# scipy's `cosine` is a distance; the similarity matrix is 1 - distances.
设 m 为该数组:
# The 5 x 5 sample matrix from the question, written as a plain 2-D float array.
m = np.array(
    [
        [0.0072427, 0.00669255, 0.00785213, 0.00845336, 0.01042869],
        [0.00710799, 0.00668831, 0.00772334, 0.00777796, 0.01049965],
        [0.00741872, 0.00650899, 0.00772273, 0.00729002, 0.00919407],
        [0.00717589, 0.00627021, 0.0069514, 0.0079332, 0.01069545],
        [0.00617369, 0.00590539, 0.00738468, 0.00761699, 0.00886915],
    ]
)
根据维基百科(余弦相似度,Cosine similarity)的定义:$\text{similarity} = \cos\theta = \dfrac{A \cdot B}{\lVert A \rVert \, \lVert B \rVert}$。
我们可以用下面的表达式计算分子:
# Numerator: every pairwise dot product. NOTE: m.T @ m pairs the COLUMNS of m;
# m @ m.T would pair its rows instead.
d = m.T @ m
各向量的范数 ‖A‖ 为:
# Euclidean norm of each column of m (sum over axis 0); keepdims=True keeps a
# leading axis of length 1 so the result broadcasts against d.
norm = (m * m).sum(0, keepdims=True) ** .5
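那么相似度是(注意:`m.T @ m` 与 `sum(0)` 都是按列计算的,得到的是 m 各列之间的相似度;若要比较各行,应改用 `m @ m.T`,并用 `sum(1, keepdims=True)` 计算范数):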
# Entry (i, j) of d is divided by norm of vector j and then by norm of vector i,
# which gives the cosine similarity matrix.
d / norm / norm.T
[[ 1. 0.9994 0.9979 0.9973 0.9977]
[ 0.9994 1. 0.9993 0.9985 0.9981]
[ 0.9979 0.9993 1. 0.998 0.9958]
[ 0.9973 0.9985 0.998 1. 0.9985]
[ 0.9977 0.9981 0.9958 0.9985 1. ]]
距离是
# Cosine distance is one minus the cosine similarity.
1 - d / norm / norm.T
[[ 0. 0.0006 0.0021 0.0027 0.0023]
[ 0.0006 0. 0.0007 0.0015 0.0019]
[ 0.0021 0.0007 0. 0.002 0.0042]
[ 0.0027 0.0015 0.002 0. 0.0015]
[ 0.0023 0.0019 0.0042 0.0015 0. ]]
如前所述,您可以使用 sklearn 中的 pairwise_distances 函数。下面是一个完整的实现,并验证了它与 sklearn 和 scipy 版本的结果一致。本例中的结果四舍五入到小数点后四位。
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances
def cosine_distance_matrix(column: pd.Series, decimals: int = 4) -> np.ndarray:
    """Return the pairwise cosine distance of a column against itself.

    Args:
        column: pandas Series whose values are equal-length 1-D numeric
            arrays, one vector per entry.
        decimals: number of decimal places to round the output to.

    Returns:
        Array of shape ``(len(column), len(column))`` whose entry ``(i, j)``
        is one minus the cosine similarity of vectors ``i`` and ``j``,
        clipped to ``[0, 2]`` and rounded to ``decimals`` places.

    Note:
        An all-zero vector has no direction: the 0 / 0 division below makes
        its whole row and column NaN.
    """
    # Stack the per-entry vectors into a single 2-D matrix, one vector per row.
    M = np.vstack(column.values)
    # Scale every row to unit length, so that the dot product of two scaled
    # rows equals A.B / (||A|| * ||B||).
    M_norm = M / np.sqrt(np.square(M).sum(1, keepdims=True))
    # All pairwise dot products of the unit-length rows.
    similarity = M_norm @ M_norm.T
    # Round-off can push a similarity marginally above 1; clip so a distance
    # is never reported as a tiny negative number.
    return np.clip(1 - similarity, 0.0, 2.0).round(decimals)
# Example for testing: three short vectors.
sample_column = pd.Series([
    np.array([3, 4]),
    np.array([7, 24]),
    np.array([1, 1]),
])
# Try our own fast implementation
custom_version = cosine_distance_matrix(sample_column, decimals=4)
# Use pairwise function from sklearn
pairwise_version = pairwise_distances(
    np.vstack(sample_column.values),
    metric="cosine",
).round(4)
# Both results are rounded to 4 places, so compare with a tolerance of one
# rounding step instead of exact float equality.
assert np.allclose(custom_version, pairwise_version, atol=1e-4)
# Check a single element against scipy; .iloc makes the access positional.
assert np.isclose(
    custom_version[0, 1],
    round(cosine(sample_column.iloc[0], sample_column.iloc[1]), 4),
    atol=1e-4,
)
假设我有一个如下所示的 numpy 矩阵:
array([array([ 0.0072427 , 0.00669255, 0.00785213, 0.00845336, 0.01042869]),
array([ 0.00710799, 0.00668831, 0.00772334, 0.00777796, 0.01049965]),
array([ 0.00741872, 0.00650899, 0.00772273, 0.00729002, 0.00919407]),
array([ 0.00717589, 0.00627021, 0.0069514 , 0.0079332 , 0.01069545]),
array([ 0.00617369, 0.00590539, 0.00738468, 0.00761699, 0.00886915])], dtype=object)
如何生成一个 5 x 5 矩阵,其中矩阵的每个索引都是原始矩阵中两个对应行的余弦相似度?
例如第 0 行第 2 列的值将是原始矩阵中第 1 行和第 3 行之间的余弦相似度。
这是我尝试过的方法:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
import numpy as np
# `features` is a column in the artist_meta data frame where each value is a
# numpy array of 5 floating point values, similar in form to the matrix shown
# above but larger in volume.
# NOTE(review): np.array() over a column of arrays presumably yields a 1-D
# object array (like the dtype=object dump above) rather than a 2-D float
# matrix, which is likely what triggers the ValueError quoted below - confirm.
items_mat = np.array(artist_meta['features'].values)
# Cosine similarity is obtained here as one minus the cosine distance.
dist_out = 1-pairwise_distances(items_mat, metric="cosine")
上面的代码给我以下错误:
ValueError: setting an array element with a sequence.(含义:试图用一个序列为数组的单个元素赋值。)
不确定为什么我会得到这个,因为每个数组的长度相同 (5),我已经验证过。
设 x 为你的数组:
import numpy as np
from scipy.spatial.distance import cosine

# Pairwise cosine distance between the ROWS of x.
# NOTE(review): assumes x is a 2-D numeric array with one vector per row; an
# object array of arrays would first need np.vstack(x) - confirm with caller.
m, n = x.shape
# One entry per pair of rows, so the result is (m, m) rather than (m, n).
distances = np.zeros((m, m))
for i in range(m):
    for j in range(m):
        # Both operands must be rows; x[:, j] would select a column instead.
        distances[i, j] = cosine(x[i, :], x[j, :])
# scipy's `cosine` is a distance; the similarity matrix is 1 - distances.
设 m 为该数组:
# The 5 x 5 sample matrix from the question, written as a plain 2-D float array.
m = np.array(
    [
        [0.0072427, 0.00669255, 0.00785213, 0.00845336, 0.01042869],
        [0.00710799, 0.00668831, 0.00772334, 0.00777796, 0.01049965],
        [0.00741872, 0.00650899, 0.00772273, 0.00729002, 0.00919407],
        [0.00717589, 0.00627021, 0.0069514, 0.0079332, 0.01069545],
        [0.00617369, 0.00590539, 0.00738468, 0.00761699, 0.00886915],
    ]
)
根据维基百科(余弦相似度,Cosine similarity)的定义:$\text{similarity} = \cos\theta = \dfrac{A \cdot B}{\lVert A \rVert \, \lVert B \rVert}$。
我们可以用下面的表达式计算分子:
d = m.T @ m
各向量的范数 ‖A‖ 为:
# Euclidean norm of each column of m (sum over axis 0); keepdims=True keeps a
# leading axis of length 1 so the result broadcasts against d.
norm = (m * m).sum(0, keepdims=True) ** .5
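那么相似度是(注意:`m.T @ m` 与 `sum(0)` 都是按列计算的,得到的是 m 各列之间的相似度;若要比较各行,应改用 `m @ m.T`,并用 `sum(1, keepdims=True)` 计算范数):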
# Entry (i, j) of d is divided by norm of vector j and then by norm of vector i,
# which gives the cosine similarity matrix.
d / norm / norm.T
[[ 1. 0.9994 0.9979 0.9973 0.9977]
[ 0.9994 1. 0.9993 0.9985 0.9981]
[ 0.9979 0.9993 1. 0.998 0.9958]
[ 0.9973 0.9985 0.998 1. 0.9985]
[ 0.9977 0.9981 0.9958 0.9985 1. ]]
距离是
# Cosine distance is one minus the cosine similarity.
1 - d / norm / norm.T
[[ 0. 0.0006 0.0021 0.0027 0.0023]
[ 0.0006 0. 0.0007 0.0015 0.0019]
[ 0.0021 0.0007 0. 0.002 0.0042]
[ 0.0027 0.0015 0.002 0. 0.0015]
[ 0.0023 0.0019 0.0042 0.0015 0. ]]
如前所述,您可以使用 sklearn 中的 pairwise_distances 函数。下面是一个完整的实现,并验证了它与 sklearn 和 scipy 版本的结果一致。本例中的结果四舍五入到小数点后四位。
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise_distances
def cosine_distance_matrix(column: pd.Series, decimals: int = 4) -> np.ndarray:
    """Return the pairwise cosine distance of a column against itself.

    Args:
        column: pandas Series whose values are equal-length 1-D numeric
            arrays, one vector per entry.
        decimals: number of decimal places to round the output to.

    Returns:
        Array of shape ``(len(column), len(column))`` whose entry ``(i, j)``
        is one minus the cosine similarity of vectors ``i`` and ``j``,
        clipped to ``[0, 2]`` and rounded to ``decimals`` places.

    Note:
        An all-zero vector has no direction: the 0 / 0 division below makes
        its whole row and column NaN.
    """
    # Stack the per-entry vectors into a single 2-D matrix, one vector per row.
    M = np.vstack(column.values)
    # Scale every row to unit length, so that the dot product of two scaled
    # rows equals A.B / (||A|| * ||B||).
    M_norm = M / np.sqrt(np.square(M).sum(1, keepdims=True))
    # All pairwise dot products of the unit-length rows.
    similarity = M_norm @ M_norm.T
    # Round-off can push a similarity marginally above 1; clip so a distance
    # is never reported as a tiny negative number.
    return np.clip(1 - similarity, 0.0, 2.0).round(decimals)
# Example for testing: three short vectors.
sample_column = pd.Series([
    np.array([3, 4]),
    np.array([7, 24]),
    np.array([1, 1]),
])
# Try our own fast implementation
custom_version = cosine_distance_matrix(sample_column, decimals=4)
# Use pairwise function from sklearn
pairwise_version = pairwise_distances(
    np.vstack(sample_column.values),
    metric="cosine",
).round(4)
# Both results are rounded to 4 places, so compare with a tolerance of one
# rounding step instead of exact float equality.
assert np.allclose(custom_version, pairwise_version, atol=1e-4)
# Check a single element against scipy; .iloc makes the access positional.
assert np.isclose(
    custom_version[0, 1],
    round(cosine(sample_column.iloc[0], sample_column.iloc[1]), 4),
    atol=1e-4,
)