如何取两个字典值来求 Python 中的余弦相似度？

Question

我需要根据存储在 user_dict 字典中的“用户评分”，计算两个评分向量之间的余弦相似度。

评分是从 CSV 文件导入的，然后被转换为一个字典：以用户为键，以该用户的评分列表为值。我的问题是：如何遍历该字典，取出两个用户的评分，并用余弦距离函数计算它们的相似度？

循环不应将用户与其自身进行比较，也不应以不同的顺序重复比较同一对用户（例如，用户 5 与用户 3，以及用户 3 与用户 5）。

Answer 1

您可以使用 itertools.combinations（dct 是您的输入字典）：

from itertools import combinations

# combinations() yields every unordered pair of keys exactly once, so a user is
# never paired with itself and (a, b) is never repeated later as (b, a).
# Iterating a dict already yields its keys, so no .keys() call is needed.
for user_a, user_b in combinations(dct, 2):
    # compute cosine similarity between dct[user_a] and dct[user_b]
    ...

Answer 2

from itertools import combinations

from scipy import spatial

# Ratings keyed by user. Every list has the same length (11 entries), which is
# required for two vectors to be compared with each other.
d = {
    'U1': [3, 4, 2, 5, 0, 4, 1, 3, 0, 0, 4],
    'U2': [2, 3, 1, 0, 3, 0, 2, 0, 0, 3, 0],
    'U3': [0, 4, 0, 5, 0, 4, 0, 3, 0, 2, 4],
    'U4': [0, 0, 2, 1, 4, 3, 2, 0, 0, 2, 0],
    'U5': [0, 0, 0, 5, 0, 4, 0, 3, 0, 0, 4],
    'U6': [2, 3, 4, 0, 3, 0, 3, 0, 3, 4, 0],
    'U7': [0, 4, 3, 5, 0, 5, 0, 0, 0, 0, 4],
    'U8': [4, 3, 0, 3, 4, 2, 2, 0, 2, 3, 2],
    'U9': [0, 2, 0, 3, 1, 0, 1, 0, 0, 2, 0],
    'U10': [0, 3, 0, 4, 3, 3, 0, 3, 0, 4, 4],
    'U11': [2, 2, 1, 2, 1, 0, 2, 0, 1, 0, 2],
    'U12': [0, 4, 4, 5, 0, 0, 0, 3, 0, 4, 5],
    'U13': [3, 3, 0, 2, 2, 3, 2, 0, 2, 0, 3],
    'U14': [0, 3, 4, 5, 0, 5, 0, 0, 0, 4, 0],
    'U15': [2, 0, 0, 3, 0, 2, 2, 3, 0, 0, 3],
    'U16': [4, 4, 0, 4, 3, 4, 0, 3, 0, 3, 0],
    'U17': [0, 2, 0, 3, 1, 0, 2, 0, 1, 0, 3],
    'U18': [2, 3, 1, 0, 3, 2, 3, 2, 0, 2, 0],
    'U19': [0, 5, 0, 4, 0, 3, 0, 4, 0, 0, 5],
    'U20': [0, 0, 3, 0, 3, 0, 4, 0, 2, 0, 0],
    'U21': [3, 0, 2, 4, 2, 3, 0, 4, 2, 3, 3],
    'U22': [4, 4, 0, 5, 3, 5, 0, 4, 0, 3, 0],
    'U23': [3, 0, 0, 0, 3, 0, 2, 0, 0, 4, 0],
    'U24': [4, 0, 3, 0, 3, 0, 3, 0, 0, 2, 2],
    'U25': [0, 5, 0, 3, 3, 4, 0, 3, 3, 4, 4],
}


def iter_pairwise_similarity(ratings):
    """Yield ``(key_a, key_b, similarity)`` for each unordered pair of keys.

    Pairs follow the mapping's iteration order; a key is never paired with
    itself, and a pair is never repeated in reverse order.

    Args:
        ratings: Mapping from a key to an equal-length numeric sequence.

    Yields:
        Tuples ``(key_a, key_b, similarity)`` where ``similarity`` is
        ``1 - cosine distance`` of the two sequences.
    """
    for key_a, key_b in combinations(ratings, 2):
        # scipy returns the cosine *distance*; similarity is its complement.
        # NOTE: cosine similarity is undefined for an all-zero vector (zero
        # norm); no such vector occurs in the sample data above.
        similarity = 1 - spatial.distance.cosine(ratings[key_a], ratings[key_b])
        yield key_a, key_b, similarity


for user_a, user_b, similarity in iter_pairwise_similarity(d):
    print(f"Cosine similarity between {user_a} and {user_b} is {similarity}")

或

使用 pandas 和 scikit-learn：

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Ratings keyed by user. Every list has the same length (11 entries), so the
# dict converts directly into a rectangular DataFrame.
d = {
    'U1': [3, 4, 2, 5, 0, 4, 1, 3, 0, 0, 4],
    'U2': [2, 3, 1, 0, 3, 0, 2, 0, 0, 3, 0],
    'U3': [0, 4, 0, 5, 0, 4, 0, 3, 0, 2, 4],
    'U4': [0, 0, 2, 1, 4, 3, 2, 0, 0, 2, 0],
    'U5': [0, 0, 0, 5, 0, 4, 0, 3, 0, 0, 4],
    'U6': [2, 3, 4, 0, 3, 0, 3, 0, 3, 4, 0],
    'U7': [0, 4, 3, 5, 0, 5, 0, 0, 0, 0, 4],
    'U8': [4, 3, 0, 3, 4, 2, 2, 0, 2, 3, 2],
    'U9': [0, 2, 0, 3, 1, 0, 1, 0, 0, 2, 0],
    'U10': [0, 3, 0, 4, 3, 3, 0, 3, 0, 4, 4],
    'U11': [2, 2, 1, 2, 1, 0, 2, 0, 1, 0, 2],
    'U12': [0, 4, 4, 5, 0, 0, 0, 3, 0, 4, 5],
    'U13': [3, 3, 0, 2, 2, 3, 2, 0, 2, 0, 3],
    'U14': [0, 3, 4, 5, 0, 5, 0, 0, 0, 4, 0],
    'U15': [2, 0, 0, 3, 0, 2, 2, 3, 0, 0, 3],
    'U16': [4, 4, 0, 4, 3, 4, 0, 3, 0, 3, 0],
    'U17': [0, 2, 0, 3, 1, 0, 2, 0, 1, 0, 3],
    'U18': [2, 3, 1, 0, 3, 2, 3, 2, 0, 2, 0],
    'U19': [0, 5, 0, 4, 0, 3, 0, 4, 0, 0, 5],
    'U20': [0, 0, 3, 0, 3, 0, 4, 0, 2, 0, 0],
    'U21': [3, 0, 2, 4, 2, 3, 0, 4, 2, 3, 3],
    'U22': [4, 4, 0, 5, 3, 5, 0, 4, 0, 3, 0],
    'U23': [3, 0, 0, 0, 3, 0, 2, 0, 0, 4, 0],
    'U24': [4, 0, 3, 0, 3, 0, 3, 0, 0, 2, 2],
    'U25': [0, 5, 0, 3, 3, 4, 0, 3, 3, 4, 4],
}

# Each dict key becomes a column, so df has one column per user.
df = pd.DataFrame(d)

# cosine_similarity compares the rows of its input, so transpose to put one
# user per row; the result is a square user-by-user similarity matrix.
cos_df = pd.DataFrame(cosine_similarity(df.T), columns=df.columns)

# The matrix rows carry a plain 0..n-1 index; add the user labels as a leading
# "Columns" column so each row can be identified in the printed table.
cos_df.insert(0, "Columns", df.columns)
print(cos_df)

输出：

  Columns        U1        U2        U3        U4  ...       U21       U22       U23       U24       U25
0       U1  1.000000  0.374228  0.902462  0.380803  ...  0.787351  0.805479  0.182123  0.414455  0.742959
1       U2  0.374228  1.000000  0.323498  0.648886  ...  0.428580  0.588035  0.838144  0.746816  0.574696
2       U3  0.902462  0.323498  1.000000  0.367348  ...  0.747476  0.790950  0.139942  0.181195  0.867595
3       U4  0.380803  0.648886  0.367348  1.000000  ...  0.562244  0.572351  0.631579  0.636035  0.543830
4       U5  0.829156  0.000000  0.876038  0.339457  ...  0.770675  0.651439  0.000000  0.137890  0.660241
5       U6  0.348816  0.864242  0.254164  0.650011  ...  0.500694  0.448630  0.707365  0.759113  0.553116
6       U7  0.888018  0.262071  0.870404  0.442141  ...  0.621170  0.642383  0.000000  0.249542  0.712893
7       U8  0.671751  0.808290  0.610121  0.655610  ...  0.735867  0.793363  0.749269  0.711438  0.774202
8       U9  0.561951  0.650011  0.667940  0.483810  ...  0.512989  0.681623  0.483810  0.321246  0.659221
9      U10  0.768376  0.545545  0.905945  0.584094  ...  0.817316  0.810441  0.442495  0.381958  0.930116
10     U11  0.766131  0.625543  0.584602  0.405906  ...  0.606128  0.561442  0.439732  0.700749  0.599162
11     U12  0.769604  0.451144  0.813118  0.329333  ...  0.724166  0.583435  0.250921  0.406111  0.740772
12     U13  0.806747  0.577813  0.687871  0.517409  ...  0.666687  0.708161  0.427425  0.582552  0.757112
13     U14  0.695436  0.436785  0.734756  0.612195  ...  0.644610  0.720248  0.272087  0.293578  0.662689
14     U15  0.849837  0.213504  0.759751  0.337691  ...  0.805629  0.669039  0.259762  0.448449  0.582825
15     U16  0.781028  0.663914  0.757364  0.578184  ...  0.785252  0.992774  0.561179  0.455047  0.783178
16     U17  0.713653  0.409462  0.713247  0.337227  ...  0.528221  0.456211  0.214599  0.396942  0.669745
17     U18  0.569298  0.879408  0.487692  0.733674  ...  0.573070  0.741858  0.709218  0.696631  0.664230
18     U19  0.898717  0.262071  0.949531  0.221071  ...  0.656330  0.691049  0.000000  0.146789  0.813301
19     U20  0.165567  0.540738  0.000000  0.684211  ...  0.290191  0.135557  0.447368  0.681466  0.233070
20     U21  0.787351  0.428580  0.747476  0.562244  ...  1.000000  0.809693  0.489696  0.563602  0.771035
21     U22  0.805479  0.588035  0.790950  0.572351  ...  0.809693  1.000000  0.497042  0.403039  0.782601
22     U23  0.182123  0.838144  0.139942  0.631579  ...  0.489696  0.497042  1.000000  0.795044  0.388450
23     U24  0.414455  0.746816  0.181195  0.636035  ...  0.563602  0.403039  0.795044  1.000000  0.335306
24     U25  0.742959  0.574696  0.867595  0.543830  ...  0.771035  0.782601  0.388450  0.335306  1.000000

[25 rows x 26 columns]

如何取两个字典值来求 Python 中的余弦相似度？

How to take two dictionary values to find cosine similarity in Python?

python

loops

trigonometry