如何取两个字典值来求 Python 中的余弦相似度?
How to take two dictionary values to find cosine similarity in Python?
我需要根据存储在 user_dict 字典中的“用户评分”,计算两个评分向量之间的余弦相似度。
评分是从 CSV 文件导入的,然后转换为以用户为键、以该用户的各项评分为值的字典。我的问题是:如何遍历字典以取出两个用户的评分,并使用余弦距离函数计算它们的相似度?
循环不应将用户与其自身比较,也不应以不同的顺序重复比较同一对用户(例如,用户 5 与用户 3,以及用户 3 与用户 5)。
您可以使用 itertools.combinations(dct 是您的输入字典):
from itertools import combinations

# combinations(dct, 2) yields each unordered pair of distinct keys exactly once,
# so a user is never paired with itself and no pair is repeated in reverse order.
for k1, k2 in combinations(dct, 2):
    # compute cosine similarity between dct[k1] and dct[k2]
    ...
from itertools import combinations

from scipy import spatial

# Ratings keyed by user id; each value is that user's list of ratings.
# (The original literal listed 'U6' twice with the same value; one entry is enough.)
d = {
    'U1': [3, 4, 2, 5, 0, 4, 1, 3, 0, 0, 4],
    'U2': [2, 3, 1, 0, 3, 0, 2, 0, 0, 3, 0],
    'U3': [0, 4, 0, 5, 0, 4, 0, 3, 0, 2, 4],
    'U4': [0, 0, 2, 1, 4, 3, 2, 0, 0, 2, 0],
    'U5': [0, 0, 0, 5, 0, 4, 0, 3, 0, 0, 4],
    'U6': [2, 3, 4, 0, 3, 0, 3, 0, 3, 4, 0],
    'U7': [0, 4, 3, 5, 0, 5, 0, 0, 0, 0, 4],
    'U8': [4, 3, 0, 3, 4, 2, 2, 0, 2, 3, 2],
    'U9': [0, 2, 0, 3, 1, 0, 1, 0, 0, 2, 0],
    'U10': [0, 3, 0, 4, 3, 3, 0, 3, 0, 4, 4],
    'U11': [2, 2, 1, 2, 1, 0, 2, 0, 1, 0, 2],
    'U12': [0, 4, 4, 5, 0, 0, 0, 3, 0, 4, 5],
    'U13': [3, 3, 0, 2, 2, 3, 2, 0, 2, 0, 3],
    'U14': [0, 3, 4, 5, 0, 5, 0, 0, 0, 4, 0],
    'U15': [2, 0, 0, 3, 0, 2, 2, 3, 0, 0, 3],
    'U16': [4, 4, 0, 4, 3, 4, 0, 3, 0, 3, 0],
    'U17': [0, 2, 0, 3, 1, 0, 2, 0, 1, 0, 3],
    'U18': [2, 3, 1, 0, 3, 2, 3, 2, 0, 2, 0],
    'U19': [0, 5, 0, 4, 0, 3, 0, 4, 0, 0, 5],
    'U20': [0, 0, 3, 0, 3, 0, 4, 0, 2, 0, 0],
    'U21': [3, 0, 2, 4, 2, 3, 0, 4, 2, 3, 3],
    'U22': [4, 4, 0, 5, 3, 5, 0, 4, 0, 3, 0],
    'U23': [3, 0, 0, 0, 3, 0, 2, 0, 0, 4, 0],
    'U24': [4, 0, 3, 0, 3, 0, 3, 0, 0, 2, 2],
    'U25': [0, 5, 0, 3, 3, 4, 0, 3, 3, 4, 4],
}

# combinations() visits every unordered pair of distinct users exactly once, in
# the same order as the nested "j > i" index loops it replaces.
for user_a, user_b in combinations(d, 2):
    # scipy returns the cosine *distance*; the similarity is its complement.
    similarity = 1 - spatial.distance.cosine(d[user_a], d[user_b])
    print(f"Cosine similarity between {user_a} and {user_b} is {similarity}")
或者,使用 pandas 和 scikit-learn:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# Ratings keyed by user id; each value is that user's list of ratings.
# (The original literal listed 'U6' twice with the same value; one entry is enough.)
d = {
    'U1': [3, 4, 2, 5, 0, 4, 1, 3, 0, 0, 4],
    'U2': [2, 3, 1, 0, 3, 0, 2, 0, 0, 3, 0],
    'U3': [0, 4, 0, 5, 0, 4, 0, 3, 0, 2, 4],
    'U4': [0, 0, 2, 1, 4, 3, 2, 0, 0, 2, 0],
    'U5': [0, 0, 0, 5, 0, 4, 0, 3, 0, 0, 4],
    'U6': [2, 3, 4, 0, 3, 0, 3, 0, 3, 4, 0],
    'U7': [0, 4, 3, 5, 0, 5, 0, 0, 0, 0, 4],
    'U8': [4, 3, 0, 3, 4, 2, 2, 0, 2, 3, 2],
    'U9': [0, 2, 0, 3, 1, 0, 1, 0, 0, 2, 0],
    'U10': [0, 3, 0, 4, 3, 3, 0, 3, 0, 4, 4],
    'U11': [2, 2, 1, 2, 1, 0, 2, 0, 1, 0, 2],
    'U12': [0, 4, 4, 5, 0, 0, 0, 3, 0, 4, 5],
    'U13': [3, 3, 0, 2, 2, 3, 2, 0, 2, 0, 3],
    'U14': [0, 3, 4, 5, 0, 5, 0, 0, 0, 4, 0],
    'U15': [2, 0, 0, 3, 0, 2, 2, 3, 0, 0, 3],
    'U16': [4, 4, 0, 4, 3, 4, 0, 3, 0, 3, 0],
    'U17': [0, 2, 0, 3, 1, 0, 2, 0, 1, 0, 3],
    'U18': [2, 3, 1, 0, 3, 2, 3, 2, 0, 2, 0],
    'U19': [0, 5, 0, 4, 0, 3, 0, 4, 0, 0, 5],
    'U20': [0, 0, 3, 0, 3, 0, 4, 0, 2, 0, 0],
    'U21': [3, 0, 2, 4, 2, 3, 0, 4, 2, 3, 3],
    'U22': [4, 4, 0, 5, 3, 5, 0, 4, 0, 3, 0],
    'U23': [3, 0, 0, 0, 3, 0, 2, 0, 0, 4, 0],
    'U24': [4, 0, 3, 0, 3, 0, 3, 0, 0, 2, 2],
    'U25': [0, 5, 0, 3, 3, 4, 0, 3, 3, 4, 4],
}

# Each dict key becomes a column, so the frame has one column per user.
df = pd.DataFrame(d)

# cosine_similarity compares the rows of its input, so transpose to get one
# row per user; the result is a square user-by-user similarity matrix.
cos_df = pd.DataFrame(cosine_similarity(df.T), columns=df.columns)
# Prepend the user labels so each row of the printed table can be identified.
cos_df.insert(0, "Columns", df.columns)
print(cos_df)
输出:
Columns U1 U2 U3 U4 ... U21 U22 U23 U24 U25
0 U1 1.000000 0.374228 0.902462 0.380803 ... 0.787351 0.805479 0.182123 0.414455 0.742959
1 U2 0.374228 1.000000 0.323498 0.648886 ... 0.428580 0.588035 0.838144 0.746816 0.574696
2 U3 0.902462 0.323498 1.000000 0.367348 ... 0.747476 0.790950 0.139942 0.181195 0.867595
3 U4 0.380803 0.648886 0.367348 1.000000 ... 0.562244 0.572351 0.631579 0.636035 0.543830
4 U5 0.829156 0.000000 0.876038 0.339457 ... 0.770675 0.651439 0.000000 0.137890 0.660241
5 U6 0.348816 0.864242 0.254164 0.650011 ... 0.500694 0.448630 0.707365 0.759113 0.553116
6 U7 0.888018 0.262071 0.870404 0.442141 ... 0.621170 0.642383 0.000000 0.249542 0.712893
7 U8 0.671751 0.808290 0.610121 0.655610 ... 0.735867 0.793363 0.749269 0.711438 0.774202
8 U9 0.561951 0.650011 0.667940 0.483810 ... 0.512989 0.681623 0.483810 0.321246 0.659221
9 U10 0.768376 0.545545 0.905945 0.584094 ... 0.817316 0.810441 0.442495 0.381958 0.930116
10 U11 0.766131 0.625543 0.584602 0.405906 ... 0.606128 0.561442 0.439732 0.700749 0.599162
11 U12 0.769604 0.451144 0.813118 0.329333 ... 0.724166 0.583435 0.250921 0.406111 0.740772
12 U13 0.806747 0.577813 0.687871 0.517409 ... 0.666687 0.708161 0.427425 0.582552 0.757112
13 U14 0.695436 0.436785 0.734756 0.612195 ... 0.644610 0.720248 0.272087 0.293578 0.662689
14 U15 0.849837 0.213504 0.759751 0.337691 ... 0.805629 0.669039 0.259762 0.448449 0.582825
15 U16 0.781028 0.663914 0.757364 0.578184 ... 0.785252 0.992774 0.561179 0.455047 0.783178
16 U17 0.713653 0.409462 0.713247 0.337227 ... 0.528221 0.456211 0.214599 0.396942 0.669745
17 U18 0.569298 0.879408 0.487692 0.733674 ... 0.573070 0.741858 0.709218 0.696631 0.664230
18 U19 0.898717 0.262071 0.949531 0.221071 ... 0.656330 0.691049 0.000000 0.146789 0.813301
19 U20 0.165567 0.540738 0.000000 0.684211 ... 0.290191 0.135557 0.447368 0.681466 0.233070
20 U21 0.787351 0.428580 0.747476 0.562244 ... 1.000000 0.809693 0.489696 0.563602 0.771035
21 U22 0.805479 0.588035 0.790950 0.572351 ... 0.809693 1.000000 0.497042 0.403039 0.782601
22 U23 0.182123 0.838144 0.139942 0.631579 ... 0.489696 0.497042 1.000000 0.795044 0.388450
23 U24 0.414455 0.746816 0.181195 0.636035 ... 0.563602 0.403039 0.795044 1.000000 0.335306
24 U25 0.742959 0.574696 0.867595 0.543830 ... 0.771035 0.782601 0.388450 0.335306 1.000000
[25 rows x 26 columns]
我需要根据存储在 user_dict 字典中的“用户评分”,计算两个评分向量之间的余弦相似度。
评分是从 CSV 文件导入的,然后转换为以用户为键、以该用户的各项评分为值的字典。我的问题是:如何遍历字典以取出两个用户的评分,并使用余弦距离函数计算它们的相似度?
循环不应将用户与其自身比较,也不应以不同的顺序重复比较同一对用户(例如,用户 5 与用户 3,以及用户 3 与用户 5)。
您可以使用 itertools.combinations(dct 是您的输入字典):
from itertools import combinations

# combinations(dct, 2) yields each unordered pair of distinct keys exactly once,
# so a user is never paired with itself and no pair is repeated in reverse order.
for k1, k2 in combinations(dct, 2):
    # compute cosine similarity between dct[k1] and dct[k2]
    ...
from itertools import combinations

from scipy import spatial

# Ratings keyed by user id; each value is that user's list of ratings.
# (The original literal listed 'U6' twice with the same value; one entry is enough.)
d = {
    'U1': [3, 4, 2, 5, 0, 4, 1, 3, 0, 0, 4],
    'U2': [2, 3, 1, 0, 3, 0, 2, 0, 0, 3, 0],
    'U3': [0, 4, 0, 5, 0, 4, 0, 3, 0, 2, 4],
    'U4': [0, 0, 2, 1, 4, 3, 2, 0, 0, 2, 0],
    'U5': [0, 0, 0, 5, 0, 4, 0, 3, 0, 0, 4],
    'U6': [2, 3, 4, 0, 3, 0, 3, 0, 3, 4, 0],
    'U7': [0, 4, 3, 5, 0, 5, 0, 0, 0, 0, 4],
    'U8': [4, 3, 0, 3, 4, 2, 2, 0, 2, 3, 2],
    'U9': [0, 2, 0, 3, 1, 0, 1, 0, 0, 2, 0],
    'U10': [0, 3, 0, 4, 3, 3, 0, 3, 0, 4, 4],
    'U11': [2, 2, 1, 2, 1, 0, 2, 0, 1, 0, 2],
    'U12': [0, 4, 4, 5, 0, 0, 0, 3, 0, 4, 5],
    'U13': [3, 3, 0, 2, 2, 3, 2, 0, 2, 0, 3],
    'U14': [0, 3, 4, 5, 0, 5, 0, 0, 0, 4, 0],
    'U15': [2, 0, 0, 3, 0, 2, 2, 3, 0, 0, 3],
    'U16': [4, 4, 0, 4, 3, 4, 0, 3, 0, 3, 0],
    'U17': [0, 2, 0, 3, 1, 0, 2, 0, 1, 0, 3],
    'U18': [2, 3, 1, 0, 3, 2, 3, 2, 0, 2, 0],
    'U19': [0, 5, 0, 4, 0, 3, 0, 4, 0, 0, 5],
    'U20': [0, 0, 3, 0, 3, 0, 4, 0, 2, 0, 0],
    'U21': [3, 0, 2, 4, 2, 3, 0, 4, 2, 3, 3],
    'U22': [4, 4, 0, 5, 3, 5, 0, 4, 0, 3, 0],
    'U23': [3, 0, 0, 0, 3, 0, 2, 0, 0, 4, 0],
    'U24': [4, 0, 3, 0, 3, 0, 3, 0, 0, 2, 2],
    'U25': [0, 5, 0, 3, 3, 4, 0, 3, 3, 4, 4],
}

# combinations() visits every unordered pair of distinct users exactly once, in
# the same order as the nested "j > i" index loops it replaces.
for user_a, user_b in combinations(d, 2):
    # scipy returns the cosine *distance*; the similarity is its complement.
    similarity = 1 - spatial.distance.cosine(d[user_a], d[user_b])
    print(f"Cosine similarity between {user_a} and {user_b} is {similarity}")
或者,使用 pandas 和 scikit-learn:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# Ratings keyed by user id; each value is that user's list of ratings.
# (The original literal listed 'U6' twice with the same value; one entry is enough.)
d = {
    'U1': [3, 4, 2, 5, 0, 4, 1, 3, 0, 0, 4],
    'U2': [2, 3, 1, 0, 3, 0, 2, 0, 0, 3, 0],
    'U3': [0, 4, 0, 5, 0, 4, 0, 3, 0, 2, 4],
    'U4': [0, 0, 2, 1, 4, 3, 2, 0, 0, 2, 0],
    'U5': [0, 0, 0, 5, 0, 4, 0, 3, 0, 0, 4],
    'U6': [2, 3, 4, 0, 3, 0, 3, 0, 3, 4, 0],
    'U7': [0, 4, 3, 5, 0, 5, 0, 0, 0, 0, 4],
    'U8': [4, 3, 0, 3, 4, 2, 2, 0, 2, 3, 2],
    'U9': [0, 2, 0, 3, 1, 0, 1, 0, 0, 2, 0],
    'U10': [0, 3, 0, 4, 3, 3, 0, 3, 0, 4, 4],
    'U11': [2, 2, 1, 2, 1, 0, 2, 0, 1, 0, 2],
    'U12': [0, 4, 4, 5, 0, 0, 0, 3, 0, 4, 5],
    'U13': [3, 3, 0, 2, 2, 3, 2, 0, 2, 0, 3],
    'U14': [0, 3, 4, 5, 0, 5, 0, 0, 0, 4, 0],
    'U15': [2, 0, 0, 3, 0, 2, 2, 3, 0, 0, 3],
    'U16': [4, 4, 0, 4, 3, 4, 0, 3, 0, 3, 0],
    'U17': [0, 2, 0, 3, 1, 0, 2, 0, 1, 0, 3],
    'U18': [2, 3, 1, 0, 3, 2, 3, 2, 0, 2, 0],
    'U19': [0, 5, 0, 4, 0, 3, 0, 4, 0, 0, 5],
    'U20': [0, 0, 3, 0, 3, 0, 4, 0, 2, 0, 0],
    'U21': [3, 0, 2, 4, 2, 3, 0, 4, 2, 3, 3],
    'U22': [4, 4, 0, 5, 3, 5, 0, 4, 0, 3, 0],
    'U23': [3, 0, 0, 0, 3, 0, 2, 0, 0, 4, 0],
    'U24': [4, 0, 3, 0, 3, 0, 3, 0, 0, 2, 2],
    'U25': [0, 5, 0, 3, 3, 4, 0, 3, 3, 4, 4],
}

# Each dict key becomes a column, so the frame has one column per user.
df = pd.DataFrame(d)

# cosine_similarity compares the rows of its input, so transpose to get one
# row per user; the result is a square user-by-user similarity matrix.
cos_df = pd.DataFrame(cosine_similarity(df.T), columns=df.columns)
# Prepend the user labels so each row of the printed table can be identified.
cos_df.insert(0, "Columns", df.columns)
print(cos_df)
输出:
Columns U1 U2 U3 U4 ... U21 U22 U23 U24 U25
0 U1 1.000000 0.374228 0.902462 0.380803 ... 0.787351 0.805479 0.182123 0.414455 0.742959
1 U2 0.374228 1.000000 0.323498 0.648886 ... 0.428580 0.588035 0.838144 0.746816 0.574696
2 U3 0.902462 0.323498 1.000000 0.367348 ... 0.747476 0.790950 0.139942 0.181195 0.867595
3 U4 0.380803 0.648886 0.367348 1.000000 ... 0.562244 0.572351 0.631579 0.636035 0.543830
4 U5 0.829156 0.000000 0.876038 0.339457 ... 0.770675 0.651439 0.000000 0.137890 0.660241
5 U6 0.348816 0.864242 0.254164 0.650011 ... 0.500694 0.448630 0.707365 0.759113 0.553116
6 U7 0.888018 0.262071 0.870404 0.442141 ... 0.621170 0.642383 0.000000 0.249542 0.712893
7 U8 0.671751 0.808290 0.610121 0.655610 ... 0.735867 0.793363 0.749269 0.711438 0.774202
8 U9 0.561951 0.650011 0.667940 0.483810 ... 0.512989 0.681623 0.483810 0.321246 0.659221
9 U10 0.768376 0.545545 0.905945 0.584094 ... 0.817316 0.810441 0.442495 0.381958 0.930116
10 U11 0.766131 0.625543 0.584602 0.405906 ... 0.606128 0.561442 0.439732 0.700749 0.599162
11 U12 0.769604 0.451144 0.813118 0.329333 ... 0.724166 0.583435 0.250921 0.406111 0.740772
12 U13 0.806747 0.577813 0.687871 0.517409 ... 0.666687 0.708161 0.427425 0.582552 0.757112
13 U14 0.695436 0.436785 0.734756 0.612195 ... 0.644610 0.720248 0.272087 0.293578 0.662689
14 U15 0.849837 0.213504 0.759751 0.337691 ... 0.805629 0.669039 0.259762 0.448449 0.582825
15 U16 0.781028 0.663914 0.757364 0.578184 ... 0.785252 0.992774 0.561179 0.455047 0.783178
16 U17 0.713653 0.409462 0.713247 0.337227 ... 0.528221 0.456211 0.214599 0.396942 0.669745
17 U18 0.569298 0.879408 0.487692 0.733674 ... 0.573070 0.741858 0.709218 0.696631 0.664230
18 U19 0.898717 0.262071 0.949531 0.221071 ... 0.656330 0.691049 0.000000 0.146789 0.813301
19 U20 0.165567 0.540738 0.000000 0.684211 ... 0.290191 0.135557 0.447368 0.681466 0.233070
20 U21 0.787351 0.428580 0.747476 0.562244 ... 1.000000 0.809693 0.489696 0.563602 0.771035
21 U22 0.805479 0.588035 0.790950 0.572351 ... 0.809693 1.000000 0.497042 0.403039 0.782601
22 U23 0.182123 0.838144 0.139942 0.631579 ... 0.489696 0.497042 1.000000 0.795044 0.388450
23 U24 0.414455 0.746816 0.181195 0.636035 ... 0.563602 0.403039 0.795044 1.000000 0.335306
24 U25 0.742959 0.574696 0.867595 0.543830 ... 0.771035 0.782601 0.388450 0.335306 1.000000
[25 rows x 26 columns]