如何使用 fuzzywuzzy 根据 dataframe1 对 dataframe2 进行排序
how to sort dataframe2 according to dataframe1 with fuzzywuzzy
我知道这是个老问题,事实上我已经看到很多与我的问题相关的链接:
How to compare a value in one dataframe to a column in another using fuzzywuzzy ratio
What's the best way to use fuzzywuzzy to compare each value of a column with all the values of a separate dataframe's column?
但是我没有得到任何合适的解决方案
下面是我的代码:
# Candidate products: each dict maps one target column name to one
# product description.
g = [
    {'column1': 'ryzen 5 5600'},
    {'column1': 'ram 8 gb ddr4 3.2ghz'},
    {'column2': 'SSD 220gb'},
    {'column3': 'windows 10 prof'},
    {'column2': 'ryzen 5 3600'},
    {'column1': 'ram 16 gb ddr4'},
]
df1 = pd.read_excel('product1.xlsx', header=None, index_col=False)

# Flatten every cell of the sheet into a single comma-separated string;
# the candidates below are scored against this whole string at once.
s = ', '.join(', '.join(row) for row in df1.values)

# Minimum fuzz.token_set_ratio score for a candidate to be kept.
MIN_MATCH_SCORE = 30
guessed_word = [
    d for d in g
    if fuzz.token_set_ratio(s, list(d.values())[0]) >= MIN_MATCH_SCORE
]
product1.xlsx 包含:
0 GB ddr4
1 HDD 256GB
2 SSD
3 ryzen 5
4 Win 10 Pro
guessed_word 包含:
#gives good output
[{'column1': 'ryzen 5 5600'},
{'column1': 'ram 8 gb ddr4 3.2ghz'},
{'column2': 'SSD 220gb'},
{'column3': 'windows 10 prof'},
{'column2': 'ryzen 5 3600'},
{'column1': 'ram 16 gb ddr4'}]
附加到数据框后:
# Build a frame from the list of single-key dicts; keys become columns.
df3 = pd.DataFrame(guessed_word)
df3 包含:
column1 column2 column3
ryzen 5 5600 SSD 220gb windows 10 prof
ram 8 gb ddr4 3.2ghz ryzen 5 3600
ram 16 gb ddr4
但我想要以下输出:
#product1 column1 column2 column3
0 GB ddr4 ram 8 gb ddr4 3.2ghz, ram 16 gb ddr4 NAN NAN
1 HDD 256GB NAN NAN NAN
2 SSD NAN SSD 220gb NAN
3 ryzen 5 ryzen 5 5600 ryzen 5 3600 NAN
4 Win 10 Pro NAN NAN windows 10 prof
是否可以使用 df.sort_values 或其他方式进行排序?
我都试过了,但没有一种方法有效。
代码有点长,但它完全符合您的预期。
import re
import pandas as pd
#from fuzzywuzzy import fuzz, process
class CustomMatcher:
    """Word-based fuzzy matcher that compares words character by character.

    Numbers are first separated from adjacent characters with spaces so
    that they are matched as words of their own (``'220gb'`` is treated
    as ``'220'`` and ``'gb'``).
    """

    # A "number" is a run of digits and dots, e.g. ``3.2`` or ``5600``.
    _NUMBER = re.compile(r'([0-9.]+)')
    _NUMBER_THEN_OTHER = re.compile(r'([0-9.]+)([^0-9.])')

    def add_space_before_numbers(self, text: str) -> str:
        """Return *text* with a space inserted in front of every number."""
        return self._NUMBER.sub(r' \1', text)

    def add_space_after_numbers(self, text: str) -> str:
        """Return *text* with a space inserted after every number that is
        directly followed by a non-number character."""
        return self._NUMBER_THEN_OTHER.sub(r'\1 \2', text)

    def pad_spaces(self, text: str) -> str:
        """Isolate numbers as separate words and normalise whitespace.

        The result has single spaces between words and no leading or
        trailing whitespace.
        """
        result = self.add_space_before_numbers(text)
        result = self.add_space_after_numbers(result)
        # Padding can produce runs of spaces; collapse them all.
        return ' '.join(result.split())

    def partial_word_score(self, word1, word2):
        """Score two words by position-wise character equality.

        Returns the percentage (0-100) of positions, counted over the
        length of the shorter word, at which both words hold the same
        character. Returns 0 when either word is empty.
        """
        shorter = min(len(word1), len(word2))
        if shorter == 0:
            return 0
        # zip stops at the shorter word, so only shared positions count.
        matches = sum(a == b for a, b in zip(word1, word2))
        return (matches * 100) / shorter

    def match(self, comparand, target):
        """Return the accumulated match score of *comparand* vs *target*.

        Both strings are lower-cased and split into words; every word of
        one is scored against every word of the other. Each pair's score
        is weighted by the pair's shorter word length relative to the
        shorter of the two raw input strings, and the weighted scores
        are summed. Returns 0 when either input is empty.
        """
        shortest_len = min(len(comparand), len(target))
        if shortest_len == 0:
            # Nothing to compare; also avoids dividing by zero below.
            return 0

        comparand_words = self.pad_spaces(comparand.lower()).split()
        target_words = self.pad_spaces(target.lower()).split()

        complete_score = 0
        for t_word in target_words:
            for c_word in comparand_words:
                weight = min(len(t_word), len(c_word)) / shortest_len
                complete_score += (
                    self.partial_word_score(t_word, c_word) * weight
                )
        return complete_score
# Candidate products: each dict maps one output column name to one
# product description.
search_array = [
    {'column1': 'ryzen 5 5600'},
    {'column1': 'ram 8 gb ddr4 3.2ghz'},
    {'column2': 'SSD 220gb'},
    {'column3': 'windows 10 prof'},
    {'column2': 'ryzen 5 3600'},
    {'column1': 'ram 16 gb ddr4'},
]

# Group the candidate descriptions by target column, keeping input order.
# Only the first key of every entry is used.
search_dict = {}
for entry in search_array:
    key, value = next(iter(entry.items()))
    search_dict.setdefault(key, []).append(value)

filename = 'product1.xlsx'
products_sheet = pd.read_excel(filename, header=None, index_col=False)
# The first cell of every row is taken as the product name.
products_list = [x[0] for x in products_sheet.values.tolist()]

# A candidate is accepted when its score is strictly above this value.
MATCH_THRESHOLD = 50
cm = CustomMatcher()

# Column #1 is named after the workbook (file name without '.xlsx').
result_data = {filename.replace('.xlsx', ''): products_list}

# Columns #2-#n: one list per target column, filled row by row below.
columns = list(search_dict)
for column in columns:
    result_data[column] = []

for row in products_list:
    for column in columns:
        matched_products_list = []
        for product in search_dict[column]:
            print(f'Comparing {row} to {product} is:\t', end='')
            matching_score = cm.match(row, product)
            if matching_score > MATCH_THRESHOLD:
                print(matching_score, ' accepted')
                matched_products_list.append(product)
            else:
                print(matching_score, ' rejected')
        # A cell holds the list of accepted products, or 'NAN' if none.
        result_data[column].append(matched_products_list or 'NAN')

result_df = pd.DataFrame(data=result_data)
print(result_df)
备注:
我自己编写了一个 CustomMatcher,而没有使用 fuzzywuzzy,因为后者给出的分数波动太大,找不到一个有意义的阈值来做过滤。
CustomMatcher 以单词为单位计算分数,而单词之间则逐个字符进行比较。它会先用空格把数字与相邻字符隔开,使数字成为独立的单词再参与匹配。这 50 多行代码只需通过函数 CustomMatcher.match(word1, word2) 即可调用。
我使用 matching_score>50 作为适合您这个应用场景的匹配灵敏度阈值。
您不必把同一单元格中的多个条目拼接成一个字符串;我改用列表来存放它们,以便单独访问其中的每个条目。
输出被打包为pandas数据帧。
谢谢,
我知道这是个老问题,事实上我已经看到很多与我的问题相关的链接:
How to compare a value in one dataframe to a column in another using fuzzywuzzy ratio
What's the best way to use fuzzywuzzy to compare each value of a column with all the values of a separate dataframe's column?
但是我没有得到任何合适的解决方案
下面是我的代码:
# Candidate products: each dict maps one target column name to one
# product description.
g = [
    {'column1': 'ryzen 5 5600'},
    {'column1': 'ram 8 gb ddr4 3.2ghz'},
    {'column2': 'SSD 220gb'},
    {'column3': 'windows 10 prof'},
    {'column2': 'ryzen 5 3600'},
    {'column1': 'ram 16 gb ddr4'},
]
df1 = pd.read_excel('product1.xlsx', header=None, index_col=False)

# Flatten every cell of the sheet into a single comma-separated string;
# the candidates below are scored against this whole string at once.
s = ', '.join(', '.join(row) for row in df1.values)

# Minimum fuzz.token_set_ratio score for a candidate to be kept.
MIN_MATCH_SCORE = 30
guessed_word = [
    d for d in g
    if fuzz.token_set_ratio(s, list(d.values())[0]) >= MIN_MATCH_SCORE
]
product1.xlsx 包含:
0 GB ddr4
1 HDD 256GB
2 SSD
3 ryzen 5
4 Win 10 Pro
guessed_word 包含:
#gives good output
[{'column1': 'ryzen 5 5600'},
{'column1': 'ram 8 gb ddr4 3.2ghz'},
{'column2': 'SSD 220gb'},
{'column3': 'windows 10 prof'},
{'column2': 'ryzen 5 3600'},
{'column1': 'ram 16 gb ddr4'}]
附加到数据框后:
# Build a frame from the list of single-key dicts; keys become columns.
df3 = pd.DataFrame(guessed_word)
df3 包含:
column1 column2 column3
ryzen 5 5600 SSD 220gb windows 10 prof
ram 8 gb ddr4 3.2ghz ryzen 5 3600
ram 16 gb ddr4
但我想要以下输出:
#product1 column1 column2 column3
0 GB ddr4 ram 8 gb ddr4 3.2ghz, ram 16 gb ddr4 NAN NAN
1 HDD 256GB NAN NAN NAN
2 SSD NAN SSD 220gb NAN
3 ryzen 5 ryzen 5 5600 ryzen 5 3600 NAN
4 Win 10 Pro NAN NAN windows 10 prof
是否可以使用 df.sort_values 或其他方式进行排序?我都试过了,但没有一种方法有效。
代码有点长,但它完全符合您的预期。
import re
import pandas as pd
#from fuzzywuzzy import fuzz, process
class CustomMatcher:
    """Word-based fuzzy matcher that compares words character by character.

    Numbers are first separated from adjacent characters with spaces so
    that they are matched as words of their own (``'220gb'`` is treated
    as ``'220'`` and ``'gb'``).
    """

    # A "number" is a run of digits and dots, e.g. ``3.2`` or ``5600``.
    _NUMBER = re.compile(r'([0-9.]+)')
    _NUMBER_THEN_OTHER = re.compile(r'([0-9.]+)([^0-9.])')

    def add_space_before_numbers(self, text: str) -> str:
        """Return *text* with a space inserted in front of every number."""
        return self._NUMBER.sub(r' \1', text)

    def add_space_after_numbers(self, text: str) -> str:
        """Return *text* with a space inserted after every number that is
        directly followed by a non-number character."""
        return self._NUMBER_THEN_OTHER.sub(r'\1 \2', text)

    def pad_spaces(self, text: str) -> str:
        """Isolate numbers as separate words and normalise whitespace.

        The result has single spaces between words and no leading or
        trailing whitespace.
        """
        result = self.add_space_before_numbers(text)
        result = self.add_space_after_numbers(result)
        # Padding can produce runs of spaces; collapse them all.
        return ' '.join(result.split())

    def partial_word_score(self, word1, word2):
        """Score two words by position-wise character equality.

        Returns the percentage (0-100) of positions, counted over the
        length of the shorter word, at which both words hold the same
        character. Returns 0 when either word is empty.
        """
        shorter = min(len(word1), len(word2))
        if shorter == 0:
            return 0
        # zip stops at the shorter word, so only shared positions count.
        matches = sum(a == b for a, b in zip(word1, word2))
        return (matches * 100) / shorter

    def match(self, comparand, target):
        """Return the accumulated match score of *comparand* vs *target*.

        Both strings are lower-cased and split into words; every word of
        one is scored against every word of the other. Each pair's score
        is weighted by the pair's shorter word length relative to the
        shorter of the two raw input strings, and the weighted scores
        are summed. Returns 0 when either input is empty.
        """
        shortest_len = min(len(comparand), len(target))
        if shortest_len == 0:
            # Nothing to compare; also avoids dividing by zero below.
            return 0

        comparand_words = self.pad_spaces(comparand.lower()).split()
        target_words = self.pad_spaces(target.lower()).split()

        complete_score = 0
        for t_word in target_words:
            for c_word in comparand_words:
                weight = min(len(t_word), len(c_word)) / shortest_len
                complete_score += (
                    self.partial_word_score(t_word, c_word) * weight
                )
        return complete_score
# Candidate products: each dict maps one output column name to one
# product description.
search_array = [
    {'column1': 'ryzen 5 5600'},
    {'column1': 'ram 8 gb ddr4 3.2ghz'},
    {'column2': 'SSD 220gb'},
    {'column3': 'windows 10 prof'},
    {'column2': 'ryzen 5 3600'},
    {'column1': 'ram 16 gb ddr4'},
]

# Group the candidate descriptions by target column, keeping input order.
# Only the first key of every entry is used.
search_dict = {}
for entry in search_array:
    key, value = next(iter(entry.items()))
    search_dict.setdefault(key, []).append(value)

filename = 'product1.xlsx'
products_sheet = pd.read_excel(filename, header=None, index_col=False)
# The first cell of every row is taken as the product name.
products_list = [x[0] for x in products_sheet.values.tolist()]

# A candidate is accepted when its score is strictly above this value.
MATCH_THRESHOLD = 50
cm = CustomMatcher()

# Column #1 is named after the workbook (file name without '.xlsx').
result_data = {filename.replace('.xlsx', ''): products_list}

# Columns #2-#n: one list per target column, filled row by row below.
columns = list(search_dict)
for column in columns:
    result_data[column] = []

for row in products_list:
    for column in columns:
        matched_products_list = []
        for product in search_dict[column]:
            print(f'Comparing {row} to {product} is:\t', end='')
            matching_score = cm.match(row, product)
            if matching_score > MATCH_THRESHOLD:
                print(matching_score, ' accepted')
                matched_products_list.append(product)
            else:
                print(matching_score, ' rejected')
        # A cell holds the list of accepted products, or 'NAN' if none.
        result_data[column].append(matched_products_list or 'NAN')

result_df = pd.DataFrame(data=result_data)
print(result_df)
备注:
我自己编写了一个 CustomMatcher,而没有使用 fuzzywuzzy,因为后者给出的分数波动太大,找不到一个有意义的阈值来做过滤。
CustomMatcher 以单词为单位计算分数,而单词之间则逐个字符进行比较。它会先用空格把数字与相邻字符隔开,使数字成为独立的单词再参与匹配。这 50 多行代码只需通过函数 CustomMatcher.match(word1, word2) 即可调用。
我使用 matching_score>50 作为适合您这个应用场景的匹配灵敏度阈值。
您不必把同一单元格中的多个条目拼接成一个字符串;我改用列表来存放它们,以便单独访问其中的每个条目。
输出被打包为pandas数据帧。
谢谢,