Python Pandas 字符串列上的两个数据集的 fuzzywuzzy 'join'
Python Pandas fuzzywuzzy 'join' of two datasets on string columns
我正在关注这个 question 中的答案,它使用 fuzzywuzzy 来 'join' 字符串列上的两个数据集。
我收到一条错误消息,但在排查原因时遇到了困难。
错误提示好像是键值问题。假设那是关于空值的,我将它们过滤掉了,但仍然得到相同的错误消息。
这些字符串是公司名称,可能带有撇号、连字符、句点等。我假设 fuzzywuzzy 可以处理这些字符,所以没有事先将它们删除。
有人知道我下一步应该检查什么来解决此问题吗?
以下是使用 Pandas 从 Excel 文件导入数据的代码:
import pandas as pd
from fuzzywuzzy import fuzz
import difflib
# Workbook paths for the two datasets that will be matched.
vendor_file = "vendor.xlsx"
spr_file = "spr.xlsx"

# Open each workbook and read its first worksheet into a DataFrame.
xl_vendor = pd.ExcelFile(vendor_file)
xl_spr = pd.ExcelFile(spr_file)
vendor1 = xl_vendor.parse(xl_vendor.sheet_names[0])
spr1 = xl_spr.parse(xl_spr.sheet_names[0])

# Keep only the rows whose name column is populated.
spr = spr1[spr1['Contractor'].notnull()]
vendor = vendor1[vendor1['Vendor Name'].notnull()]
这是其他问题中将匹配项与数据集进行匹配和串联的部分:
def get_spr(row):
    """Look up the best-scoring ``spr`` row for one row of ``vendor``.

    Called once per ``vendor`` row through ``vendor.apply(get_spr, axis=1)``.
    Returns a two-element Series labelled 'Vendor Name' and 'Pass/Fail';
    both values are empty strings when no score reaches 75.

    NOTE(review): this is the version that raises the KeyError shown in the
    traceback. ``x`` is a row of ``spr`` and ``row`` is a row of ``vendor``,
    but according to the column listing in this post ``spr`` has
    'Contractor' / 'Pass/Fail' and ``vendor`` has 'Vendor Name', so both
    ``x['Vendor Name']`` and ``row['Contractor']`` look up missing keys.
    """
    # fuzz.ratio is only evaluated when the two names are already equal;
    # every other pair scores 1, which is below the 75 cut-off applied next.
    d = spr.apply(lambda x: fuzz.ratio(x['Vendor Name'], row['Contractor']) * 2 if row['Contractor'] == x['Vendor Name'] else 1, axis=1)
    # Discard candidates scoring under 75.
    d = d[d >= 75]
    if len(d) == 0:
        # No candidate left: return two blank placeholders.
        v = ['']*2
    else:
        # Take the two columns from the spr row with the highest score.
        # NOTE(review): .ix no longer exists in pandas >= 1.0 - confirm the
        # pandas version in use.
        v = spr.ix[d.idxmax(), ['Vendor Name', 'Pass/Fail']].values
    return pd.Series(v, index=['Vendor Name', 'Pass/Fail'])
# Top-level statement: keep it at column 0, outside the body of get_spr.
# Runs get_spr on every vendor row and places the returned columns next to
# the original vendor columns (axis=1).
pd.concat((vendor, vendor.apply(get_spr, axis=1)), axis=1)
错误是:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-21-41973cb5c3d7> in <module>()
----> 1 pd.concat((vendor, vendor.apply(get_spr, axis=1)), axis=1)
C:\Anaconda\lib\site-packages\pandas\core\frame.pyc in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
3716 if reduce is None:
3717 reduce = True
-> 3718 return self._apply_standard(f, axis, reduce=reduce)
3719 else:
3720 return self._apply_broadcast(f, axis)
C:\Anaconda\lib\site-packages\pandas\core\frame.pyc in _apply_standard(self, func, axis, ignore_failures, reduce)
3806 try:
3807 for i, v in enumerate(series_gen):
-> 3808 results[i] = func(v)
3809 keys.append(v.name)
3810 except Exception as e:
<ipython-input-19-62cc0c6c6daf> in get_spr(row)
1 def get_spr(row):
----> 2 d = spr.apply(lambda x: fuzz.ratio(x['Vendor Name'], row['Contractor']) * 2 if row['Contractor'] == x['Vendor Name'] else 1, axis=1)
3 d = d[d >= 75]
4 if len(d) == 0:
5 v = ['']*2
C:\Anaconda\lib\site-packages\pandas\core\frame.pyc in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
3716 if reduce is None:
3717 reduce = True
-> 3718 return self._apply_standard(f, axis, reduce=reduce)
3719 else:
3720 return self._apply_broadcast(f, axis)
C:\Anaconda\lib\site-packages\pandas\core\frame.pyc in _apply_standard(self, func, axis, ignore_failures, reduce)
3806 try:
3807 for i, v in enumerate(series_gen):
-> 3808 results[i] = func(v)
3809 keys.append(v.name)
3810 except Exception as e:
<ipython-input-19-62cc0c6c6daf> in <lambda>(x)
1 def get_spr(row):
----> 2 d = spr.apply(lambda x: fuzz.ratio(x['Vendor Name'], row['Contractor']) * 2 if row['Contractor'] == x['Vendor Name'] else 1, axis=1)
3 d = d[d >= 75]
4 if len(d) == 0:
5 v = ['']*2
C:\Anaconda\lib\site-packages\pandas\core\series.pyc in __getitem__(self, key)
519 def __getitem__(self, key):
520 try:
--> 521 result = self.index.get_value(self, key)
522
523 if not np.isscalar(result):
C:\Anaconda\lib\site-packages\pandas\core\index.pyc in get_value(self, series, key)
1607 raise InvalidIndexError(key)
1608 else:
-> 1609 raise e1
1610 except Exception: # pragma: no cover
1611 raise e1
KeyError: ('Contractor', u'occurred at index 3', u'occurred at index 0')
编辑以添加数据框列:
spr: 'Contractor', 'Pass/Fail'
vendor: 'Vendor Name'
编辑以根据 davidshinn 答案添加更正的匹配修订:
def get_spr(row, threshold=75):
    """Find the closest-named ``spr`` row for one row of ``vendor``.

    Intended for ``vendor.apply(get_spr, axis=1)``.

    Args:
        row: One ``vendor`` row; its 'Vendor Name' value is compared against
            every 'Contractor' value in the module-level ``spr`` frame.
        threshold: Minimum ``fuzz.ratio`` score (0-100) a contractor needs to
            count as a match. Defaults to 75, the cut-off that used to be
            hard-coded.

    Returns:
        A Series labelled 'Contractor' and 'Pass/Fail' holding the values of
        the highest-scoring ``spr`` row, or two empty strings when no
        contractor reaches ``threshold``.
    """
    vendor_name = row['Vendor Name']
    # Score every contractor. The earlier form only called fuzz.ratio when
    # the two names were already equal and scored every other pair 1, so a
    # near-match could never pass the cut-off.
    scores = spr['Contractor'].apply(
        lambda name: fuzz.ratio(name, vendor_name)
    )
    scores = scores[scores >= threshold]
    if scores.empty:
        v = [''] * 2
    else:
        # idxmax() returns an index label, so use the label-based .loc
        # indexer; the .ix indexer was removed in pandas 1.0.
        v = spr.loc[scores.idxmax(), ['Contractor', 'Pass/Fail']].values
    return pd.Series(v, index=['Contractor', 'Pass/Fail'])
能否提供 vendor 和 spr 数据框的列名?您确定 Contractor 是 vendor 数据框中的有效列吗?因为 row['Contractor'] 访问的正是 vendor 数据框。
我正在关注这个 question 中的答案,它使用 fuzzywuzzy 来 'join' 字符串列上的两个数据集。
我收到一条错误消息,但在排查原因时遇到了困难。
错误提示好像是键值问题。假设那是关于空值的,我将它们过滤掉了,但仍然得到相同的错误消息。
这些字符串是公司名称,可能带有撇号、连字符、句点等。我假设 fuzzywuzzy 可以处理这些字符,所以没有事先将它们删除。
有人知道我下一步应该检查什么来解决此问题吗?
以下是使用 Pandas 从 Excel 文件导入数据的代码:
import pandas as pd
from fuzzywuzzy import fuzz
import difflib
# Workbook paths for the two datasets that will be matched.
vendor_file = "vendor.xlsx"
spr_file = "spr.xlsx"

# Open each workbook and read its first worksheet into a DataFrame.
xl_vendor = pd.ExcelFile(vendor_file)
xl_spr = pd.ExcelFile(spr_file)
vendor1 = xl_vendor.parse(xl_vendor.sheet_names[0])
spr1 = xl_spr.parse(xl_spr.sheet_names[0])

# Keep only the rows whose name column is populated.
spr = spr1[spr1['Contractor'].notnull()]
vendor = vendor1[vendor1['Vendor Name'].notnull()]
这是其他问题中将匹配项与数据集进行匹配和串联的部分:
def get_spr(row):
    """Look up the best-scoring ``spr`` row for one row of ``vendor``.

    Called once per ``vendor`` row through ``vendor.apply(get_spr, axis=1)``.
    Returns a two-element Series labelled 'Vendor Name' and 'Pass/Fail';
    both values are empty strings when no score reaches 75.

    NOTE(review): this is the version that raises the KeyError shown in the
    traceback. ``x`` is a row of ``spr`` and ``row`` is a row of ``vendor``,
    but according to the column listing in this post ``spr`` has
    'Contractor' / 'Pass/Fail' and ``vendor`` has 'Vendor Name', so both
    ``x['Vendor Name']`` and ``row['Contractor']`` look up missing keys.
    """
    # fuzz.ratio is only evaluated when the two names are already equal;
    # every other pair scores 1, which is below the 75 cut-off applied next.
    d = spr.apply(lambda x: fuzz.ratio(x['Vendor Name'], row['Contractor']) * 2 if row['Contractor'] == x['Vendor Name'] else 1, axis=1)
    # Discard candidates scoring under 75.
    d = d[d >= 75]
    if len(d) == 0:
        # No candidate left: return two blank placeholders.
        v = ['']*2
    else:
        # Take the two columns from the spr row with the highest score.
        # NOTE(review): .ix no longer exists in pandas >= 1.0 - confirm the
        # pandas version in use.
        v = spr.ix[d.idxmax(), ['Vendor Name', 'Pass/Fail']].values
    return pd.Series(v, index=['Vendor Name', 'Pass/Fail'])
# Top-level statement: keep it at column 0, outside the body of get_spr.
# Runs get_spr on every vendor row and places the returned columns next to
# the original vendor columns (axis=1).
pd.concat((vendor, vendor.apply(get_spr, axis=1)), axis=1)
错误是:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-21-41973cb5c3d7> in <module>()
----> 1 pd.concat((vendor, vendor.apply(get_spr, axis=1)), axis=1)
C:\Anaconda\lib\site-packages\pandas\core\frame.pyc in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
3716 if reduce is None:
3717 reduce = True
-> 3718 return self._apply_standard(f, axis, reduce=reduce)
3719 else:
3720 return self._apply_broadcast(f, axis)
C:\Anaconda\lib\site-packages\pandas\core\frame.pyc in _apply_standard(self, func, axis, ignore_failures, reduce)
3806 try:
3807 for i, v in enumerate(series_gen):
-> 3808 results[i] = func(v)
3809 keys.append(v.name)
3810 except Exception as e:
<ipython-input-19-62cc0c6c6daf> in get_spr(row)
1 def get_spr(row):
----> 2 d = spr.apply(lambda x: fuzz.ratio(x['Vendor Name'], row['Contractor']) * 2 if row['Contractor'] == x['Vendor Name'] else 1, axis=1)
3 d = d[d >= 75]
4 if len(d) == 0:
5 v = ['']*2
C:\Anaconda\lib\site-packages\pandas\core\frame.pyc in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
3716 if reduce is None:
3717 reduce = True
-> 3718 return self._apply_standard(f, axis, reduce=reduce)
3719 else:
3720 return self._apply_broadcast(f, axis)
C:\Anaconda\lib\site-packages\pandas\core\frame.pyc in _apply_standard(self, func, axis, ignore_failures, reduce)
3806 try:
3807 for i, v in enumerate(series_gen):
-> 3808 results[i] = func(v)
3809 keys.append(v.name)
3810 except Exception as e:
<ipython-input-19-62cc0c6c6daf> in <lambda>(x)
1 def get_spr(row):
----> 2 d = spr.apply(lambda x: fuzz.ratio(x['Vendor Name'], row['Contractor']) * 2 if row['Contractor'] == x['Vendor Name'] else 1, axis=1)
3 d = d[d >= 75]
4 if len(d) == 0:
5 v = ['']*2
C:\Anaconda\lib\site-packages\pandas\core\series.pyc in __getitem__(self, key)
519 def __getitem__(self, key):
520 try:
--> 521 result = self.index.get_value(self, key)
522
523 if not np.isscalar(result):
C:\Anaconda\lib\site-packages\pandas\core\index.pyc in get_value(self, series, key)
1607 raise InvalidIndexError(key)
1608 else:
-> 1609 raise e1
1610 except Exception: # pragma: no cover
1611 raise e1
KeyError: ('Contractor', u'occurred at index 3', u'occurred at index 0')
编辑以添加数据框列:
spr: 'Contractor', 'Pass/Fail'
vendor: 'Vendor Name'
编辑以根据 davidshinn 答案添加更正的匹配修订:
def get_spr(row, threshold=75):
    """Find the closest-named ``spr`` row for one row of ``vendor``.

    Intended for ``vendor.apply(get_spr, axis=1)``.

    Args:
        row: One ``vendor`` row; its 'Vendor Name' value is compared against
            every 'Contractor' value in the module-level ``spr`` frame.
        threshold: Minimum ``fuzz.ratio`` score (0-100) a contractor needs to
            count as a match. Defaults to 75, the cut-off that used to be
            hard-coded.

    Returns:
        A Series labelled 'Contractor' and 'Pass/Fail' holding the values of
        the highest-scoring ``spr`` row, or two empty strings when no
        contractor reaches ``threshold``.
    """
    vendor_name = row['Vendor Name']
    # Score every contractor. The earlier form only called fuzz.ratio when
    # the two names were already equal and scored every other pair 1, so a
    # near-match could never pass the cut-off.
    scores = spr['Contractor'].apply(
        lambda name: fuzz.ratio(name, vendor_name)
    )
    scores = scores[scores >= threshold]
    if scores.empty:
        v = [''] * 2
    else:
        # idxmax() returns an index label, so use the label-based .loc
        # indexer; the .ix indexer was removed in pandas 1.0.
        v = spr.loc[scores.idxmax(), ['Contractor', 'Pass/Fail']].values
    return pd.Series(v, index=['Contractor', 'Pass/Fail'])
能否提供 vendor 和 spr 数据框的列名?您确定 Contractor 是 vendor 数据框中的有效列吗?因为 row['Contractor'] 访问的正是 vendor 数据框。