TypeError: ('expected string or bytes-like object', 'occurred at index 0') when calling process.extract
TypeError: ('expected string or bytes-like object', 'occurred at index 0') when calling process.extract
当我尝试对 pandas DataFrame 中的列使用 fuzzywuzzy 库的 process.extract 时,我收到以下错误消息:
TypeError: ('expected string or bytes-like object', 'occurred at index 0')
背景
我有以下示例 df:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import nltk
name_list = ['John D Doe', 'Jane L Doe', 'Jack Doe']
text_list = [' Reason for Visit: John D Doe is a Jon has male pattern baldness',
'Jane is related to John and Jan L Doe is his sister ',
'Jack Doe is thier son and jac is five']
df = pd.DataFrame(
{'Names': name_list,
'Text': text_list,
'P_ID': [1,2,3]
})
#tokenize
df['Token_Names'] = df.apply(lambda row: nltk.word_tokenize(row['Names']), axis=1)
df['Token_Text'] = df.apply(lambda row: nltk.word_tokenize(row['Text']), axis=1)
#df
Names Text P_ID Token_Names Token_Text
0 John D Doe Reason for Visit: John D Doe 1 [John, D, Doe] [Reason, for, Visit, :, John, D, Doe, is, a, J...
1 Jane L Doe Jane is related to John 2 [Jane, L, Doe] [Jane, is, related, to, John, and
2 Jack Doe Jack Doe is thier son 3 [Jack, Doe] [Jack, Doe, is, thier, son, and, jac, is, five]
问题
我创建了以下函数
def get_alt_names(token_name, token_text):
    """Return up to three best-scoring entries of ``token_text`` for ``token_name``.

    Both arguments are forwarded unchanged to ``process.extract`` with
    ``fuzz.ratio`` as the scorer and ``limit = 3``.

    NOTE(review): the ``df.apply`` call that uses this function passes
    ``x.Token_Names`` -- a list of tokens -- as ``token_name``. The traceback
    shows ``process.extract`` running its string processor on the query,
    which raises ``TypeError: expected string or bytes-like object`` when the
    query is a list rather than a string.
    """
    # NOTE(review): when token_name is a list this counts tokens, not characters.
    if len(token_name) > 1:
        extract = process.extract(token_name,token_text, limit = 3, scorer = fuzz.ratio)
        # NOTE(review): indentation was lost in this paste; the return is
        # assumed to belong to the ``if`` branch -- confirm against the original.
        return extract
然后我用 lambda 和 apply 调用它:
#use apply with extract
df['Alt_Names'] = df.apply(lambda x: get_alt_names(x.Token_Names, x.Token_Text) , axis =1)
但是我得到以下错误:
TypeError Traceback (most recent call last)
<ipython-input-12-6dcc99fa91b0> in <module>()
1 #use apply with extract
----> 2 df['Alt_Names'] = df.apply(lambda x: get_alt_names(x.Token_Names, x.Token_Text) , axis =1)
C:\Anaconda\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6002 args=args,
6003 kwds=kwds)
-> 6004 return op.get_result()
6005
6006 def applymap(self, func):
C:\Anaconda\lib\site-packages\pandas\core\apply.py in get_result(self)
140 return self.apply_raw()
141
--> 142 return self.apply_standard()
143
144 def apply_empty_result(self):
C:\Anaconda\lib\site-packages\pandas\core\apply.py in apply_standard(self)
246
247 # compute the result using the series generator
--> 248 self.apply_series_generator()
249
250 # wrap results
C:\Anaconda\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
275 try:
276 for i, v in enumerate(series_gen):
--> 277 results[i] = self.f(v)
278 keys.append(v.name)
279 except Exception as e:
<ipython-input-12-6dcc99fa91b0> in <lambda>(x)
1 #use apply with extract
----> 2 df['Alt_Names'] = df.apply(lambda x: get_alt_names(x.Token_Names, x.Token_Text) , axis =1)
<ipython-input-10-360a3b67e5d2> in get_alt_names(token_name, token_text)
5 #if len(token_name) inside token_names_unlisted > 1:
6 if len(token_name) > 1:
----> 7 extract = process.extract(token_name,token_text, limit = 3, scorer = fuzz.ratio)
8 return extract
C:\Anaconda\lib\site-packages\fuzzywuzzy\process.py in extract(query, choices, processor, scorer, limit)
166 """
167 sl = extractWithoutOrder(query, choices, processor, scorer)
--> 168 return heapq.nlargest(limit, sl, key=lambda i: i[1]) if limit is not None else \
169 sorted(sl, key=lambda i: i[1], reverse=True)
170
C:\Anaconda\lib\heapq.py in nlargest(n, iterable, key)
567 # General case, slowest method
568 it = iter(iterable)
--> 569 result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
570 if not result:
571 return result
C:\Anaconda\lib\heapq.py in <listcomp>(.0)
567 # General case, slowest method
568 it = iter(iterable)
--> 569 result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
570 if not result:
571 return result
C:\Anaconda\lib\site-packages\fuzzywuzzy\process.py in extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
76
77 # Run the processor on the input query.
---> 78 processed_query = processor(query)
79
80 if len(processed_query) == 0:
C:\Anaconda\lib\site-packages\fuzzywuzzy\utils.py in full_process(s, force_ascii)
93 s = asciidammit(s)
94 # Keep only Letters and Numbers (see Unicode docs).
---> 95 string_out = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s)
96 # Force into lowercase.
97 string_out = StringProcessor.to_lower_case(string_out)
C:\Anaconda\lib\site-packages\fuzzywuzzy\string_processing.py in replace_non_letters_non_numbers_with_whitespace(cls, a_string)
24 numbers with a single white space.
25 """
---> 26 return cls.regex.sub(" ", a_string)
27
28 strip = staticmethod(string.strip)
TypeError: ('expected string or bytes-like object', 'occurred at index 0')
我认为这是因为我的输入是一个列表
期望输出
我希望输出看起来像下面这样(可能是列表的列表?)
Other_Columns_Here Alt_Names
0 [('John', 100), ('Jon', 86), ('Reason', 40)][('D', 100), ('Doe', 50), ('baldness', 22)][('Doe', 100), ('D', 50), ('baldness', 36)]
1 [('Jane', 100), ('Jan', 86), ('and', 57)] [('L', 100), ('related', 25), ('Jane', 0)][('Doe', 100), ('to', 40), ('and', 33)]
2 [('Doe', 100), ('to', 40), ('and', 33)] [('Doe', 100), ('son', 33), ('and', 33)]
问题
如何解决我的错误?
我认为您需要修改 get_alt_names,使其更接近以下版本:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import nltk
name_list = ['John D Doe', 'Jane L Doe', 'Jack Doe']
text_list = [
'Reason for Visit: John D Doe is a Jon has male pattern baldness',
'Jane is related to John and Jan L Doe is his sister ',
'Jack Doe is their son and jac is five'
]
df = pd.DataFrame({
'Names': name_list,
'Text': text_list,
'P_ID': [1,2,3]
})
df['Token_Names'] = df.apply(lambda row: nltk.word_tokenize(row['Names']), axis=1)
df['Token_Text'] = df.apply(lambda row: nltk.word_tokenize(row['Text']), axis=1)
def get_alt_names(s):
    """Collect fuzzy-match candidates for each name token of one row.

    Args:
        s: A row-like object indexable by ``'Token_Names'`` and
            ``'Token_Text'``; both entries are iterated as sequences of
            string tokens.

    Returns:
        A list with one ``process.extract`` result per name token that is
        longer than one character, in token order. Each result holds at
        most three matches from ``Token_Text`` scored with ``fuzz.ratio``.
    """
    names = s['Token_Names']
    choices = s['Token_Text']
    # Single-character tokens (e.g. a middle initial) are skipped.
    return [
        process.extract(token, choices, limit=3, scorer=fuzz.ratio)
        for token in names
        if len(token) > 1
    ]
df['Alt_Names'] = df.apply(get_alt_names, axis=1)
print(df)
输出
0 [[(John, 100), (Jon, 86), (Reason, 40)], [(Doe...
1 [[(Jane, 100), (Jan, 86), (and, 57)], [(Doe, 1...
2 [[(Jack, 100), (jac, 86), (and, 29)], [(Doe, 1...
Name: Alt_Names, dtype: object
此代码可以运行,但您可能仍需修改它才能得到您想要的确切结果。具体来说,我不确定您希望 'Alt_Names' 是列表的列表,还是只是一个列表。
当我尝试对 pandas DataFrame 中的列使用 fuzzywuzzy 库的 process.extract 时,我收到以下错误消息:
TypeError: ('expected string or bytes-like object', 'occurred at index 0')
背景
我有以下示例 df:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import nltk
name_list = ['John D Doe', 'Jane L Doe', 'Jack Doe']
text_list = [' Reason for Visit: John D Doe is a Jon has male pattern baldness',
'Jane is related to John and Jan L Doe is his sister ',
'Jack Doe is thier son and jac is five']
df = pd.DataFrame(
{'Names': name_list,
'Text': text_list,
'P_ID': [1,2,3]
})
#tokenize
df['Token_Names'] = df.apply(lambda row: nltk.word_tokenize(row['Names']), axis=1)
df['Token_Text'] = df.apply(lambda row: nltk.word_tokenize(row['Text']), axis=1)
#df
Names Text P_ID Token_Names Token_Text
0 John D Doe Reason for Visit: John D Doe 1 [John, D, Doe] [Reason, for, Visit, :, John, D, Doe, is, a, J...
1 Jane L Doe Jane is related to John 2 [Jane, L, Doe] [Jane, is, related, to, John, and
2 Jack Doe Jack Doe is thier son 3 [Jack, Doe] [Jack, Doe, is, thier, son, and, jac, is, five]
问题
我创建了以下函数
def get_alt_names(token_name, token_text):
    """Return up to three best-scoring entries of ``token_text`` for ``token_name``.

    Both arguments are forwarded unchanged to ``process.extract`` with
    ``fuzz.ratio`` as the scorer and ``limit = 3``.

    NOTE(review): the ``df.apply`` call that uses this function passes
    ``x.Token_Names`` -- a list of tokens -- as ``token_name``. The traceback
    shows ``process.extract`` running its string processor on the query,
    which raises ``TypeError: expected string or bytes-like object`` when the
    query is a list rather than a string.
    """
    # NOTE(review): when token_name is a list this counts tokens, not characters.
    if len(token_name) > 1:
        extract = process.extract(token_name,token_text, limit = 3, scorer = fuzz.ratio)
        # NOTE(review): indentation was lost in this paste; the return is
        # assumed to belong to the ``if`` branch -- confirm against the original.
        return extract
然后我用 lambda 和 apply 调用它:
#use apply with extract
df['Alt_Names'] = df.apply(lambda x: get_alt_names(x.Token_Names, x.Token_Text) , axis =1)
但是我得到以下错误:
TypeError Traceback (most recent call last)
<ipython-input-12-6dcc99fa91b0> in <module>()
1 #use apply with extract
----> 2 df['Alt_Names'] = df.apply(lambda x: get_alt_names(x.Token_Names, x.Token_Text) , axis =1)
C:\Anaconda\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6002 args=args,
6003 kwds=kwds)
-> 6004 return op.get_result()
6005
6006 def applymap(self, func):
C:\Anaconda\lib\site-packages\pandas\core\apply.py in get_result(self)
140 return self.apply_raw()
141
--> 142 return self.apply_standard()
143
144 def apply_empty_result(self):
C:\Anaconda\lib\site-packages\pandas\core\apply.py in apply_standard(self)
246
247 # compute the result using the series generator
--> 248 self.apply_series_generator()
249
250 # wrap results
C:\Anaconda\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
275 try:
276 for i, v in enumerate(series_gen):
--> 277 results[i] = self.f(v)
278 keys.append(v.name)
279 except Exception as e:
<ipython-input-12-6dcc99fa91b0> in <lambda>(x)
1 #use apply with extract
----> 2 df['Alt_Names'] = df.apply(lambda x: get_alt_names(x.Token_Names, x.Token_Text) , axis =1)
<ipython-input-10-360a3b67e5d2> in get_alt_names(token_name, token_text)
5 #if len(token_name) inside token_names_unlisted > 1:
6 if len(token_name) > 1:
----> 7 extract = process.extract(token_name,token_text, limit = 3, scorer = fuzz.ratio)
8 return extract
C:\Anaconda\lib\site-packages\fuzzywuzzy\process.py in extract(query, choices, processor, scorer, limit)
166 """
167 sl = extractWithoutOrder(query, choices, processor, scorer)
--> 168 return heapq.nlargest(limit, sl, key=lambda i: i[1]) if limit is not None else \
169 sorted(sl, key=lambda i: i[1], reverse=True)
170
C:\Anaconda\lib\heapq.py in nlargest(n, iterable, key)
567 # General case, slowest method
568 it = iter(iterable)
--> 569 result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
570 if not result:
571 return result
C:\Anaconda\lib\heapq.py in <listcomp>(.0)
567 # General case, slowest method
568 it = iter(iterable)
--> 569 result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
570 if not result:
571 return result
C:\Anaconda\lib\site-packages\fuzzywuzzy\process.py in extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
76
77 # Run the processor on the input query.
---> 78 processed_query = processor(query)
79
80 if len(processed_query) == 0:
C:\Anaconda\lib\site-packages\fuzzywuzzy\utils.py in full_process(s, force_ascii)
93 s = asciidammit(s)
94 # Keep only Letters and Numbers (see Unicode docs).
---> 95 string_out = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s)
96 # Force into lowercase.
97 string_out = StringProcessor.to_lower_case(string_out)
C:\Anaconda\lib\site-packages\fuzzywuzzy\string_processing.py in replace_non_letters_non_numbers_with_whitespace(cls, a_string)
24 numbers with a single white space.
25 """
---> 26 return cls.regex.sub(" ", a_string)
27
28 strip = staticmethod(string.strip)
TypeError: ('expected string or bytes-like object', 'occurred at index 0')
我认为这是因为我的输入是一个列表
期望输出
我希望输出看起来像下面这样(可能是列表的列表?)
Other_Columns_Here Alt_Names
0 [('John', 100), ('Jon', 86), ('Reason', 40)][('D', 100), ('Doe', 50), ('baldness', 22)][('Doe', 100), ('D', 50), ('baldness', 36)]
1 [('Jane', 100), ('Jan', 86), ('and', 57)] [('L', 100), ('related', 25), ('Jane', 0)][('Doe', 100), ('to', 40), ('and', 33)]
2 [('Doe', 100), ('to', 40), ('and', 33)] [('Doe', 100), ('son', 33), ('and', 33)]
问题
如何解决我的错误?
我认为您需要修改 get_alt_names,使其更接近以下版本:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import nltk
name_list = ['John D Doe', 'Jane L Doe', 'Jack Doe']
text_list = [
'Reason for Visit: John D Doe is a Jon has male pattern baldness',
'Jane is related to John and Jan L Doe is his sister ',
'Jack Doe is their son and jac is five'
]
df = pd.DataFrame({
'Names': name_list,
'Text': text_list,
'P_ID': [1,2,3]
})
df['Token_Names'] = df.apply(lambda row: nltk.word_tokenize(row['Names']), axis=1)
df['Token_Text'] = df.apply(lambda row: nltk.word_tokenize(row['Text']), axis=1)
def get_alt_names(s):
    """Collect fuzzy-match candidates for each name token of one row.

    Args:
        s: A row-like object indexable by ``'Token_Names'`` and
            ``'Token_Text'``; both entries are iterated as sequences of
            string tokens.

    Returns:
        A list with one ``process.extract`` result per name token that is
        longer than one character, in token order. Each result holds at
        most three matches from ``Token_Text`` scored with ``fuzz.ratio``.
    """
    names = s['Token_Names']
    choices = s['Token_Text']
    # Single-character tokens (e.g. a middle initial) are skipped.
    return [
        process.extract(token, choices, limit=3, scorer=fuzz.ratio)
        for token in names
        if len(token) > 1
    ]
df['Alt_Names'] = df.apply(get_alt_names, axis=1)
print(df)
输出
0 [[(John, 100), (Jon, 86), (Reason, 40)], [(Doe...
1 [[(Jane, 100), (Jan, 86), (and, 57)], [(Doe, 1...
2 [[(Jack, 100), (jac, 86), (and, 29)], [(Doe, 1...
Name: Alt_Names, dtype: object
此代码可以运行,但您可能仍需修改它才能得到您想要的确切结果。具体来说,我不确定您希望 'Alt_Names' 是列表的列表,还是只是一个列表。