在 Python 中删除一列的相似文本行
Drop similar text rows of one column in Python
import pandas as pd
from difflib import SequenceMatcher
df = pd.DataFrame({"id":[9,12,13,14],
"text":["Error number 609 at line 10", "Error number 609 at line 22", "Error string 'foo' at line 11", "Error string 'bar' at line 14"]})
输出:
id text
0 9 Error number 609 at line 10
1 12 Error number 609 at line 22
2 13 Error string 'foo' at line 11
3 14 Error string 'bar' at line 14
我想使用 difflib.SequenceMatcher
删除低于 80
行的相似度分数,只保留一个。
a = "Error number 609 at line 10"
b = "Error number 609 at line 22"
c = "Error string 'foo' at line 11"
d = "Error string 'bar' at line 14"
print(SequenceMatcher(None, a, b).ratio()*100) #92.5925925925926
print(SequenceMatcher(None, b, c).ratio()*100) #60.71428571428571
print(SequenceMatcher(None, c, d).ratio()*100) #86.20689655172413
print(SequenceMatcher(None, a, c).ratio()*100) #64.28571428571429
如何在Python中获得如下预期结果?您可以使用 difflib
或其他 python 包。谢谢。
id text
0 9 Error number 609 at line 10
2 13 Error string 'foo' at line 11
您可以使用:
#cross join with filter onl text column
df = df.assign(a=1).merge(df[['text']].assign(a=1), on='a')
#filter out same columns per rows
df = df[df['text_x'] != df['text_y']]
#sort columns per rows
df[['text_x','text_y']] = pd.DataFrame(np.sort(df[['text_x','text_y']],axis=1), index=df.index)
#remove duplicates
df = df.drop_duplicates(subset=['text_x','text_y'])
#get similarity
df['r'] = df.apply(lambda x: SequenceMatcher(None, x.text_x, x.text_y).ratio(), axis=1)
#filtering
df = df[df['r'] > 0.8].drop(['a','r'], axis=1)
print (df)
id text_x text_y
1 9 Error number 609 at line 10 Error number 609 at line 22
11 13 Error string 'bar' at line 14 Error string 'foo' at line 11
import pandas as pd
from difflib import SequenceMatcher
df = pd.DataFrame({"id":[9,12,13,14],
"text":["Error number 609 at line 10", "Error number 609 at line 22", "Error string 'foo' at line 11", "Error string 'bar' at line 14"]})
输出:
id text
0 9 Error number 609 at line 10
1 12 Error number 609 at line 22
2 13 Error string 'foo' at line 11
3 14 Error string 'bar' at line 14
我想使用 difflib.SequenceMatcher
删除低于 80
行的相似度分数,只保留一个。
a = "Error number 609 at line 10"
b = "Error number 609 at line 22"
c = "Error string 'foo' at line 11"
d = "Error string 'bar' at line 14"
print(SequenceMatcher(None, a, b).ratio()*100) #92.5925925925926
print(SequenceMatcher(None, b, c).ratio()*100) #60.71428571428571
print(SequenceMatcher(None, c, d).ratio()*100) #86.20689655172413
print(SequenceMatcher(None, a, c).ratio()*100) #64.28571428571429
如何在Python中获得如下预期结果?您可以使用 difflib
或其他 python 包。谢谢。
id text
0 9 Error number 609 at line 10
2 13 Error string 'foo' at line 11
您可以使用:
#cross join with filter onl text column
df = df.assign(a=1).merge(df[['text']].assign(a=1), on='a')
#filter out same columns per rows
df = df[df['text_x'] != df['text_y']]
#sort columns per rows
df[['text_x','text_y']] = pd.DataFrame(np.sort(df[['text_x','text_y']],axis=1), index=df.index)
#remove duplicates
df = df.drop_duplicates(subset=['text_x','text_y'])
#get similarity
df['r'] = df.apply(lambda x: SequenceMatcher(None, x.text_x, x.text_y).ratio(), axis=1)
#filtering
df = df[df['r'] > 0.8].drop(['a','r'], axis=1)
print (df)
id text_x text_y
1 9 Error number 609 at line 10 Error number 609 at line 22
11 13 Error string 'bar' at line 14 Error string 'foo' at line 11