近似重复检测 python sql
near duplicate detection python sql
我有一个庞大的数据集,其中包含来自不同来源的 shipper/supplier 个名称,并且其中有近乎重复的值。
我在互联网上尝试了很多不同的技术,但是 none 对这些庞大的数据来说,它们都令人不满意或者太慢了。
我发现这个 openrefine GitHub repo 用于指纹识别算法,我添加了更多代码,它解决了我的目的。
看看。
我的数据集看起来像这样...
import re, string
import pandas as pd
from unidecode import unidecode
from collections import defaultdict
# clean the text before processing
def cleansing_special_characters(txt):
seps = [' ',';',':','.','`','~',',','*','#','@','|','\','-','_','?','%','!','^','(',')','[',']','{','}','$','=','+','"','<','>',"'",' AND ', ' and ']
default_sep = seps[0]
txt = str(txt)
for sep in seps[1:]:
if sep == " AND " or sep == " and ":
txt = txt.upper()
txt = txt.replace(sep, ' & ')
else:
txt = txt.upper()
txt = txt.replace(sep, default_sep)
try :
list(map(int,txt.split()))
txt = 'NUMBERS'
except:
pass
txt = re.sub(' +', ' ', txt)
temp_list = [i.strip() for i in txt.split(default_sep)]
temp_list = [i for i in temp_list if i]
return " ".join(temp_list)
punctuation = re.compile('[%s]' % re.escape(string.punctuation))
class fingerprinter(object):
# __init__function
def __init__(self, string):
self.string = self._preprocess(string)
# strip leading, trailing spaces and to lower case
def _preprocess(self, string):
return punctuation.sub('',string.strip().lower())
def _latinize(self, string):
return unidecode(string)
# return unidecode(string.decode('utf-8'))
def _unique_preserve_order(self,seq):
seen = set()
seen_add = seen.add
return [x for x in seq if not (x in seen or seen_add(x))]
#-####################################################
def get_fingerprint(self):
return self._latinize(' '.join(self._unique_preserve_order(sorted(self.string.split()))))
def get_ngram_fingerprint(self, n=1):
return self._latinize(''.join(self._unique_preserve_order(sorted([self.string[i:i + n] for i in range(len(self.string) - n +1)]))))
# read excel file
df = pd.read_excel('Input_File.xlsx')
#preprocess the column
df['Clean'] = df['SUPPLIER_NAME'].apply(cleansing_special_characters)
# step 1 cleanining
# ##for n_gram fingerprint algorithm
###########################################################################################
df['n_gram_fingerprint_n2'] = df['Clean'].apply(lambda x : fingerprinter(x.replace(" ","")).get_ngram_fingerprint(n=2))
## generate tag_id for every unique generated n_gram_fingerprint
d = defaultdict(lambda: len(d))
df['tag_idn']=[d[x] for x in df['n_gram_fingerprint_n2']]
###########################################################################################
#drop n_gram column
df.drop(columns=['n_gram_fingerprint_n2'], inplace=True)
# make copy to create group of tag_id
df1 = df[['SUPPLIER_NAME','tag_idn']]
# drop SUPPLIER_NAME column , we have tag_id's now
df.drop(columns=['SUPPLIER_NAME'], inplace=True)
# group df with tag_id with selecting minimum
#group = df.groupby('tag_id').min().reset_index()
group = df.loc[df["Clean"].str.len().groupby(df["tag_idn"]).idxmax()]
# join both the data frames group(unique) and main data
df_merge = pd.merge(df1,group, on=['tag_idn'])
# # output excel file
df_merge.to_excel('Output_File.xlsx', index = False)
这是 excel 文件中的输出数据的样子
我有一个庞大的数据集,其中包含来自不同来源的 shipper/supplier 个名称,并且其中有近乎重复的值。 我在互联网上尝试了很多不同的技术,但是 none 对这些庞大的数据来说,它们都令人不满意或者太慢了。
我发现这个 openrefine GitHub repo 用于指纹识别算法,我添加了更多代码,它解决了我的目的。 看看。
我的数据集看起来像这样...
import re, string
import pandas as pd
from unidecode import unidecode
from collections import defaultdict
# clean the text before processing
def cleansing_special_characters(txt):
seps = [' ',';',':','.','`','~',',','*','#','@','|','\','-','_','?','%','!','^','(',')','[',']','{','}','$','=','+','"','<','>',"'",' AND ', ' and ']
default_sep = seps[0]
txt = str(txt)
for sep in seps[1:]:
if sep == " AND " or sep == " and ":
txt = txt.upper()
txt = txt.replace(sep, ' & ')
else:
txt = txt.upper()
txt = txt.replace(sep, default_sep)
try :
list(map(int,txt.split()))
txt = 'NUMBERS'
except:
pass
txt = re.sub(' +', ' ', txt)
temp_list = [i.strip() for i in txt.split(default_sep)]
temp_list = [i for i in temp_list if i]
return " ".join(temp_list)
punctuation = re.compile('[%s]' % re.escape(string.punctuation))
class fingerprinter(object):
# __init__function
def __init__(self, string):
self.string = self._preprocess(string)
# strip leading, trailing spaces and to lower case
def _preprocess(self, string):
return punctuation.sub('',string.strip().lower())
def _latinize(self, string):
return unidecode(string)
# return unidecode(string.decode('utf-8'))
def _unique_preserve_order(self,seq):
seen = set()
seen_add = seen.add
return [x for x in seq if not (x in seen or seen_add(x))]
#-####################################################
def get_fingerprint(self):
return self._latinize(' '.join(self._unique_preserve_order(sorted(self.string.split()))))
def get_ngram_fingerprint(self, n=1):
return self._latinize(''.join(self._unique_preserve_order(sorted([self.string[i:i + n] for i in range(len(self.string) - n +1)]))))
# read excel file
df = pd.read_excel('Input_File.xlsx')
#preprocess the column
df['Clean'] = df['SUPPLIER_NAME'].apply(cleansing_special_characters)
# step 1 cleanining
# ##for n_gram fingerprint algorithm
###########################################################################################
df['n_gram_fingerprint_n2'] = df['Clean'].apply(lambda x : fingerprinter(x.replace(" ","")).get_ngram_fingerprint(n=2))
## generate tag_id for every unique generated n_gram_fingerprint
d = defaultdict(lambda: len(d))
df['tag_idn']=[d[x] for x in df['n_gram_fingerprint_n2']]
###########################################################################################
#drop n_gram column
df.drop(columns=['n_gram_fingerprint_n2'], inplace=True)
# make copy to create group of tag_id
df1 = df[['SUPPLIER_NAME','tag_idn']]
# drop SUPPLIER_NAME column , we have tag_id's now
df.drop(columns=['SUPPLIER_NAME'], inplace=True)
# group df with tag_id with selecting minimum
#group = df.groupby('tag_id').min().reset_index()
group = df.loc[df["Clean"].str.len().groupby(df["tag_idn"]).idxmax()]
# join both the data frames group(unique) and main data
df_merge = pd.merge(df1,group, on=['tag_idn'])
# # output excel file
df_merge.to_excel('Output_File.xlsx', index = False)
这是 excel 文件中的输出数据的样子