Python 中的重复数据删除
Dedupe in Python
在查看 Python 中用于记录重复数据删除的 Dedupe 库示例时,我发现它在输出中创建了一个 Cluster Id 列文件,根据文档指示哪些记录相互引用。虽然我无法找出 Cluster Id 之间的任何关系,但这对查找重复记录有何帮助。如果有人对此有所了解,请向我解释一下。这是去重的代码。
# This can run either as a python2 or python3 code
from future.builtins import next
import os
import csv
import re
import logging
import optparse
import dedupe
from unidecode import unidecode
input_file = 'data/csv_example_input_with_true_ids.csv'
output_file = 'data/csv_example_output1.csv'
settings_file = 'data/csv_example_learned_settings'
training_file = 'data/csv_example_training.json'
# Clean or process the data
def preProcess(column):
try:
column = column.decode('utf-8')
except AttributeError:
pass
column = unidecode(column)
column = re.sub(' +', ' ', column)
column = re.sub('\n', ' ', column)
column = column.strip().strip('"').strip("'").lower().strip()
if not column:
column = None
return column
# Read in the data from CSV file:
def readData(filename):
data_d = {}
with open(filename) as f:
reader = csv.DictReader(f)
for row in reader:
clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
row_id = int(row['Id'])
data_d[row_id] = dict(clean_row)
return data_d
print('importing data ...')
data_d = readData(input_file)
if os.path.exists(settings_file):
print('reading from', settings_file)
with open(settings_file, 'rb') as f:
deduper = dedupe.StaticDedupe(f)
else:
fields = [
{'field' : 'Site name', 'type': 'String'},
{'field' : 'Address', 'type': 'String'},
{'field' : 'Zip', 'type': 'Exact', 'has missing' : True},
{'field' : 'Phone', 'type': 'String', 'has missing' : True},
]
deduper = dedupe.Dedupe(fields)
deduper.sample(data_d, 15000)
if os.path.exists(training_file):
print('reading labeled examples from ', training_file)
with open(training_file, 'rb') as f:
deduper.readTraining(f)
print('starting active labeling...')
dedupe.consoleLabel(deduper)
deduper.train()
with open(training_file, 'w') as tf:
deduper.writeTraining(tf)
with open(settings_file, 'wb') as sf:
deduper.writeSettings(sf)
threshold = deduper.threshold(data_d, recall_weight=1)
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)
print('# duplicate sets', len(clustered_dupes))
cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
id_set, scores = cluster
cluster_d = [data_d[c] for c in id_set]
canonical_rep = dedupe.canonicalize(cluster_d)
for record_id, score in zip(id_set, scores):
cluster_membership[record_id] = {
"cluster id" : cluster_id,
"canonical representation" : canonical_rep,
"confidence": score
}
singleton_id = cluster_id + 1
with open(output_file, 'w') as f_output, open(input_file) as f_input:
writer = csv.writer(f_output)
reader = csv.reader(f_input)
heading_row = next(reader)
heading_row.insert(0, 'confidence_score')
heading_row.insert(0, 'Cluster ID')
canonical_keys = canonical_rep.keys()
for key in canonical_keys:
heading_row.append('canonical_' + key)
writer.writerow(heading_row)
for row in reader:
row_id = int(row[0])
if row_id in cluster_membership:
cluster_id = cluster_membership[row_id]["cluster id"]
canonical_rep = cluster_membership[row_id]["canonical representation"]
row.insert(0, cluster_membership[row_id]['confidence'])
row.insert(0, cluster_id)
for key in canonical_keys:
row.append(canonical_rep[key].encode('utf8'))
else:
row.insert(0, None)
row.insert(0, singleton_id)
singleton_id += 1
for key in canonical_keys:
row.append(None)
writer.writerow(row)
提前致谢
你说得对,Cluster ID
没有任何用途。
您应该将 Cluster ID
视为重复数据删除执行的输出。 Dedupe 对合并您的记录不感兴趣。它的核心重点是尝试识别可能相似的记录。
它通过分配它认为相似的行来做到这一点 Cluster ID
。
作为软件工程师,您的工作就是以智能方式使用该数据并决定您希望如何合并该数据(如果有的话)。
如果我的输入如下:
我的输出将类似于以下内容:
因此,请记住,您的输入记录数应始终与重复数据删除输出记录数相匹配。区别仅在于您有一个新列 "Cluster ID",您现在可以使用它来 "group" 您可能的重复项。
在查看 Python 中用于记录重复数据删除的 Dedupe 库示例时,我发现它在输出中创建了一个 Cluster Id 列文件,根据文档指示哪些记录相互引用。虽然我无法找出 Cluster Id 之间的任何关系,但这对查找重复记录有何帮助。如果有人对此有所了解,请向我解释一下。这是去重的代码。
# This can run either as a python2 or python3 code
from future.builtins import next
import os
import csv
import re
import logging
import optparse
import dedupe
from unidecode import unidecode
input_file = 'data/csv_example_input_with_true_ids.csv'
output_file = 'data/csv_example_output1.csv'
settings_file = 'data/csv_example_learned_settings'
training_file = 'data/csv_example_training.json'
# Clean or process the data
def preProcess(column):
try:
column = column.decode('utf-8')
except AttributeError:
pass
column = unidecode(column)
column = re.sub(' +', ' ', column)
column = re.sub('\n', ' ', column)
column = column.strip().strip('"').strip("'").lower().strip()
if not column:
column = None
return column
# Read in the data from CSV file:
def readData(filename):
data_d = {}
with open(filename) as f:
reader = csv.DictReader(f)
for row in reader:
clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
row_id = int(row['Id'])
data_d[row_id] = dict(clean_row)
return data_d
print('importing data ...')
data_d = readData(input_file)
if os.path.exists(settings_file):
print('reading from', settings_file)
with open(settings_file, 'rb') as f:
deduper = dedupe.StaticDedupe(f)
else:
fields = [
{'field' : 'Site name', 'type': 'String'},
{'field' : 'Address', 'type': 'String'},
{'field' : 'Zip', 'type': 'Exact', 'has missing' : True},
{'field' : 'Phone', 'type': 'String', 'has missing' : True},
]
deduper = dedupe.Dedupe(fields)
deduper.sample(data_d, 15000)
if os.path.exists(training_file):
print('reading labeled examples from ', training_file)
with open(training_file, 'rb') as f:
deduper.readTraining(f)
print('starting active labeling...')
dedupe.consoleLabel(deduper)
deduper.train()
with open(training_file, 'w') as tf:
deduper.writeTraining(tf)
with open(settings_file, 'wb') as sf:
deduper.writeSettings(sf)
threshold = deduper.threshold(data_d, recall_weight=1)
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)
print('# duplicate sets', len(clustered_dupes))
cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
id_set, scores = cluster
cluster_d = [data_d[c] for c in id_set]
canonical_rep = dedupe.canonicalize(cluster_d)
for record_id, score in zip(id_set, scores):
cluster_membership[record_id] = {
"cluster id" : cluster_id,
"canonical representation" : canonical_rep,
"confidence": score
}
singleton_id = cluster_id + 1
with open(output_file, 'w') as f_output, open(input_file) as f_input:
writer = csv.writer(f_output)
reader = csv.reader(f_input)
heading_row = next(reader)
heading_row.insert(0, 'confidence_score')
heading_row.insert(0, 'Cluster ID')
canonical_keys = canonical_rep.keys()
for key in canonical_keys:
heading_row.append('canonical_' + key)
writer.writerow(heading_row)
for row in reader:
row_id = int(row[0])
if row_id in cluster_membership:
cluster_id = cluster_membership[row_id]["cluster id"]
canonical_rep = cluster_membership[row_id]["canonical representation"]
row.insert(0, cluster_membership[row_id]['confidence'])
row.insert(0, cluster_id)
for key in canonical_keys:
row.append(canonical_rep[key].encode('utf8'))
else:
row.insert(0, None)
row.insert(0, singleton_id)
singleton_id += 1
for key in canonical_keys:
row.append(None)
writer.writerow(row)
提前致谢
你说得对,Cluster ID
没有任何用途。
您应该将 Cluster ID
视为重复数据删除执行的输出。 Dedupe 对合并您的记录不感兴趣。它的核心重点是尝试识别可能相似的记录。
它通过分配它认为相似的行来做到这一点 Cluster ID
。
作为软件工程师,您的工作就是以智能方式使用该数据并决定您希望如何合并该数据(如果有的话)。
如果我的输入如下:
我的输出将类似于以下内容:
因此,请记住,您的输入记录数应始终与重复数据删除输出记录数相匹配。区别仅在于您有一个新列 "Cluster ID",您现在可以使用它来 "group" 您可能的重复项。