Dedupe Python - "Records do not line up with data model"
I am stuck setting up Python and the dedupe library from dedupe.io to deduplicate a set of entries in a postgres database. The error is "Records do not line up with data model", which should be easy to fix, but I just don't understand why I am getting this message.
What I have right now (the relevant code, with other functions removed):
# ## Setup
settings_file = 'lead_dedupe_settings'
training_file = 'lead_dedupe_training.json'
start_time = time.time()
...

def training():
    # We'll be using variations on the following select statement to pull
    # in campaign donor info.
    #
    # We did a fair amount of preprocessing of the fields.
    """ Define Lead Query """
    sql = "select id, phone, mobilephone, postalcode, email from dev_manuel.somedata"

    # ## Training
    if os.path.exists(settings_file):
        print('reading from ', settings_file)
        with open(settings_file, 'rb') as sf:
            deduper = dedupe.StaticDedupe(sf, num_cores=4)
    else:
        # Define the fields dedupe will pay attention to
        #
        # The address, city, and zip fields are often missing, so we'll
        # tell dedupe that, and we'll learn a model that takes that into
        # account
        fields = [
            {'field': 'id', 'type': 'ShortString'},
            {'field': 'phone', 'type': 'String', 'has missing': True},
            {'field': 'mobilephone', 'type': 'String', 'has missing': True},
            {'field': 'postalcode', 'type': 'ShortString', 'has missing': True},
            {'field': 'email', 'type': 'String', 'has missing': True}
        ]

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields, num_cores=4)

        # connect to db and execute
        conn = None
        try:
            # read the connection parameters
            params = config()
            # connect to the PostgreSQL server
            conn = psycopg2.connect(**params)
            print('Connecting to the PostgreSQL database...')
            cur = conn.cursor()
            # execute sql
            cur.execute(sql)
            temp_d = dict((i, row) for i, row in enumerate(cur))
            print(temp_d)
            deduper.sample(temp_d, 10000)
            print('Done stage 1')
            del temp_d
            # close communication with the PostgreSQL database server
            cur.close()
        except (Exception, psycopg2.DatabaseError) as error:
            print(error)
        finally:
            if conn is not None:
                conn.close()
                print('Closed Connection')

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        #
        # __Note:__ if you want to train from
        # scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                deduper.readTraining(tf)

        # ## Active learning
        print('starting active labeling...')
        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # debug
        print(deduper)
        # vars(deduper)

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        dedupe.convenience.consoleLabel(deduper)

        # When finished, save our labeled training pairs to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Notice our argument here
        #
        # `recall` is the proportion of true duplicate pairs that the learned
        # rules must cover. You may want to reduce this if you are making
        # too many blocks and too many comparisons.
        deduper.train(recall=0.90)

        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

        # We can now remove some of the memory hogging objects we used
        # for training
        deduper.cleanupTraining()
The error message is "Records do not line up with data model. The field 'id' is in data_model but not in a record". As you can see, I define exactly those 5 fields to be learned, and the query I am using returns exactly those 5 columns, with data in them.
The output of print(temp_d) is:
{0: ('00Q1o00000OjmQmEAJ', '+4955555555', None, '01561', None), 1: ('00Q1o00000JhgSUEAZ', None, '+4915555555', '27729', 'email@aemail.de')}
To me, this looks like valid input for the dedupe library.
What I have tried:
- I checked whether a training file from a previous run had already been written and was somehow being read and used; this is not the case (the code would even say so).
- I tried debugging the "deduper" object; the field definitions and so on are in there and I can see them.
- I looked at other examples, such as the csv or mysql ones, and they do pretty much the same as I do.
Please point out what I am doing wrong.
It looks like the problem may be that your temp_d is a dictionary of tuples, whereas the expected input is a dictionary of dictionaries. I just started using this package and found an example here that works for my purposes; it provides this function to set up the dictionary, although it pulls from a csv rather than from your data:
def readData(filename):
    # Build the dict of dicts that dedupe expects: each record is itself a
    # dict keyed by field name; preProcess (from the linked example) cleans
    # the raw string values.
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)
    return data_d
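For the postgres query in the question, here is a minimal sketch of the same idea adapted to psycopg2 (my assumption; it reuses the params, sql and deduper objects already defined in the question's code). A RealDictCursor hands back each row as a dict keyed by column name, so temp_d becomes a dict of dicts rather than a dict of tuples:

import psycopg2
import psycopg2.extras

# params, sql and deduper are the same objects as in the question's code.
conn = psycopg2.connect(**params)
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
cur.execute(sql)

# Each row now looks like
# {'id': '00Q1o00000OjmQmEAJ', 'phone': '+4955555555', 'mobilephone': None,
#  'postalcode': '01561', 'email': None}
# so the keys line up with the field names in the data model.
temp_d = {i: dict(row) for i, row in enumerate(cur)}

cur.close()
conn.close()

deduper.sample(temp_d, 10000)

If you prefer to keep a plain cursor, the equivalent is to zip the column names from cur.description with each row tuple, e.g. dict(zip([desc[0] for desc in cur.description], row)).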