如何在 python 脚本中使用 upsertField 或 upsert 参数将 json 数据导入 mongo 集合
How to use upsertField or upsert parameter in python script to import the json data in mongo collection
mongoimport -u "username" -p "password" --authenticationDatabase "admin" -d my_db -c mycollection --jsonArray --upsert --upsertFields recipe_id C:/Users/mydata.json
我正在使用上面的 mongo 导入查询将 json 文件导入 mongo 集合,它按预期工作。在这里,我使用 recipe_id 作为唯一的过滤器参数,因此在导入时如果文档具有相同的 recipe_id 它可以跳过或替换数据并仅导入新数据。
现在我想在 Python 中实现相同的功能,这样就可以通过 Python 脚本来执行导入,而不必手动运行。我尝试把 upsert 作为参数传给 insert_many 方法,但没有成功。
# Read the whole JSON array from the import file into memory.
with open(import_file) as json_file:
    file_data = json.load(json_file)

# Bulk-insert every loaded document; no filter or upsert option is passed here.
new_collection.insert_many(file_data)
client.close()
因为我每天都在不断获取这个 json 数据,这肯定会包含重复数据,所以我想在将它导入到 mongodb 集合时对其进行过滤,以便只有新数据不断附加到数据库上,不会有数据冗余。
任何建议都会很有帮助
样本json数据:
[
{
"location_id": 11111,
"recipe_id": "AB8974",
"serving_size_number": 1,
"recipe_fraction_description": null,
"description": "1/2 gallon",
"recipe_name": "ALMOND MILK 32 OZ",
"marketing_name": "Almond Milk",
"marketing_description": null,
"ingredient_statement": "Almond Milk (ALMOND MILK (FILTERED WATER, ALMONDS), CANE SUGAR, CONTAINS 2% OR LESS OF: VITAMIN AND MINERAL BLEND (CALCIUM CARBONATE, VITAMIN E ACETATE, VITAMIN A PALMITATE, VITAMIN D2), SEA SALT, SUNFLOWER LECITHIN, LOCUST BEAN GUM, GELLAN GUM.)",
"allergen_attributes": {
"allergen_statement_not_available": null,
"contains_shellfish": "NO",
"contains_peanut": "NO",
"contains_tree_nuts": "YES",
"contains_milk": "NO",
"contains_wheat": "NO",
"contains_soy": "NO",
"contains_eggs": "NO",
"contains_fish": "NO",
"contains_added_msg": "UNKNOWN",
"contains_hfcs": "UNKNOWN",
"contains_mustard": "UNKNOWN",
"contains_celery": "UNKNOWN",
"contains_sesame": "UNKNOWN",
"contains_red_yellow_blue_dye": "UNKNOWN",
"gluten_free_per_fda": "UNKNOWN",
"non_gmo_claim": "UNKNOWN",
"contains_gluten": "NO"
},
"dietary_attributes": {
"vegan": "YES",
"vegetarian": "YES",
"kosher": "YES",
"halal": "UNKNOWN"
},
"primary_attributes": {
"protein": 7.543,
"total_fat": 19.022,
"carbohydrate": 69.196,
"calories": 463.227,
"total_sugars": 61.285,
"fiber": 5.81,
"calcium": 3840.228,
"iron": 3.955,
"potassium": 270.768,
"sodium": 1351.208,
"cholesterol": 0.0,
"trans_fat": 0.0,
"saturated_fat": 1.488,
"monounsaturated_fat": 11.743,
"polyunsaturated_fat": 4.832,
"calories_from_fat": 171.195,
"pct_calories_from_fat": 36.957,
"pct_calories_from_saturated_fat": 2.892,
"added_sugars": null,
"vitamin_d_(mcg)": null
},
"secondary_attributes": {
"ash": null,
"water": null,
"magnesium": 120.654,
"phosphorous": 171.215,
"zinc": 1.019,
"copper": 0.183,
"manganese": null,
"selenium": 1.325,
"vitamin_a_(IU)": 5331.357,
"vitamin_a_(RAE)": null,
"beta_carotene": null,
"alpha_carotene": null,
"vitamin_e_(A-tocopherol)": 49.909,
"vitamin_d_(IU)": null,
"vitamin_c": 0.0,
"thiamin_(B1)": 0.0,
"riboflavin_(B2)": 0.449,
"niacin": 0.979,
"pantothenic_acid": 0.061,
"vitamin_b6": 0.0,
"folacin_(folic_acid)": null,
"vitamin_b12": 0.0,
"vitamin_k": null,
"folic_acid": null,
"folate_food": null,
"folate_DFE": null,
"vitamin_a_(RE)": null,
"pct_calories_from_protein": 6.514,
"pct_calories_from_carbohydrates": 59.751,
"biotin": null,
"niacin_(mg_NE)": null,
"vitamin_e_(IU)": null
}
}
]
这是我每天都会获取的 json 数据样本。其中 recipe_id 是唯一的;每个文件里可能既有之前出现过的食谱,也有一些新食谱。所以我只想在出现新食谱时才把它追加到集合中。
下面是数据库结构示例
> db.my_col1.findOne()
{
"_id" : ObjectId("5f20c8cc1cd23262e7c28e88"),
"location_id" : 11111,
"recipe_id" : "AB8974",
"serving_size_number" : 1,
"recipe_fraction_description" : null,
"description" : "1/2 gallon",
"recipe_name" : "ALMOND MILK 32 OZ",
"marketing_name" : "Almond Milk",
"marketing_description" : null,
"ingredient_statement" : "Almond Milk (ALMOND MILK (FILTERED WATER, ALMONDS), CANE SUGAR, CONTAINS 2% OR LESS OF: VITAMIN AND MINERAL BLEND (CALCIUM CARBONATE, VITAMIN E ACETATE, VITAMIN A PALMITATE, VITAMIN D2), SEA SALT, SUNFLOWER LECITHIN, LOCUST BEAN GUM, GELLAN GUM.)",
"allergen_attributes" : {
"allergen_statement_not_available" : null,
"contains_shellfish" : "NO",
"contains_peanut" : "NO",
"contains_tree_nuts" : "YES",
"contains_milk" : "NO",
"contains_wheat" : "NO",
"contains_soy" : "NO",
"contains_eggs" : "NO",
"contains_fish" : "NO",
"contains_added_msg" : "UNKNOWN",
"contains_hfcs" : "UNKNOWN",
"contains_mustard" : "UNKNOWN",
"contains_celery" : "UNKNOWN",
"contains_sesame" : "UNKNOWN",
"contains_red_yellow_blue_dye" : "UNKNOWN",
"gluten_free_per_fda" : "UNKNOWN",
"non_gmo_claim" : "UNKNOWN",
"contains_gluten" : "NO"
},
"dietary_attributes" : {
"vegan" : "YES",
"vegetarian" : "YES",
"kosher" : "YES",
"halal" : "UNKNOWN"
},
"primary_attributes" : {
"protein" : 7.543,
"total_fat" : 19.022,
"carbohydrate" : 69.196,
"calories" : 463.227,
"total_sugars" : 61.285,
"fiber" : 5.81,
"calcium" : 3840.228,
"iron" : 3.955,
"potassium" : 270.768,
"sodium" : 1351.208,
"cholesterol" : 0,
"trans_fat" : 0,
"saturated_fat" : 1.488,
"monounsaturated_fat" : 11.743,
"polyunsaturated_fat" : 4.832,
"calories_from_fat" : 171.195,
"pct_calories_from_fat" : 36.957,
"pct_calories_from_saturated_fat" : 2.892,
"added_sugars" : null,
"vitamin_d_(mcg)" : null
},
"secondary_attributes" : {
"ash" : null,
"water" : null,
"magnesium" : 120.654,
"phosphorous" : 171.215,
"zinc" : 1.019,
"copper" : 0.183,
"manganese" : null,
"selenium" : 1.325,
"vitamin_a_(IU)" : 5331.357,
"vitamin_a_(RAE)" : null,
"beta_carotene" : null,
"alpha_carotene" : null,
"vitamin_e_(A-tocopherol)" : 49.909,
"vitamin_d_(IU)" : null,
"vitamin_c" : 0,
"thiamin_(B1)" : 0,
"riboflavin_(B2)" : 0.449,
"niacin" : 0.979,
"pantothenic_acid" : 0.061,
"vitamin_b6" : 0,
"folacin_(folic_acid)" : null,
"vitamin_b12" : 0,
"vitamin_k" : null,
"folic_acid" : null,
"folate_food" : null,
"folate_DFE" : null,
"vitamin_a_(RE)" : null,
"pct_calories_from_protein" : 6.514,
"pct_calories_from_carbohydrates" : 59.751,
"biotin" : null,
"niacin_(mg_NE)" : null,
"vitamin_e_(IU)" : null
}
}
可以在 recipe_id 字段上添加唯一索引,或者在代码中先检查是否已存在匹配的文档,只有不存在时才插入:
# Import only the recipes that are not already stored, matching on recipe_id.
with open(import_file) as f:
    file_data = json.load(f)

for data in file_data:
    # Look the key up on the current record (a bare get() is undefined and
    # raises NameError). dict.get returns None when the key is missing.
    recipe_id = data.get('recipe_id')  # Add error checking if any chance these fields don't exist
    # Insert the record only when no stored document has this recipe_id.
    # NOTE(review): this check-then-insert is two separate calls; if several
    # importers can run at once, a unique index on recipe_id is presumably
    # needed to rule out duplicates - confirm.
    if db.mycollection.find_one({'recipe_id': recipe_id}) is None:
        db.mycollection.insert_one(data)
mongoimport -u "username" -p "password" --authenticationDatabase "admin" -d my_db -c mycollection --jsonArray --upsert --upsertFields recipe_id C:/Users/mydata.json
我正在使用上面的 mongo 导入查询将 json 文件导入 mongo 集合,它按预期工作。在这里,我使用 recipe_id 作为唯一的过滤器参数,因此在导入时如果文档具有相同的 recipe_id 它可以跳过或替换数据并仅导入新数据。
现在我想在 Python 中实现相同的功能,这样就可以通过 Python 脚本来执行导入,而不必手动运行。我尝试把 upsert 作为参数传给 insert_many 方法,但没有成功。
# Read the whole JSON array from the import file into memory.
with open(import_file) as json_file:
    file_data = json.load(json_file)

# Bulk-insert every loaded document; no filter or upsert option is passed here.
new_collection.insert_many(file_data)
client.close()
因为我每天都在不断获取这个 json 数据,这肯定会包含重复数据,所以我想在将它导入到 mongodb 集合时对其进行过滤,以便只有新数据不断附加到数据库上,不会有数据冗余。
任何建议都会很有帮助
样本json数据:
[
{
"location_id": 11111,
"recipe_id": "AB8974",
"serving_size_number": 1,
"recipe_fraction_description": null,
"description": "1/2 gallon",
"recipe_name": "ALMOND MILK 32 OZ",
"marketing_name": "Almond Milk",
"marketing_description": null,
"ingredient_statement": "Almond Milk (ALMOND MILK (FILTERED WATER, ALMONDS), CANE SUGAR, CONTAINS 2% OR LESS OF: VITAMIN AND MINERAL BLEND (CALCIUM CARBONATE, VITAMIN E ACETATE, VITAMIN A PALMITATE, VITAMIN D2), SEA SALT, SUNFLOWER LECITHIN, LOCUST BEAN GUM, GELLAN GUM.)",
"allergen_attributes": {
"allergen_statement_not_available": null,
"contains_shellfish": "NO",
"contains_peanut": "NO",
"contains_tree_nuts": "YES",
"contains_milk": "NO",
"contains_wheat": "NO",
"contains_soy": "NO",
"contains_eggs": "NO",
"contains_fish": "NO",
"contains_added_msg": "UNKNOWN",
"contains_hfcs": "UNKNOWN",
"contains_mustard": "UNKNOWN",
"contains_celery": "UNKNOWN",
"contains_sesame": "UNKNOWN",
"contains_red_yellow_blue_dye": "UNKNOWN",
"gluten_free_per_fda": "UNKNOWN",
"non_gmo_claim": "UNKNOWN",
"contains_gluten": "NO"
},
"dietary_attributes": {
"vegan": "YES",
"vegetarian": "YES",
"kosher": "YES",
"halal": "UNKNOWN"
},
"primary_attributes": {
"protein": 7.543,
"total_fat": 19.022,
"carbohydrate": 69.196,
"calories": 463.227,
"total_sugars": 61.285,
"fiber": 5.81,
"calcium": 3840.228,
"iron": 3.955,
"potassium": 270.768,
"sodium": 1351.208,
"cholesterol": 0.0,
"trans_fat": 0.0,
"saturated_fat": 1.488,
"monounsaturated_fat": 11.743,
"polyunsaturated_fat": 4.832,
"calories_from_fat": 171.195,
"pct_calories_from_fat": 36.957,
"pct_calories_from_saturated_fat": 2.892,
"added_sugars": null,
"vitamin_d_(mcg)": null
},
"secondary_attributes": {
"ash": null,
"water": null,
"magnesium": 120.654,
"phosphorous": 171.215,
"zinc": 1.019,
"copper": 0.183,
"manganese": null,
"selenium": 1.325,
"vitamin_a_(IU)": 5331.357,
"vitamin_a_(RAE)": null,
"beta_carotene": null,
"alpha_carotene": null,
"vitamin_e_(A-tocopherol)": 49.909,
"vitamin_d_(IU)": null,
"vitamin_c": 0.0,
"thiamin_(B1)": 0.0,
"riboflavin_(B2)": 0.449,
"niacin": 0.979,
"pantothenic_acid": 0.061,
"vitamin_b6": 0.0,
"folacin_(folic_acid)": null,
"vitamin_b12": 0.0,
"vitamin_k": null,
"folic_acid": null,
"folate_food": null,
"folate_DFE": null,
"vitamin_a_(RE)": null,
"pct_calories_from_protein": 6.514,
"pct_calories_from_carbohydrates": 59.751,
"biotin": null,
"niacin_(mg_NE)": null,
"vitamin_e_(IU)": null
}
}
]
这是我每天都会获取的 json 数据样本。其中 recipe_id 是唯一的;每个文件里可能既有之前出现过的食谱,也有一些新食谱。所以我只想在出现新食谱时才把它追加到集合中。
下面是数据库结构示例
> db.my_col1.findOne()
{
"_id" : ObjectId("5f20c8cc1cd23262e7c28e88"),
"location_id" : 11111,
"recipe_id" : "AB8974",
"serving_size_number" : 1,
"recipe_fraction_description" : null,
"description" : "1/2 gallon",
"recipe_name" : "ALMOND MILK 32 OZ",
"marketing_name" : "Almond Milk",
"marketing_description" : null,
"ingredient_statement" : "Almond Milk (ALMOND MILK (FILTERED WATER, ALMONDS), CANE SUGAR, CONTAINS 2% OR LESS OF: VITAMIN AND MINERAL BLEND (CALCIUM CARBONATE, VITAMIN E ACETATE, VITAMIN A PALMITATE, VITAMIN D2), SEA SALT, SUNFLOWER LECITHIN, LOCUST BEAN GUM, GELLAN GUM.)",
"allergen_attributes" : {
"allergen_statement_not_available" : null,
"contains_shellfish" : "NO",
"contains_peanut" : "NO",
"contains_tree_nuts" : "YES",
"contains_milk" : "NO",
"contains_wheat" : "NO",
"contains_soy" : "NO",
"contains_eggs" : "NO",
"contains_fish" : "NO",
"contains_added_msg" : "UNKNOWN",
"contains_hfcs" : "UNKNOWN",
"contains_mustard" : "UNKNOWN",
"contains_celery" : "UNKNOWN",
"contains_sesame" : "UNKNOWN",
"contains_red_yellow_blue_dye" : "UNKNOWN",
"gluten_free_per_fda" : "UNKNOWN",
"non_gmo_claim" : "UNKNOWN",
"contains_gluten" : "NO"
},
"dietary_attributes" : {
"vegan" : "YES",
"vegetarian" : "YES",
"kosher" : "YES",
"halal" : "UNKNOWN"
},
"primary_attributes" : {
"protein" : 7.543,
"total_fat" : 19.022,
"carbohydrate" : 69.196,
"calories" : 463.227,
"total_sugars" : 61.285,
"fiber" : 5.81,
"calcium" : 3840.228,
"iron" : 3.955,
"potassium" : 270.768,
"sodium" : 1351.208,
"cholesterol" : 0,
"trans_fat" : 0,
"saturated_fat" : 1.488,
"monounsaturated_fat" : 11.743,
"polyunsaturated_fat" : 4.832,
"calories_from_fat" : 171.195,
"pct_calories_from_fat" : 36.957,
"pct_calories_from_saturated_fat" : 2.892,
"added_sugars" : null,
"vitamin_d_(mcg)" : null
},
"secondary_attributes" : {
"ash" : null,
"water" : null,
"magnesium" : 120.654,
"phosphorous" : 171.215,
"zinc" : 1.019,
"copper" : 0.183,
"manganese" : null,
"selenium" : 1.325,
"vitamin_a_(IU)" : 5331.357,
"vitamin_a_(RAE)" : null,
"beta_carotene" : null,
"alpha_carotene" : null,
"vitamin_e_(A-tocopherol)" : 49.909,
"vitamin_d_(IU)" : null,
"vitamin_c" : 0,
"thiamin_(B1)" : 0,
"riboflavin_(B2)" : 0.449,
"niacin" : 0.979,
"pantothenic_acid" : 0.061,
"vitamin_b6" : 0,
"folacin_(folic_acid)" : null,
"vitamin_b12" : 0,
"vitamin_k" : null,
"folic_acid" : null,
"folate_food" : null,
"folate_DFE" : null,
"vitamin_a_(RE)" : null,
"pct_calories_from_protein" : 6.514,
"pct_calories_from_carbohydrates" : 59.751,
"biotin" : null,
"niacin_(mg_NE)" : null,
"vitamin_e_(IU)" : null
}
}
可以在 recipe_id 字段上添加唯一索引,或者在代码中先检查是否已存在匹配的文档,只有不存在时才插入:
# Import only the recipes that are not already stored, matching on recipe_id.
with open(import_file) as f:
    file_data = json.load(f)

for data in file_data:
    # Look the key up on the current record (a bare get() is undefined and
    # raises NameError). dict.get returns None when the key is missing.
    recipe_id = data.get('recipe_id')  # Add error checking if any chance these fields don't exist
    # Insert the record only when no stored document has this recipe_id.
    # NOTE(review): this check-then-insert is two separate calls; if several
    # importers can run at once, a unique index on recipe_id is presumably
    # needed to rule out duplicates - confirm.
    if db.mycollection.find_one({'recipe_id': recipe_id}) is None:
        db.mycollection.insert_one(data)