如何在 python 脚本中使用 upsertField 或 upsert 参数将 json 数据导入 mongo 集合

Question

mongoimport -u "username" -p "password" --authenticationDatabase "admin" -d my_db -c mycollection --jsonArray --upsert --upsertFields recipe_id C:/Users/mydata.json

我正在使用上面的 mongoimport 命令将 json 文件导入 mongo 集合，它按预期工作。在这里，我使用 recipe_id 作为唯一的匹配字段（upsertFields），因此在导入时，如果已存在具有相同 recipe_id 的文档，它会跳过或替换该数据，只导入新数据。

现在我想在 python 中实现相同的功能，这样我就可以通过 python 脚本来执行，而不是手动运行。我尝试在调用 insert_many 方法时传入 upsert 参数，但没有起作用。

# Load the whole JSON file into memory; the sample data shown in this post is
# a JSON array, so file_data is expected to be a list of documents.
with open(import_file) as f:
  file_data = json.load(f)
# Every loaded document is passed straight to insert_many; nothing in this
# snippet filters on recipe_id, so documents already stored are not skipped here.
new_collection.insert_many(file_data)
client.close()

因为我每天都在不断获取这个 json 数据，这肯定会包含重复数据，所以我想在将它导入到 mongodb 集合时对其进行过滤，以便只有新数据不断附加到数据库上，不会有数据冗余。

任何建议都会很有帮助

样本json数据：

[
    {
        "location_id": 11111,
        "recipe_id": "AB8974",
        "serving_size_number": 1,
        "recipe_fraction_description": null,
        "description": "1/2 gallon",
        "recipe_name": "ALMOND MILK 32 OZ",
        "marketing_name": "Almond Milk",
        "marketing_description": null,
        "ingredient_statement": "Almond Milk (ALMOND MILK (FILTERED WATER, ALMONDS), CANE SUGAR, CONTAINS 2% OR LESS OF: VITAMIN AND MINERAL BLEND (CALCIUM CARBONATE, VITAMIN E ACETATE, VITAMIN A PALMITATE, VITAMIN D2), SEA SALT, SUNFLOWER LECITHIN, LOCUST BEAN GUM, GELLAN GUM.)",
        "allergen_attributes": {
            "allergen_statement_not_available": null,
            "contains_shellfish": "NO",
            "contains_peanut": "NO",
            "contains_tree_nuts": "YES",
            "contains_milk": "NO",
            "contains_wheat": "NO",
            "contains_soy": "NO",
            "contains_eggs": "NO",
            "contains_fish": "NO",
            "contains_added_msg": "UNKNOWN",
            "contains_hfcs": "UNKNOWN",
            "contains_mustard": "UNKNOWN",
            "contains_celery": "UNKNOWN",
            "contains_sesame": "UNKNOWN",
            "contains_red_yellow_blue_dye": "UNKNOWN",
            "gluten_free_per_fda": "UNKNOWN",
            "non_gmo_claim": "UNKNOWN",
            "contains_gluten": "NO"
        },
        "dietary_attributes": {
            "vegan": "YES",
            "vegetarian": "YES",
            "kosher": "YES",
            "halal": "UNKNOWN"
        },
        "primary_attributes": {
            "protein": 7.543,
            "total_fat": 19.022,
            "carbohydrate": 69.196,
            "calories": 463.227,
            "total_sugars": 61.285,
            "fiber": 5.81,
            "calcium": 3840.228,
            "iron": 3.955,
            "potassium": 270.768,
            "sodium": 1351.208,
            "cholesterol": 0.0,
            "trans_fat": 0.0,
            "saturated_fat": 1.488,
            "monounsaturated_fat": 11.743,
            "polyunsaturated_fat": 4.832,
            "calories_from_fat": 171.195,
            "pct_calories_from_fat": 36.957,
            "pct_calories_from_saturated_fat": 2.892,
            "added_sugars": null,
            "vitamin_d_(mcg)": null
        },
        "secondary_attributes": {
            "ash": null,
            "water": null,
            "magnesium": 120.654,
            "phosphorous": 171.215,
            "zinc": 1.019,
            "copper": 0.183,
            "manganese": null,
            "selenium": 1.325,
            "vitamin_a_(IU)": 5331.357,
            "vitamin_a_(RAE)": null,
            "beta_carotene": null,
            "alpha_carotene": null,
            "vitamin_e_(A-tocopherol)": 49.909,
            "vitamin_d_(IU)": null,
            "vitamin_c": 0.0,
            "thiamin_(B1)": 0.0,
            "riboflavin_(B2)": 0.449,
            "niacin": 0.979,
            "pantothenic_acid": 0.061,
            "vitamin_b6": 0.0,
            "folacin_(folic_acid)": null,
            "vitamin_b12": 0.0,
            "vitamin_k": null,
            "folic_acid": null,
            "folate_food": null,
            "folate_DFE": null,
            "vitamin_a_(RE)": null,
            "pct_calories_from_protein": 6.514,
            "pct_calories_from_carbohydrates": 59.751,
            "biotin": null,
            "niacin_(mg_NE)": null,
            "vitamin_e_(IU)": null
        }
    }
]

这是我每天都会获取的样本 json 数据。其中 recipe_id 是唯一的；每个文件中既可能包含之前已经出现过的食谱，也可能包含一些新食谱。所以我只想在出现新食谱时才把它追加到集合中。

下面是数据库结构示例

> db.my_col1.findOne()
{
        "_id" : ObjectId("5f20c8cc1cd23262e7c28e88"),
        "location_id" : 11111,
        "recipe_id" : "AB8974",
        "serving_size_number" : 1,
        "recipe_fraction_description" : null,
        "description" : "1/2 gallon",
        "recipe_name" : "ALMOND MILK 32 OZ",
        "marketing_name" : "Almond Milk",
        "marketing_description" : null,
        "ingredient_statement" : "Almond Milk (ALMOND MILK (FILTERED WATER, ALMONDS), CANE SUGAR, CONTAINS 2% OR LESS OF: VITAMIN AND MINERAL BLEND (CALCIUM CARBONATE, VITAMIN E ACETATE, VITAMIN A PALMITATE, VITAMIN D2), SEA SALT, SUNFLOWER LECITHIN, LOCUST BEAN GUM, GELLAN GUM.)",
        "allergen_attributes" : {
                "allergen_statement_not_available" : null,
                "contains_shellfish" : "NO",
                "contains_peanut" : "NO",
                "contains_tree_nuts" : "YES",
                "contains_milk" : "NO",
                "contains_wheat" : "NO",
                "contains_soy" : "NO",
                "contains_eggs" : "NO",
                "contains_fish" : "NO",
                "contains_added_msg" : "UNKNOWN",
                "contains_hfcs" : "UNKNOWN",
                "contains_mustard" : "UNKNOWN",
                "contains_celery" : "UNKNOWN",
                "contains_sesame" : "UNKNOWN",
                "contains_red_yellow_blue_dye" : "UNKNOWN",
                "gluten_free_per_fda" : "UNKNOWN",
                "non_gmo_claim" : "UNKNOWN",
                "contains_gluten" : "NO"
        },
        "dietary_attributes" : {
                "vegan" : "YES",
                "vegetarian" : "YES",
                "kosher" : "YES",
                "halal" : "UNKNOWN"
        },
        "primary_attributes" : {
                "protein" : 7.543,
                "total_fat" : 19.022,
                "carbohydrate" : 69.196,
                "calories" : 463.227,
                "total_sugars" : 61.285,
                "fiber" : 5.81,
                "calcium" : 3840.228,
                "iron" : 3.955,
                "potassium" : 270.768,
                "sodium" : 1351.208,
                "cholesterol" : 0,
                "trans_fat" : 0,
                "saturated_fat" : 1.488,
                "monounsaturated_fat" : 11.743,
                "polyunsaturated_fat" : 4.832,
                "calories_from_fat" : 171.195,
                "pct_calories_from_fat" : 36.957,
                "pct_calories_from_saturated_fat" : 2.892,
                "added_sugars" : null,
                "vitamin_d_(mcg)" : null
        },
        "secondary_attributes" : {
                "ash" : null,
                "water" : null,
                "magnesium" : 120.654,
                "phosphorous" : 171.215,
                "zinc" : 1.019,
                "copper" : 0.183,
                "manganese" : null,
                "selenium" : 1.325,
                "vitamin_a_(IU)" : 5331.357,
                "vitamin_a_(RAE)" : null,
                "beta_carotene" : null,
                "alpha_carotene" : null,
                "vitamin_e_(A-tocopherol)" : 49.909,
                "vitamin_d_(IU)" : null,
                "vitamin_c" : 0,
                "thiamin_(B1)" : 0,
                "riboflavin_(B2)" : 0.449,
                "niacin" : 0.979,
                "pantothenic_acid" : 0.061,
                "vitamin_b6" : 0,
                "folacin_(folic_acid)" : null,
                "vitamin_b12" : 0,
                "vitamin_k" : null,
                "folic_acid" : null,
                "folate_food" : null,
                "folate_DFE" : null,
                "vitamin_a_(RE)" : null,
                "pct_calories_from_protein" : 6.514,
                "pct_calories_from_carbohydrates" : 59.751,
                "biotin" : null,
                "niacin_(mg_NE)" : null,
                "vitamin_e_(IU)" : null
        }
}

Answer 1

在 recipe_id 字段上添加唯一索引，或者在代码中先检查是否已存在匹配的文档，只有不存在时才插入：

# Import the JSON array and insert only the recipes that are not stored yet,
# using recipe_id as the de-duplication key.
with open(import_file) as f:
    file_data = json.load(f)

for data in file_data:
    # Read the key from the current document; a bare get(...) is undefined
    # and would raise NameError.
    recipe_id = data.get('recipe_id')
    if recipe_id is None:
        # A document without recipe_id cannot be de-duplicated; skip it
        # instead of querying for (and inserting) a null key.
        continue
    # Insert only when no stored document has this recipe_id yet.
    if db.mycollection.find_one({'recipe_id': recipe_id}) is None:
        db.mycollection.insert_one(data)

如何在 python 脚本中使用 upsertField 或 upsert 参数将 json 数据导入 mongo 集合

How to use upsertField or upsert parameter in python script to import the json data in mongo collection

python

json

mongodb

pymongo

mongoimport