MongoDB: 查找具有相似日期的子文档实例

MongoDB: find subdocument instances with similar date

我有一个 mongodb 集合,其结构类似于

[
  {
    name: "name1",
    instances: [{value:1, score:2, date:<ISODate>},
                {value:2, score:5, date:<ISODate>},
                {value:2.5, score:9, date:<ISODate>},
                ...]
  },
  {
    name: "name2",
    instances: [{value:6, score:3, date:<ISODate>},
                {value:1, score:6, date:<ISODate>},
                {value:3.7, score:5.2, date:<ISODate>},
                ...]
  },
  ...
]

我想查找同一个 name 下是否有两个(或更多)实例的日期落在同一天,并返回这些实例。

稍后我想删除其中一个实例以外的所有实例,但作为开始,我希望能够找到它们。

我试过按日期聚合和分组,但无法弄清楚如何只比较一天(而不是整个日期)。

如果我理解正确,您应该先执行 $unwind,然后按日期和实例进行 $group,并过滤掉只包含一个文档的分组。类似这样(我现在无法访问 MongoDB——请注意可能存在打字错误):

// Count instances per (name, year, day-of-year) and keep only the groups
// that contain more than one instance.
db.coll.aggregate([
  // Emit one document per element of the instances array.
  {$unwind: "$instances"},
  // After $unwind the date lives at instances.date, so it has to be
  // addressed by its full path; a bare "$date" refers to a top-level field
  // that these documents do not have.
  {$group: {
    _id: {
      name: "$name",
      day: {$dayOfYear: "$instances.date"},
      year: {$year: "$instances.date"}
    },
    count: {$sum: 1}
  }},
  // A group with a single member has no same-day duplicate.
  {$match: {count: {$gt: 1}}}
])

假设您在测试集合中插入了以下测试文档以用于演示目的:

// Demo fixture for the test collection: three documents, two of which share
// the name "name1", each carrying three dated instances.
db.test.insert([
    {
        name: "name1",
        instances: [
            { value: 1, score: 2, date: ISODate("2015-03-04T00:00:00.000Z") },
            { value: 2, score: 5, date: ISODate("2015-04-01T00:00:00.000Z") },
            { value: 2.5, score: 9, date: ISODate("2015-03-05T00:00:00.000Z") }
        ]
    },
    {
        name: "name2",
        instances: [
            { value: 6, score: 3, date: ISODate("2015-03-05T00:00:00.000Z") },
            { value: 1, score: 6, date: ISODate("2015-03-04T00:00:00.000Z") },
            { value: 3.7, score: 5.2, date: ISODate("2015-02-04T00:00:00.000Z") }
        ]
    },
    {
        name: "name1",
        instances: [
            { value: 6, score: 3, date: ISODate("2015-03-05T00:00:00.000Z") },
            { value: 1, score: 6, date: ISODate("2015-03-04T00:00:00.000Z") },
            { value: 3.7, score: 5.2, date: ISODate("2015-02-04T00:00:00.000Z") }
        ]
    }
])

那么下面的聚合就可以完成这项工作:

// Aggregation stages that report the _id and name of every document holding
// an instance which shares its name and calendar day with another instance.
// The stages are stored as a plain array (not the result of a function call)
// so the same pipeline can be passed to db.test.aggregate() wherever needed.
var pipeline = [
    // Emit one document per element of the instances array.
    {
        "$unwind": "$instances"
    },
    // Group the unwound documents by name and by the date parts of the
    // instance, counting the members and collecting them in "data".
    {
        "$group": {
            "_id": {
                "name": "$name",
                "year": {
                    "$year": "$instances.date"
                },
                "month": {
                    "$month": "$instances.date"
                },
                "day": {
                    "$dayOfYear": "$instances.date"
                }
            },
            "count": {
                "$sum": 1
            },
            // $$ROOT is the (unwound) document currently being processed.
            "data": {
                "$addToSet": "$$ROOT"
            }
        }
    },
    // Only groups with more than one member contain same-day duplicates.
    {
        "$match": {
            "count": {
                "$gt": 1
            }
        }
    },
    // Expand the collected members again, one document per member.
    {
        "$unwind": "$data"
    },
    // Collapse to one entry per distinct (name, _id) pair.
    {
        "$group": {
            "_id": {
                "name": "$data.name",
                "_id": "$data._id"
            }
        }
    },
    // Lift the two key parts into top-level _id and name fields.
    {
        "$project": {
            "_id": "$_id._id",
            "name": "$_id.name"
        }
    }
];
db.test.aggregate(pipeline);

输出:

/* 0 */
{
    "result" : [ 
        {
            "_id" : ObjectId("55506d0a180e849972939056"),
            "name" : "name1"
        }, 
        {
            "_id" : ObjectId("55506d0a180e849972939058"),
            "name" : "name1"
        }
    ],
    "ok" : 1
}

上面的聚合管道有一个 $unwind 操作作为第一步,它从输入文档中解构 instances 数组字段,为每个元素输出一个文档。每个输出文档用一个元素值替换数组。

下一个管道阶段 $group 按 "name" 和 "instances.date" 字段对文档进行分组(日期字段通过日期聚合运算符被拆分为三个字段),计算每个分组的 count 字段,并为每个唯一的 name 和日期(精确到天)输出一个文档。分组数据中还有一个额外的数组字段,它使用系统变量 $$ROOT 来保存原始根文档,即当前聚合管道阶段正在处理的顶层文档。该根文档通过 $addToSet 数组运算符添加到数组中。

在管道的更深处,您将需要使用 $match 管道过滤那些按名称和日期分组时重复的文档,并指定计数应大于 1 的条件。

然后在 data 字段上应用另一个 $unwind 操作,以提取实际重复项的 _id 和 name,这些重复项会被再次分组,从而进一步精简结果文档。

需要一个额外的 $project 管道阶段来通过修改字段来塑造您的最终文档结构。

使用聚合结果游标,然后使用forEach()方法遍历结果并删除其他重复文档:

// Walk the documents reported by the pipeline, keep the first one and remove
// every later one from db.test.
var cur = db.test.aggregate(pipeline);
// The counter must live outside the callback so it survives between
// iterations; declared inside, it is 0 on every call and nothing is removed.
var count = 0;
cur.forEach(function (doc){
    if (count !== 0){
        db.test.remove({"_id": doc._id});
    }
    count++;
});
// NOTE(review): this keeps only the first document of the whole result, not
// one document per name/day group — confirm that this is the intended clean-up.

另一种选择是包含一个 $out 运算符作为最终管道阶段,它将聚合管道返回的文档写入指定的集合,然后您可以查询该集合并进行删除:

// Walk the documents stored in db.outputcollection, keep the first one and
// remove every later one from db.test by its _id.
var cur = db.outputcollection.find();
// The counter must live outside the callback so it survives between
// iterations; declared inside, it is 0 on every call and nothing is removed.
var count = 0;
cur.forEach(function (doc){
    if (count !== 0){
        db.test.remove({"_id": doc._id});
    }
    count++;
});
// NOTE(review): this keeps only the first document of the whole collection,
// not one document per name/day group — confirm that this is intended.