MongoDB: 查找具有相似日期的子文档实例
MongoDB: find subdocument instances with similar date
我有一个 mongodb 集合,其结构类似于
[
{
name: "name1",
instances: [{value:1, score:2, date:<ISODate>},
{value:2, score:5, date:<ISODate>},
{value:2.5, score:9, date:<ISODate>},
...]
},
{
name: "name2",
instances: [{value:6, score:3, date:<ISODate>},
{value:1, score:6, date:<ISODate>},
{value:3.7, score:5.2, date:<ISODate>},
...]
},
...
]
我想查找同一个 name 下是否有两个(或更多)实例的日期落在同一天,并返回这些实例。
稍后我想删除其中一个实例以外的所有实例,但作为开始,我希望能够找到它们。
我试过按日期聚合和分组,但无法弄清楚如何只比较一天(而不是整个日期)。
如果我理解正确,您应该先 $unwind,然后按日期和实例进行 $group,并过滤掉只包含一个文档的分组。类似这样(我现在无法访问 MongoDB——请留意拼写错误):
// Count instances per (name, calendar day) and keep only the days that occur
// more than once for the same name.
db.coll.aggregate([
    // Emit one document per element of the "instances" array.
    {$unwind: "$instances"},
    // After $unwind the date lives at "instances.date"; there is no top-level
    // "date" field, so the date operators must read the nested path.
    {$group: { _id: { name:"$name", day:{$dayOfYear:"$instances.date"}, year:{$year:"$instances.date"}}, count: {$sum: 1} }},
    // Drop groups that contain a single instance.
    {$match: {count: {$gt: 1}}}
])
假设您在测试集合中插入了以下测试文档以用于演示目的:
// Demo fixture: three documents, each with three dated instances. The two
// documents named "name1" both contain instances dated 2015-03-04 and 2015-03-05.
db.test.insert([
    {
        name: "name1",
        instances: [
            { value: 1, score: 2, date: ISODate("2015-03-04T00:00:00.000Z") },
            { value: 2, score: 5, date: ISODate("2015-04-01T00:00:00.000Z") },
            { value: 2.5, score: 9, date: ISODate("2015-03-05T00:00:00.000Z") }
        ]
    },
    {
        name: "name2",
        instances: [
            { value: 6, score: 3, date: ISODate("2015-03-05T00:00:00.000Z") },
            { value: 1, score: 6, date: ISODate("2015-03-04T00:00:00.000Z") },
            { value: 3.7, score: 5.2, date: ISODate("2015-02-04T00:00:00.000Z") }
        ]
    },
    {
        name: "name1",
        instances: [
            { value: 6, score: 3, date: ISODate("2015-03-05T00:00:00.000Z") },
            { value: 1, score: 6, date: ISODate("2015-03-04T00:00:00.000Z") },
            { value: 3.7, score: 5.2, date: ISODate("2015-02-04T00:00:00.000Z") }
        ]
    }
])
那么下面的聚合就可以完成这项工作:
// Aggregation pipeline that lists the (_id, name) of every document taking part
// in a (name, calendar day) group that holds more than one instance.
// The pipeline is a plain array of stages; it is handed to db.test.aggregate()
// below. (Wrapping it in a bare aggregate(...) call would reference an
// undefined global function.)
var pipeline = [
    // 1. Emit one document per element of the "instances" array.
    { "$unwind": "$instances" },
    // 2. Group by name and by the date parts of instances.date, counting the
    //    members of each group and collecting the documents being grouped.
    {
        "$group": {
            "_id": {
                "name": "$name",
                "year": { "$year": "$instances.date" },
                "month": { "$month": "$instances.date" },
                "day": { "$dayOfYear": "$instances.date" }
            },
            "count": { "$sum": 1 },
            "data": { "$addToSet": "$$ROOT" }
        }
    },
    // 3. Keep only the groups with more than one member.
    { "$match": { "count": { "$gt": 1 } } },
    // 4. Flatten the collected documents again.
    { "$unwind": "$data" },
    // 5. Collapse to one entry per distinct (name, _id) pair.
    {
        "$group": {
            "_id": {
                "name": "$data.name",
                "_id": "$data._id"
            }
        }
    },
    // 6. Lift the grouped key back to top-level "_id" and "name" fields.
    {
        "$project": {
            "_id": "$_id._id",
            "name": "$_id.name"
        }
    }
];
db.test.aggregate(pipeline);
输出:
/* 0 */
{
"result" : [
{
"_id" : ObjectId("55506d0a180e849972939056"),
"name" : "name1"
},
{
"_id" : ObjectId("55506d0a180e849972939058"),
"name" : "name1"
}
],
"ok" : 1
}
上面的聚合管道有一个 $unwind
操作作为第一步,它从输入文档中解构 instances
数组字段,为每个元素输出一个文档。每个输出文档用一个元素值替换数组。
下一个管道阶段 $group 按 "name" 和 "instances.date" 字段对文档进行分组(日期字段通过日期聚合运算符被拆分为三个字段),为每个分组计算 count 字段,并为每个唯一的 name 和 date(精确到天)输出一个文档。分组中还有一个额外的数组字段 data,它使用系统变量 $$ROOT 存储原始根文档,即聚合管道阶段当前正在处理的顶层文档。该根文档通过 $addToSet 数组运算符添加到数组中。
在管道的后续阶段,使用 $match 筛选出按名称和日期分组后出现重复的文档,条件是 count 大于 1。
然后在 data 字段上应用另一个 $unwind 操作,以提取重复文档实际的 _id 和 name;随后再次对它们分组,以进一步精简结果文档。
需要一个额外的 $project
管道阶段来通过修改字段来塑造您的最终文档结构。
使用聚合结果游标,然后使用forEach()
方法遍历结果并删除其他重复文档:
// Remove the duplicate documents reported by the pipeline, keeping the first
// document seen for each name.
var cur = db.test.aggregate(pipeline);
// Bookkeeping must live outside the callback: state declared inside it is
// re-initialised on every iteration, so a removal condition based on it would
// never become true.
var keptNames = {};
cur.forEach(function (doc){
    if (Object.prototype.hasOwnProperty.call(keptNames, doc.name)){
        // A document with this name has already been kept; delete this one.
        db.test.remove({"_id": doc._id});
    } else {
        keptNames[doc.name] = true;
    }
});
另一种选择是包含一个 $out
运算符作为最终管道阶段,它将聚合管道返回的文档写入指定的集合,然后您可以查询该集合并进行删除:
// Read the documents stored in db.outputcollection and remove the matching
// documents from db.test, keeping the first document seen for each name.
var cur = db.outputcollection.find();
// Bookkeeping must live outside the callback: state declared inside it is
// re-initialised on every iteration, so a removal condition based on it would
// never become true.
var keptNames = {};
cur.forEach(function (doc){
    if (Object.prototype.hasOwnProperty.call(keptNames, doc.name)){
        // A document with this name has already been kept; delete this one.
        db.test.remove({"_id": doc._id});
    } else {
        keptNames[doc.name] = true;
    }
});
我有一个 mongodb 集合,其结构类似于
[
{
name: "name1",
instances: [{value:1, score:2, date:<ISODate>},
{value:2, score:5, date:<ISODate>},
{value:2.5, score:9, date:<ISODate>},
...]
},
{
name: "name2",
instances: [{value:6, score:3, date:<ISODate>},
{value:1, score:6, date:<ISODate>},
{value:3.7, score:5.2, date:<ISODate>},
...]
},
...
]
我想查找同一个 name 下是否有两个(或更多)实例的日期落在同一天,并返回这些实例。
稍后我想删除其中一个实例以外的所有实例,但作为开始,我希望能够找到它们。
我试过按日期聚合和分组,但无法弄清楚如何只比较一天(而不是整个日期)。
如果我理解正确,您应该先 $unwind,然后按日期和实例进行 $group,并过滤掉只包含一个文档的分组。类似这样(我现在无法访问 MongoDB——请留意拼写错误):
// Count instances per (name, calendar day) and keep only the days that occur
// more than once for the same name.
db.coll.aggregate([
    // Emit one document per element of the "instances" array.
    {$unwind: "$instances"},
    // After $unwind the date lives at "instances.date"; there is no top-level
    // "date" field, so the date operators must read the nested path.
    {$group: { _id: { name:"$name", day:{$dayOfYear:"$instances.date"}, year:{$year:"$instances.date"}}, count: {$sum: 1} }},
    // Drop groups that contain a single instance.
    {$match: {count: {$gt: 1}}}
])
假设您在测试集合中插入了以下测试文档以用于演示目的:
// Demo fixture: three documents, each with three dated instances. The two
// documents named "name1" both contain instances dated 2015-03-04 and 2015-03-05.
db.test.insert([
    {
        name: "name1",
        instances: [
            { value: 1, score: 2, date: ISODate("2015-03-04T00:00:00.000Z") },
            { value: 2, score: 5, date: ISODate("2015-04-01T00:00:00.000Z") },
            { value: 2.5, score: 9, date: ISODate("2015-03-05T00:00:00.000Z") }
        ]
    },
    {
        name: "name2",
        instances: [
            { value: 6, score: 3, date: ISODate("2015-03-05T00:00:00.000Z") },
            { value: 1, score: 6, date: ISODate("2015-03-04T00:00:00.000Z") },
            { value: 3.7, score: 5.2, date: ISODate("2015-02-04T00:00:00.000Z") }
        ]
    },
    {
        name: "name1",
        instances: [
            { value: 6, score: 3, date: ISODate("2015-03-05T00:00:00.000Z") },
            { value: 1, score: 6, date: ISODate("2015-03-04T00:00:00.000Z") },
            { value: 3.7, score: 5.2, date: ISODate("2015-02-04T00:00:00.000Z") }
        ]
    }
])
那么下面的聚合就可以完成这项工作:
// Aggregation pipeline that lists the (_id, name) of every document taking part
// in a (name, calendar day) group that holds more than one instance.
// The pipeline is a plain array of stages; it is handed to db.test.aggregate()
// below. (Wrapping it in a bare aggregate(...) call would reference an
// undefined global function.)
var pipeline = [
    // 1. Emit one document per element of the "instances" array.
    { "$unwind": "$instances" },
    // 2. Group by name and by the date parts of instances.date, counting the
    //    members of each group and collecting the documents being grouped.
    {
        "$group": {
            "_id": {
                "name": "$name",
                "year": { "$year": "$instances.date" },
                "month": { "$month": "$instances.date" },
                "day": { "$dayOfYear": "$instances.date" }
            },
            "count": { "$sum": 1 },
            "data": { "$addToSet": "$$ROOT" }
        }
    },
    // 3. Keep only the groups with more than one member.
    { "$match": { "count": { "$gt": 1 } } },
    // 4. Flatten the collected documents again.
    { "$unwind": "$data" },
    // 5. Collapse to one entry per distinct (name, _id) pair.
    {
        "$group": {
            "_id": {
                "name": "$data.name",
                "_id": "$data._id"
            }
        }
    },
    // 6. Lift the grouped key back to top-level "_id" and "name" fields.
    {
        "$project": {
            "_id": "$_id._id",
            "name": "$_id.name"
        }
    }
];
db.test.aggregate(pipeline);
输出:
/* 0 */
{
"result" : [
{
"_id" : ObjectId("55506d0a180e849972939056"),
"name" : "name1"
},
{
"_id" : ObjectId("55506d0a180e849972939058"),
"name" : "name1"
}
],
"ok" : 1
}
上面的聚合管道有一个 $unwind
操作作为第一步,它从输入文档中解构 instances
数组字段,为每个元素输出一个文档。每个输出文档用一个元素值替换数组。
下一个管道阶段 $group 按 "name" 和 "instances.date" 字段对文档进行分组(日期字段通过日期聚合运算符被拆分为三个字段),为每个分组计算 count 字段,并为每个唯一的 name 和 date(精确到天)输出一个文档。分组中还有一个额外的数组字段 data,它使用系统变量 $$ROOT 存储原始根文档,即聚合管道阶段当前正在处理的顶层文档。该根文档通过 $addToSet 数组运算符添加到数组中。
在管道的后续阶段,使用 $match 筛选出按名称和日期分组后出现重复的文档,条件是 count 大于 1。
然后在 data 字段上应用另一个 $unwind 操作,以提取重复文档实际的 _id 和 name;随后再次对它们分组,以进一步精简结果文档。
需要一个额外的 $project
管道阶段来通过修改字段来塑造您的最终文档结构。
使用聚合结果游标,然后使用forEach()
方法遍历结果并删除其他重复文档:
// Remove the duplicate documents reported by the pipeline, keeping the first
// document seen for each name.
var cur = db.test.aggregate(pipeline);
// Bookkeeping must live outside the callback: state declared inside it is
// re-initialised on every iteration, so a removal condition based on it would
// never become true.
var keptNames = {};
cur.forEach(function (doc){
    if (Object.prototype.hasOwnProperty.call(keptNames, doc.name)){
        // A document with this name has already been kept; delete this one.
        db.test.remove({"_id": doc._id});
    } else {
        keptNames[doc.name] = true;
    }
});
另一种选择是包含一个 $out
运算符作为最终管道阶段,它将聚合管道返回的文档写入指定的集合,然后您可以查询该集合并进行删除:
// Read the documents stored in db.outputcollection and remove the matching
// documents from db.test, keeping the first document seen for each name.
var cur = db.outputcollection.find();
// Bookkeeping must live outside the callback: state declared inside it is
// re-initialised on every iteration, so a removal condition based on it would
// never become true.
var keptNames = {};
cur.forEach(function (doc){
    if (Object.prototype.hasOwnProperty.call(keptNames, doc.name)){
        // A document with this name has already been kept; delete this one.
        db.test.remove({"_id": doc._id});
    } else {
        keptNames[doc.name] = true;
    }
});