使用 MongoDB 聚合将集合合并到固定大小
Merge collections up to a fixed size using MongoDB Aggregation
我有一个看起来像这样的集合:
{
"_id" : id1,
"field1" : 11,
"field2": 101,
"localityID" : 27
}
{
"_id" : id2,
"field1" : 22,
"field2": 202,
"localityID" : 27
}
{
"_id" : id3,
"field1" : 33,
"field2": 303,
"localityID" : 27
}
{
"_id" : id4,
"field1" : 44,
"field2": 404,
"localityID" : 27
}
{
"_id" : id5,
"field1" : 55,
"field2": 505,
"localityID" : 27
}
{
"_id" : id6,
"field1" : 66,
"field2": 606,
"localityID" : 61
}
{
"_id" : id7,
"field1" : 77,
"field2": 707,
"localityID" : 61
}
用例:我想以每批 3 条记录的方式,检索并处理具有相同 localityID 的记录。
出于跟踪目的,我还想记录每个批次中具体处理了哪些记录。
为此,我想使用 MongoDB 的聚合框架,把具有相同 localityID 的记录合并到同一个文档中,
但每个文档最多只包含固定数量(如上所述为 3)的记录。
我想将上面的集合更新为:
{
"_id" : "id111",
"batchId" : "batch1",
"localityID": 27,
"batches": [
{
"field1" : 11,
"field2": 101
},
{
"field1" : 22,
"field2": 202
},
{
"field1" : 33,
"field2": 303
}
]
}
{
"_id" : "id222",
"batchId" : "batch2",
"localityID": 27,
"batches": [
{
"field1" : 44,
"field2": 404
},
{
"field1" : 55,
"field2": 505
}
]
}
{
"_id" : "id333",
"batchId" : "batch1",
"localityID": 61,
"batches": [
{
"field1" : 66,
"field2": 606
},
{
"field1" : 77,
"field2": 707
}
]
}
我尝试了几种聚合函数的组合,如下所示,但未能获得所需的结果。
(这能够将具有相同 localityID
的所有记录组合在一起,但仅在一个文档中,而不是批量 )
db.old_collection.aggregate([
  // Stage 1: collapse the collection to one document per distinct localityID.
  {
    $group: { _id: "$localityID" }
  },

  // Stage 2: join back onto old_collection and attach every record of that
  // locality (field1/field2 only) as a single "batches" array.
  {
    $lookup: {
      from: "old_collection",
      let: { lid: "$_id" },
      pipeline: [
        {
          $match: {
            $expr: { $eq: ["$localityID", "$$lid"] }
          }
        },
        {
          $project: { _id: 0, field1: 1, field2: 1 }
        }
      ],
      as: "batches"
    }
  },

  // Stage 3: write the result set to new_collection.
  {
    $out: "new_collection"
  }
])
上述聚合函数产生以下结果 -
{
"_id" : "id111",
"batchId" : "batch1",
"localityID": 27,
"batches": [
{
"field1" : 11,
"field2": 101
},
{
"field1" : 22,
"field2": 202
},
{
"field1" : 33,
"field2": 303
},
{
"field1" : 44,
"field2": 404
},
{
"field1" : 55,
"field2": 505
}
]
}
{
"_id" : "id333",
"batchId" : "batch1",
"localityID": 61,
"batches": [
{
"field1" : 66,
"field2": 606
},
{
"field1" : 77,
"field2": 707
}
]
}
使用 Mongo 的聚合框架是否可行,或者我使用其他东西会更好吗?
思路取自(原文链接缺失)。
您可以使用 $range 生成一个索引数组,并将其步长(step)参数设置为某个 bucketSize。
然后只需使用 $slice 取出大小为 bucketSize 的子数组,试试这个:
// Maximum number of records stored in one batch document.
const bucketSize = 3;

db.old_collection.aggregate([
  // Gather every record of a locality into one array. $push appends records
  // in the order they reach this stage; put a $sort in front of $group if
  // batch membership must be reproducible between runs.
  {
    $group: {
      _id: "$localityID",
      localityID: { $first: "$localityID" },
      batches: {
        $push: {
          field1: "$field1",
          field2: "$field2"
        }
      }
    }
  },
  // Cut the array into chunks: $range yields the start offsets
  // 0, bucketSize, 2 * bucketSize, ... and $slice takes up to bucketSize
  // elements from each offset (the last chunk may be shorter).
  {
    $project: {
      _id: 0,
      localityID: "$localityID",
      batches: {
        $map: {
          input: { $range: [0, { $size: "$batches" }, bucketSize] },
          as: "index",
          in: { $slice: ["$batches", "$$index", bucketSize] }
        }
      }
    }
  },
  // Emit one document per chunk; batchId temporarily holds the chunk's
  // zero-based position inside its locality.
  {
    $unwind: {
      path: "$batches",
      includeArrayIndex: "batchId"
    }
  },
  // $sort is optional. You can remove it if not required.
  // It runs while batchId is still numeric so that batch 10 sorts after
  // batch 2; sorting the "batchN" strings would order them lexicographically.
  {
    $sort: {
      localityID: 1,
      batchId: 1
    }
  },
  // Turn the zero-based index into the one-based label "batch1", "batch2", ...
  {
    $addFields: {
      batchId: {
        $concat: [
          "batch",
          { $toString: { $add: ["$batchId", 1] } }
        ]
      }
    }
  },
  // $out has to be the final stage of the pipeline.
  { $out: "new_collection" }
]);
输出
[
{
"_id": ObjectId("..."),
"localityID": 27,
"batches": [
{
"field1": 11,
"field2": 101
},
{
"field1": 22,
"field2": 202
},
{
"field1": 33,
"field2": 303
}
],
"batchId": "batch1"
},
{
"_id": ObjectId("..."),
"localityID": 27,
"batches": [
{
"field1": 44,
"field2": 404
},
{
"field1": 55,
"field2": 505
}
],
"batchId": "batch2"
},
{
"_id": ObjectId("..."),
"localityID": 61,
"batches": [
{
"field1": 66,
"field2": 606
},
{
"field1": 77,
"field2": 707
}
],
"batchId": "batch1"
}
]
如前所述,我不明白字段 batchId
的逻辑。除此之外,简单的解决方案可能是这个:
db.collection.aggregate([
  // Collect every record of a locality into one "batches" array.
  {
    $group: {
      _id: "$localityID",
      batches: { $push: { field1: "$field1", field2: "$field2" } }
    }
  },
  {
    $project: {
      localityID: "$_id",
      // Keep only the first 3 records. The position argument of $slice is
      // zero-based, so it must be 0; starting at 1 would skip the first record.
      batches: { $slice: ["$batches", 0, 3] }
    }
  }
])
我有一个看起来像这样的集合:
{
"_id" : id1,
"field1" : 11,
"field2": 101,
"localityID" : 27
}
{
"_id" : id2,
"field1" : 22,
"field2": 202,
"localityID" : 27
}
{
"_id" : id3,
"field1" : 33,
"field2": 303,
"localityID" : 27
}
{
"_id" : id4,
"field1" : 44,
"field2": 404,
"localityID" : 27
}
{
"_id" : id5,
"field1" : 55,
"field2": 505,
"localityID" : 27
}
{
"_id" : id6,
"field1" : 66,
"field2": 606,
"localityID" : 61
}
{
"_id" : id7,
"field1" : 77,
"field2": 707,
"localityID" : 61
}
用例:我想以每批 3 条记录的方式,检索并处理具有相同 localityID 的记录。
出于跟踪目的,我还想记录每个批次中具体处理了哪些记录。
为此,我想使用 MongoDB 的聚合框架,把具有相同 localityID 的记录合并到同一个文档中,
但每个文档最多只包含固定数量(如上所述为 3)的记录。
我想将上面的集合更新为:
{
"_id" : "id111",
"batchId" : "batch1",
"localityID": 27,
"batches": [
{
"field1" : 11,
"field2": 101
},
{
"field1" : 22,
"field2": 202
},
{
"field1" : 33,
"field2": 303
}
]
}
{
"_id" : "id222",
"batchId" : "batch2",
"localityID": 27,
"batches": [
{
"field1" : 44,
"field2": 404
},
{
"field1" : 55,
"field2": 505
}
]
}
{
"_id" : "id333",
"batchId" : "batch1",
"localityID": 61,
"batches": [
{
"field1" : 66,
"field2": 606
},
{
"field1" : 77,
"field2": 707
}
]
}
我尝试了几种聚合函数的组合,如下所示,但未能获得所需的结果。
(这能够将具有相同 localityID
的所有记录组合在一起,但仅在一个文档中,而不是批量 )
db.old_collection.aggregate([
  // Stage 1: collapse the collection to one document per distinct localityID.
  {
    $group: { _id: "$localityID" }
  },

  // Stage 2: join back onto old_collection and attach every record of that
  // locality (field1/field2 only) as a single "batches" array.
  {
    $lookup: {
      from: "old_collection",
      let: { lid: "$_id" },
      pipeline: [
        {
          $match: {
            $expr: { $eq: ["$localityID", "$$lid"] }
          }
        },
        {
          $project: { _id: 0, field1: 1, field2: 1 }
        }
      ],
      as: "batches"
    }
  },

  // Stage 3: write the result set to new_collection.
  {
    $out: "new_collection"
  }
])
上述聚合函数产生以下结果 -
{
"_id" : "id111",
"batchId" : "batch1",
"localityID": 27,
"batches": [
{
"field1" : 11,
"field2": 101
},
{
"field1" : 22,
"field2": 202
},
{
"field1" : 33,
"field2": 303
},
{
"field1" : 44,
"field2": 404
},
{
"field1" : 55,
"field2": 505
}
]
}
{
"_id" : "id333",
"batchId" : "batch1",
"localityID": 61,
"batches": [
{
"field1" : 66,
"field2": 606
},
{
"field1" : 77,
"field2": 707
}
]
}
使用 Mongo 的聚合框架是否可行,或者我使用其他东西会更好吗?
思路取自(原文链接缺失)。
您可以使用 $range 生成一个索引数组,并将其步长(step)参数设置为某个 bucketSize。
然后只需使用 $slice 取出大小为 bucketSize 的子数组,试试这个:
// Maximum number of records stored in one batch document.
const bucketSize = 3;

db.old_collection.aggregate([
  // Gather every record of a locality into one array. $push appends records
  // in the order they reach this stage; put a $sort in front of $group if
  // batch membership must be reproducible between runs.
  {
    $group: {
      _id: "$localityID",
      localityID: { $first: "$localityID" },
      batches: {
        $push: {
          field1: "$field1",
          field2: "$field2"
        }
      }
    }
  },
  // Cut the array into chunks: $range yields the start offsets
  // 0, bucketSize, 2 * bucketSize, ... and $slice takes up to bucketSize
  // elements from each offset (the last chunk may be shorter).
  {
    $project: {
      _id: 0,
      localityID: "$localityID",
      batches: {
        $map: {
          input: { $range: [0, { $size: "$batches" }, bucketSize] },
          as: "index",
          in: { $slice: ["$batches", "$$index", bucketSize] }
        }
      }
    }
  },
  // Emit one document per chunk; batchId temporarily holds the chunk's
  // zero-based position inside its locality.
  {
    $unwind: {
      path: "$batches",
      includeArrayIndex: "batchId"
    }
  },
  // $sort is optional. You can remove it if not required.
  // It runs while batchId is still numeric so that batch 10 sorts after
  // batch 2; sorting the "batchN" strings would order them lexicographically.
  {
    $sort: {
      localityID: 1,
      batchId: 1
    }
  },
  // Turn the zero-based index into the one-based label "batch1", "batch2", ...
  {
    $addFields: {
      batchId: {
        $concat: [
          "batch",
          { $toString: { $add: ["$batchId", 1] } }
        ]
      }
    }
  },
  // $out has to be the final stage of the pipeline.
  { $out: "new_collection" }
]);
输出
[
{
"_id": ObjectId("..."),
"localityID": 27,
"batches": [
{
"field1": 11,
"field2": 101
},
{
"field1": 22,
"field2": 202
},
{
"field1": 33,
"field2": 303
}
],
"batchId": "batch1"
},
{
"_id": ObjectId("..."),
"localityID": 27,
"batches": [
{
"field1": 44,
"field2": 404
},
{
"field1": 55,
"field2": 505
}
],
"batchId": "batch2"
},
{
"_id": ObjectId("..."),
"localityID": 61,
"batches": [
{
"field1": 66,
"field2": 606
},
{
"field1": 77,
"field2": 707
}
],
"batchId": "batch1"
}
]
如前所述,我不明白字段 batchId
的逻辑。除此之外,简单的解决方案可能是这个:
db.collection.aggregate([
  // Collect every record of a locality into one "batches" array.
  {
    $group: {
      _id: "$localityID",
      batches: { $push: { field1: "$field1", field2: "$field2" } }
    }
  },
  {
    $project: {
      localityID: "$_id",
      // Keep only the first 3 records. The position argument of $slice is
      // zero-based, so it must be 0; starting at 1 would skip the first record.
      batches: { $slice: ["$batches", 0, 3] }
    }
  }
])