MongoDB 嵌套对象聚合计数
MongoDB nested object aggregation counting
我有一组高度嵌套的 MongoDB 文档,我想统计(每个文档中)符合给定条件的子文档数量。例如:
{"_id":{"chr":"20","pos":"14371","ref":"A","alt":"G"},
"studies":[
{
"study_id":"Study1",
"samples":[
{
"sample_id":"NA00001",
"formatdata":[
{"GT":"1|0","GQ":48,"DP":8,"HQ":[51,51]}
]
},
{
"sample_id":"NA00002",
"formatdata":[
{"GT":"0|0","GQ":48,"DP":8,"HQ":[51,51]}
]
}
]
}
]
}
{"_id":{"chr":"20","pos":"14372","ref":"T","alt":"AA"},
"studies":[
{
"study_id":"Study3",
"samples":[
{
"sample_id":"SAMPLE1",
"formatdata":[
{"GT":"1|0","GQ":48,"DP":8,"HQ":[51,51]}
]
},
{
"sample_id":"SAMPLE2",
"formatdata":[
{"GT":"1|0","GQ":48,"DP":8,"HQ":[51,51]}
]
}
]
}
]
}
{"_id":{"chr":"20","pos":"14373","ref":"C","alt":"A"},
"studies":[
{
"study_id":"Study3",
"samples":[
{
"sample_id":"SAMPLE3",
"formatdata":[
{"GT":"0|0","GQ":48,"DP":8,"HQ":[51,51]}
]
},
{
"sample_id":"SAMPLE7",
"formatdata":[
{"GT":"0|0","GQ":48,"DP":8,"HQ":[51,51]}
]
}
]
}
]
}
我想知道每个文档中有多少个子文档包含 GT:"1|0":在本例中,第一个文档为 1 个,第二个文档为 2 个,第三个文档为 0 个。我尝试过 $unwind 和聚合函数,但显然做得不对。当我尝试按 "GT" 字段统计子文档时,mongo 报错:
db.collection.aggregate([{$group: {"$studies.samples.formatdata.GT":1,_id:0}}])
原因是 $group 中的字段名不能包含“.”;但如果我把带点号的路径去掉:
db.collection.aggregate([{$group: {"$GT":1,_id:0}}])
它抱怨是因为“$GT 不能是运算符名称”
有什么想法吗?
处理数组时需要使用 $unwind,这里有三层嵌套数组,因此需要使用 3 次:
db.collection.aggregate([
    // Unwind every nested array level so that each formatdata entry
    // becomes its own pipeline document.
    { "$unwind": "$studies" },
    { "$unwind": "$studies.samples" },
    // The array field is named "formatdata" in the stored documents;
    // unwinding a non-existent path would discard every document.
    { "$unwind": "$studies.samples.formatdata" },
    // Group on the GT value to obtain the number of entries per GT
    // across the whole collection.
    { "$group": {
        "_id": "$studies.samples.formatdata.GT",
        "count": { "$sum": 1 }
    }}
])
理想情况下,您应该先过滤输入。可以在 $unwind 处理之前和之后都使用 $match,并使用 $regex 来匹配数据以“1”开头的文档。
db.collection.aggregate([
    // Match first to exclude documents in which no array member has a GT
    // value starting with "1" (this shrinks the input before unwinding).
    { "$match": { "studies.samples.formatdata.GT": /^1/ } },
    // Unwind every nested array level so that each formatdata entry
    // becomes its own pipeline document.
    { "$unwind": "$studies" },
    { "$unwind": "$studies.samples" },
    // The array field is named "formatdata" in the stored documents;
    // unwinding a non-existent path would discard every document.
    { "$unwind": "$studies.samples.formatdata" },
    // Match again: after unwinding, this drops the individual entries
    // whose GT does not start with "1".
    { "$match": { "studies.samples.formatdata.GT": /^1/ } },
    // Group on the original document _id plus the GT value to obtain the
    // matched count per document. Documents with no matching entry were
    // removed by the $match stages and therefore produce no result row.
    { "$group": {
        "_id": {
            "_id": "$_id",
            "key": "$studies.samples.formatdata.GT"
        },
        "count": { "$sum": 1 }
    }}
])
请注意,在所有情况下,以美元符号 "$" 为前缀的条目都是引用文档属性的“变量”,它们只能出现在右侧,作为“值”使用。左侧的“键”必须指定为普通字符串,不能使用变量来命名键。
https://mongoplayground.net/p/DpX6cFhR_mm
db.collection.aggregate([
    // Emit one pipeline document per element of the "tags" array.
    { "$unwind": "$tags" },
    // Keep only the unwound tags whose name is "Canada" or "ABC".
    { "$match": {
        "$or": [
            { "tags.name": "Canada" },
            { "tags.name": "ABC" }
        ]
    }},
    // A null _id puts everything into a single group, so "count" is the
    // total number of matching tags.
    { "$group": {
        "_id": null,
        "count": { "$sum": 1 }
    }}
])
我有一组高度嵌套的 MongoDB 文档,我想统计(每个文档中)符合给定条件的子文档数量。例如:
{"_id":{"chr":"20","pos":"14371","ref":"A","alt":"G"},
"studies":[
{
"study_id":"Study1",
"samples":[
{
"sample_id":"NA00001",
"formatdata":[
{"GT":"1|0","GQ":48,"DP":8,"HQ":[51,51]}
]
},
{
"sample_id":"NA00002",
"formatdata":[
{"GT":"0|0","GQ":48,"DP":8,"HQ":[51,51]}
]
}
]
}
]
}
{"_id":{"chr":"20","pos":"14372","ref":"T","alt":"AA"},
"studies":[
{
"study_id":"Study3",
"samples":[
{
"sample_id":"SAMPLE1",
"formatdata":[
{"GT":"1|0","GQ":48,"DP":8,"HQ":[51,51]}
]
},
{
"sample_id":"SAMPLE2",
"formatdata":[
{"GT":"1|0","GQ":48,"DP":8,"HQ":[51,51]}
]
}
]
}
]
}
{"_id":{"chr":"20","pos":"14373","ref":"C","alt":"A"},
"studies":[
{
"study_id":"Study3",
"samples":[
{
"sample_id":"SAMPLE3",
"formatdata":[
{"GT":"0|0","GQ":48,"DP":8,"HQ":[51,51]}
]
},
{
"sample_id":"SAMPLE7",
"formatdata":[
{"GT":"0|0","GQ":48,"DP":8,"HQ":[51,51]}
]
}
]
}
]
}
我想知道每个文档中有多少个子文档包含 GT:"1|0":在本例中,第一个文档为 1 个,第二个文档为 2 个,第三个文档为 0 个。我尝试过 $unwind 和聚合函数,但显然做得不对。当我尝试按 "GT" 字段统计子文档时,mongo 报错:
db.collection.aggregate([{$group: {"$studies.samples.formatdata.GT":1,_id:0}}])
原因是 $group 中的字段名不能包含“.”;但如果我把带点号的路径去掉:
db.collection.aggregate([{$group: {"$GT":1,_id:0}}])
它抱怨是因为“$GT 不能是运算符名称”
有什么想法吗?
处理数组时需要使用 $unwind,这里有三层嵌套数组,因此需要使用 3 次:
db.collection.aggregate([
    // Unwind every nested array level so that each formatdata entry
    // becomes its own pipeline document.
    { "$unwind": "$studies" },
    { "$unwind": "$studies.samples" },
    // The array field is named "formatdata" in the stored documents;
    // unwinding a non-existent path would discard every document.
    { "$unwind": "$studies.samples.formatdata" },
    // Group on the GT value to obtain the number of entries per GT
    // across the whole collection.
    { "$group": {
        "_id": "$studies.samples.formatdata.GT",
        "count": { "$sum": 1 }
    }}
])
理想情况下,您应该先过滤输入。可以在 $unwind 处理之前和之后都使用 $match,并使用 $regex 来匹配数据以“1”开头的文档。
db.collection.aggregate([
    // Match first to exclude documents in which no array member has a GT
    // value starting with "1" (this shrinks the input before unwinding).
    { "$match": { "studies.samples.formatdata.GT": /^1/ } },
    // Unwind every nested array level so that each formatdata entry
    // becomes its own pipeline document.
    { "$unwind": "$studies" },
    { "$unwind": "$studies.samples" },
    // The array field is named "formatdata" in the stored documents;
    // unwinding a non-existent path would discard every document.
    { "$unwind": "$studies.samples.formatdata" },
    // Match again: after unwinding, this drops the individual entries
    // whose GT does not start with "1".
    { "$match": { "studies.samples.formatdata.GT": /^1/ } },
    // Group on the original document _id plus the GT value to obtain the
    // matched count per document. Documents with no matching entry were
    // removed by the $match stages and therefore produce no result row.
    { "$group": {
        "_id": {
            "_id": "$_id",
            "key": "$studies.samples.formatdata.GT"
        },
        "count": { "$sum": 1 }
    }}
])
请注意,在所有情况下,以美元符号 "$" 为前缀的条目都是引用文档属性的“变量”,它们只能出现在右侧,作为“值”使用。左侧的“键”必须指定为普通字符串,不能使用变量来命名键。
https://mongoplayground.net/p/DpX6cFhR_mm
db.collection.aggregate([
    // Emit one pipeline document per element of the "tags" array.
    { "$unwind": "$tags" },
    // Keep only the unwound tags whose name is "Canada" or "ABC".
    { "$match": {
        "$or": [
            { "tags.name": "Canada" },
            { "tags.name": "ABC" }
        ]
    }},
    // A null _id puts everything into a single group, so "count" is the
    // total number of matching tags.
    { "$group": {
        "_id": null,
        "count": { "$sum": 1 }
    }}
])