如何统计在给定字段的所有子字段范围内,最大值唯一(只出现一次)的文档数量?
How to count number of documents that have a maximum unique value considering every subfield within a given field?
问题
鉴于此结构:
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": b
}
]
},
{
"subfield1.2": [
{
"val1.2.1": c
},
{
"val1.2.2": d
}
]
}
]
}
}
考虑到给定字段中每个“子字段”中的所有“val”,我如何编写一个查询来计算具有最大唯一“val”的文档的数量?
需要考虑的事实:
- “val”是数组中的元素
- “子字段”也是数组中的一个元素
- 所有文档的“字段”、“子字段”和“val”字段名称相同
- 可能有 1 个或多个“val”
我对 NoSQL 有点陌生。在正常的 SQL 中,我可能会通过自连接来解决这个问题,但是在这里,即使可以使用 Aggregation,我也找不到方法来获得接近真正的解决方案。
案例
假设 a 是最大值……
此文档应该被计算在内:
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": b
}
]
},
{
"subfield1.2": [
{
"val1.2.1": c
},
{
"val1.2.2": d
}
]
}
]
}
}
此文档不应该被计算在内(最大值 a 在同一个子字段中重复出现):
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": a
}
]
},
{
"subfield1.2": [
{
"val1.2.1": b
},
{
"val1.2.2": c
}
]
}
]
}
}
此文档同样不应该被计算在内(最大值 a 在不同的子字段中重复出现):
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": b
}
]
},
{
"subfield1.2": [
{
"val1.2.1": a
},
{
"val1.2.2": c
}
]
}
]
}
}
此文档应该被计算在内(即使 b 重复出现,因为 b 不是最大值):
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": b
}
]
},
{
"subfield1.2": [
{
"val1.2.1": b
},
{
"val1.2.2": c
}
]
}
]
}
}
欢迎任何想法。谢谢!
聚合绝对是您在这里所需要的。它可能需要一些修改,但像这样的东西可能(希望)对你有用:
db.collection.aggregate([
    /*
     * Phase 1 - flatten. The values live two array levels deep, so both
     * levels are unwound to get one pipeline document per value.
     */
    // One document per element of the outer array.
    { "$unwind": "$data.field1" },
    // Lift the inner array to a short top-level name ("vals").
    { "$project": { "vals": "$data.field1.subfield1" } },
    // One document per element of the inner array.
    { "$unwind": "$vals" },
    // Lift the value itself to a short top-level name ("val").
    { "$project": { "val": "$vals.val1" } },

    /*
     * Phase 2 - count occurrences. Group on the pair (source document id,
     * value); "count" is how many times that value occurred in that document.
     */
    {
        "$group": {
            "_id": { "_id": "$_id", "val": "$val" },
            "count": { "$sum": 1 }
        }
    },

    /*
     * Phase 3 - pick each source document's maximum. Sorting by value in
     * descending order puts the largest value first, so taking the first
     * entry per source document id selects the maximum and its count.
     */
    { "$sort": { "_id.val": -1 } },
    {
        "$group": {
            "_id": "$_id._id",
            "max": {
                "$first": { "val": "$_id.val", "count": "$count" }
            }
        }
    },
    // Un-nest the "max" sub-document for a flatter result.
    { "$project": { "val": "$max.val", "count": "$max.count" } },

    /*
     * Phase 4 - filter. A maximum that occurred exactly once is a unique
     * maximum; everything else is dropped.
     */
    { "$match": { "count": 1 } }
])
这无疑是一个复杂的查询,而且在不了解您实际文档结构的情况下,很难保证它能正确运行。不过,查询中的注释应该提供了足够的信息,帮助您按需修改。如果您在运行时遇到任何问题,请告诉我,我会尽力帮您解决。
我想再发布一个不同的解决方案,因为(在我使用的数据库中)它的运行速度比使用聚合框架的方案稍快一些;这是一个 JavaScript 解决方案。
use myDB;
/**
 * Decides whether a "field" has a unique maximum value.
 *
 * Only the first two entries of the list are inspected: the list passes
 * when it has exactly one entry, or when its first two entries differ.
 *
 * @param {Array} list - Candidate maximum values, newest candidate first.
 * @returns {boolean} false for an empty list, true for a single entry,
 *   otherwise whether list[0] differs from list[1].
 */
function validate(list){
    const size = list.length;
    // An empty list has no maximum at all.
    if (size == 0){
        return false;
    }
    // Loose (in)equality is kept on purpose so results match the
    // comparisons this script has always used.
    return size == 1 ? true : list[0] != list[1];
}
/**
 * Iterates over all the "values" in every "subfield" within a "field" of a
 * document and reports whether the largest value occurs only once.
 *
 * Each value that is greater than or equal to the current front of the
 * list is placed at the front. After the scan, list[0] is the maximum and
 * list[1] (if any) is the previous candidate; the two are equal exactly
 * when the maximum was seen more than once. The list is then checked by
 * "validate()".
 *
 * @param {Array<Object>} field - Elements that may carry a "subfield" array
 *   whose entries expose a "val" property.
 * @returns {boolean} Result of "validate()" on the collected candidates.
 */
function verifySubfields(field){
    const list = [];
    field.forEach(fieldElement => {
        // Skip elements whose subfield is missing or empty.
        if (fieldElement.subfield && fieldElement.subfield[0]){
            fieldElement.subfield.forEach(subfieldElement => {
                const val = subfieldElement.val;
                // Fix: compare the current value ("val"). The previous code
                // compared an undeclared identifier "a", which threw a
                // ReferenceError as soon as a second value was processed.
                if (list.length === 0 || val >= list[0]){
                    list.unshift(val);
                }
            });
        }
    });
    return validate(list);
}
/**
 * Runs the unique-maximum check on a document's "data.field" value.
 *
 * @param {Object} doc - Document exposing "data.field".
 * @returns {*} Whatever "verifySubfields()" returns for that field.
 */
function verifyField(doc){
    const { field } = doc.data;
    return verifySubfields(field);
};
// Tally how many matched documents were visited ("iterations") and how many
// of them passed "verifyField()" ("cont"), then print both totals.
let cont = 0;
let iterations = 0;
db.myCollection
    .find({ "data.field.subfield": {$exists: true} })
    .forEach(doc => {
        iterations += 1;
        if (verifyField(doc)){
            cont += 1;
        }
    });
print(`\nTotal: ${iterations} \nPositives: ${cont} \n`);
注:使用 mongo < myFile.js 运行该脚本。
已采纳的解决方案的评论中提到的问题,可以通过多次调用 "verifySubfields()" 并在 "verifyField()" 中验证这些结果来解决;届时该函数可以更名为 "verifyFields()"。
问题
鉴于此结构:
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": b
}
]
},
{
"subfield1.2": [
{
"val1.2.1": c
},
{
"val1.2.2": d
}
]
}
]
}
}
考虑到给定字段中每个“子字段”中的所有“val”,我如何编写一个查询来计算具有最大唯一“val”的文档的数量?
需要考虑的事实:
- “val”是数组中的元素
- “子字段”也是数组中的一个元素
- 所有文档的“字段”、“子字段”和“val”字段名称相同
- 可能有 1 个或多个“val”
我对 NoSQL 有点陌生。在正常的 SQL 中,我可能会通过自连接来解决这个问题,但是在这里,即使可以使用 Aggregation,我也找不到方法来获得接近真正的解决方案。
案例
假设 a 是最大值……此文档应该被计算在内:
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": b
}
]
},
{
"subfield1.2": [
{
"val1.2.1": c
},
{
"val1.2.2": d
}
]
}
]
}
}
此文档不应该被计算在内(最大值 a 在同一个子字段中重复出现):
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": a
}
]
},
{
"subfield1.2": [
{
"val1.2.1": b
},
{
"val1.2.2": c
}
]
}
]
}
}
此文档同样不应该被计算在内(最大值 a 在不同的子字段中重复出现):
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": b
}
]
},
{
"subfield1.2": [
{
"val1.2.1": a
},
{
"val1.2.2": c
}
]
}
]
}
}
此文档应该被计算在内(即使 b 重复出现,因为 b 不是最大值):
{
"_id": ObjectId("56n606c39q49b80adfe6d17b")
"data": {
"field1": [
{
"subfield1.1": [
{
"val1.1.1": a
},
{
"val1.1.2": b
}
]
},
{
"subfield1.2": [
{
"val1.2.1": b
},
{
"val1.2.2": c
}
]
}
]
}
}
欢迎任何想法。谢谢!
聚合绝对是您在这里所需要的。它可能需要一些修改,但像这样的东西可能(希望)对你有用:
db.collection.aggregate([
    /*
     * Phase 1 - flatten. The values live two array levels deep, so both
     * levels are unwound to get one pipeline document per value.
     */
    // One document per element of the outer array.
    { "$unwind": "$data.field1" },
    // Lift the inner array to a short top-level name ("vals").
    { "$project": { "vals": "$data.field1.subfield1" } },
    // One document per element of the inner array.
    { "$unwind": "$vals" },
    // Lift the value itself to a short top-level name ("val").
    { "$project": { "val": "$vals.val1" } },

    /*
     * Phase 2 - count occurrences. Group on the pair (source document id,
     * value); "count" is how many times that value occurred in that document.
     */
    {
        "$group": {
            "_id": { "_id": "$_id", "val": "$val" },
            "count": { "$sum": 1 }
        }
    },

    /*
     * Phase 3 - pick each source document's maximum. Sorting by value in
     * descending order puts the largest value first, so taking the first
     * entry per source document id selects the maximum and its count.
     */
    { "$sort": { "_id.val": -1 } },
    {
        "$group": {
            "_id": "$_id._id",
            "max": {
                "$first": { "val": "$_id.val", "count": "$count" }
            }
        }
    },
    // Un-nest the "max" sub-document for a flatter result.
    { "$project": { "val": "$max.val", "count": "$max.count" } },

    /*
     * Phase 4 - filter. A maximum that occurred exactly once is a unique
     * maximum; everything else is dropped.
     */
    { "$match": { "count": 1 } }
])
这无疑是一个复杂的查询,而且在不了解您实际文档结构的情况下,很难保证它能正确运行。不过,查询中的注释应该提供了足够的信息,帮助您按需修改。如果您在运行时遇到任何问题,请告诉我,我会尽力帮您解决。
我想再发布一个不同的解决方案,因为(在我使用的数据库中)它的运行速度比使用聚合框架的方案稍快一些;这是一个 JavaScript 解决方案。
use myDB;
/**
 * Decides whether a "field" has a unique maximum value.
 *
 * Only the first two entries of the list are inspected: the list passes
 * when it has exactly one entry, or when its first two entries differ.
 *
 * @param {Array} list - Candidate maximum values, newest candidate first.
 * @returns {boolean} false for an empty list, true for a single entry,
 *   otherwise whether list[0] differs from list[1].
 */
function validate(list){
    const size = list.length;
    // An empty list has no maximum at all.
    if (size == 0){
        return false;
    }
    // Loose (in)equality is kept on purpose so results match the
    // comparisons this script has always used.
    return size == 1 ? true : list[0] != list[1];
}
/**
 * Iterates over all the "values" in every "subfield" within a "field" of a
 * document and reports whether the largest value occurs only once.
 *
 * Each value that is greater than or equal to the current front of the
 * list is placed at the front. After the scan, list[0] is the maximum and
 * list[1] (if any) is the previous candidate; the two are equal exactly
 * when the maximum was seen more than once. The list is then checked by
 * "validate()".
 *
 * @param {Array<Object>} field - Elements that may carry a "subfield" array
 *   whose entries expose a "val" property.
 * @returns {boolean} Result of "validate()" on the collected candidates.
 */
function verifySubfields(field){
    const list = [];
    field.forEach(fieldElement => {
        // Skip elements whose subfield is missing or empty.
        if (fieldElement.subfield && fieldElement.subfield[0]){
            fieldElement.subfield.forEach(subfieldElement => {
                const val = subfieldElement.val;
                // Fix: compare the current value ("val"). The previous code
                // compared an undeclared identifier "a", which threw a
                // ReferenceError as soon as a second value was processed.
                if (list.length === 0 || val >= list[0]){
                    list.unshift(val);
                }
            });
        }
    });
    return validate(list);
}
/**
 * Runs the unique-maximum check on a document's "data.field" value.
 *
 * @param {Object} doc - Document exposing "data.field".
 * @returns {*} Whatever "verifySubfields()" returns for that field.
 */
function verifyField(doc){
    const { field } = doc.data;
    return verifySubfields(field);
};
// Tally how many matched documents were visited ("iterations") and how many
// of them passed "verifyField()" ("cont"), then print both totals.
let cont = 0;
let iterations = 0;
db.myCollection
    .find({ "data.field.subfield": {$exists: true} })
    .forEach(doc => {
        iterations += 1;
        if (verifyField(doc)){
            cont += 1;
        }
    });
print(`\nTotal: ${iterations} \nPositives: ${cont} \n`);
注:使用 mongo < myFile.js 运行该脚本。
已采纳的解决方案的评论中提到的问题,可以通过多次调用 "verifySubfields()" 并在 "verifyField()" 中验证这些结果来解决;届时该函数可以更名为 "verifyFields()"。