考虑到给定字段中的每个子字段，如何计算具有最大唯一值的文档数量？

Question

问题

鉴于此结构：

{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": b
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": c
                    },
                    {
                        "val1.2.2": d
                    }
                ]
            }
        ]
    }
}

考虑到给定字段中每个“子字段”中的所有“val”，我如何编写一个查询来计算具有最大唯一“val”的文档的数量？

需要考虑的事实：

“val”是数组中的元素
“子字段”也是数组中的一个元素
所有文档的“字段”、“子字段”和“val”字段名称相同
可能有 1 个或多个“val”

我对 NoSQL 有点陌生。在正常的 SQL 中，我可能会通过自连接来解决这个问题，但是在这里，即使可以使用 Aggregation，我也找不到方法来获得接近真正的解决方案。

案例

鉴于 a 是最大值... 此文档应该被计算在内：

{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": b
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": c
                    },
                    {
                        "val1.2.2": d
                    }
                ]
            }
        ]
    }
}

此文档不应该被计算在内：

{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": a
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": b
                    },
                    {
                        "val1.2.2": c
                    }
                ]
            }
        ]
    }
}

此文档也不应该被计算在内：

{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": b
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": a
                    },
                    {
                        "val1.2.2": c
                    }
                ]
            }
        ]
    }
}

此文档应该被计算在内（即使 b 重复出现）：

{
    "_id": ObjectId("56n606c39q49b80adfe6d17b")
    "data": {
        "field1": [
            {
                "subfield1.1": [
                    {
                        "val1.1.1": a
                    },
                    {
                        "val1.1.2": b
                    }
                ]
            },
            {
                "subfield1.2": [
                    {
                        "val1.2.1": b
                    },
                    {
                        "val1.2.2": c
                    }
                ]
            }
        ]
    }
}

欢迎任何想法。谢谢！

Answer 1

聚合绝对是您在这里所需要的。它可能需要一些修改，但像这样的东西可能（希望）对你有用：

// Aggregation pipeline that keeps one result per source document whose largest
// value occurs exactly once across all of its nested arrays. The pipeline ends
// with a $match, so it returns the qualifying documents themselves; the number
// of results is the requested count.
// NOTE(review): the paths below ("data.field1", "subfield1", "val1") are
// placeholders — confirm them against the real document schema before use.
db.collection.aggregate([
    /*
        Step 1: We need to unravel the multi-dimensional array first, otherwise we can't efficiently search for globally unique maximums.
    */

    // Unravel the outer array first: one output document per element of "data.field1".
    {$unwind: "$data.field1"},

    // Simplify the representation of the unwind result so that we have a flat field path ("vals") rather than a nested one.
    {$project: {
        vals: "$data.field1.subfield1"
    }},

    // Now unravel the inner array: one output document per element of "vals".
    {$unwind: "$vals"},

    // Another simplification step for the same reason as the previous projection; only "_id" and "val" remain.
    {$project: {
        val: "$vals.val1"
    }},

    /*
        Step 2: We want to create counts for array elements that are the same value from the same source document.
    */

    // Group by (source document ID, value) and add 1 to the total for each entry, so "count" is how often that value occurs in that document.
    {$group: {
        _id: {
            _id: "$_id",
            val: "$val"
        },
        count: {$sum: 1}
    }},

    /*
        Step 3: Once we have our counts, we can retrieve the maximum value for each source document.
    */

    // First, sort by descending value so that the maximum value is the first we encounter.
    {$sort: {
        "_id.val": -1
    }},

    // With the entries in descending order, grab the first entry for each source document; this relies on the preceding $sort to make $first the maximum.
    {$group: {
        _id: "$_id._id",
        max: {
            $first: {
                val: "$_id.val",
                count: "$count"
            }
        }
    }},

    // Unnest the data so "val" and "count" are top-level fields of each result.
    {$project: {
        val: "$max.val",
        count: "$max.count"
    }},

    /*
        Step 4: Now we just need to limit our results.
    */

    // Any result with a count of 1 is a unique maximum; documents whose maximum repeats are dropped.
    {$match: {
        count: 1
    }}
])

这无疑是一个复杂的查询，而且在不了解您的实际文档结构的情况下，很难保证它能正常工作。话虽如此，代码注释中应该有足够的信息来帮助您修改它以满足您的需要。如果您在运行时遇到任何问题，请告诉我，我会尽我所能帮助您解决。

Answer 2

我想发布一个不同的解决方案，因为在我使用的数据库中，它比使用聚合框架的方案运行得稍快一些；这是一个 JavaScript 解决方案。

use myDB;

// Function that determines if a "field" has a unique maximum value.
/**
 * Decides whether a candidate list describes a unique maximum.
 *
 * The caller is expected to place the largest value(s) at the front of the
 * list, so only the first two entries are inspected.
 *
 * @param {Array} list - Candidate values, largest first.
 * @returns {boolean} false for an empty list, true for a single entry,
 *   otherwise whether the first two entries differ.
 */
function validate(list){
    const size = list.length;

    // Nothing collected: there is no maximum at all.
    if (size == 0){
        return false;
    }

    // A single candidate is trivially unique.
    if (size == 1){
        return true;
    }

    // Loose inequality is intentional: it keeps the comparison semantics
    // callers already rely on.
    return list[0] != list[1];
}

/**
 * Iterates over all the "val" entries in every "subfield" array within a
 * document's "field" array and reports whether the overall maximum is unique.
 *
 * Candidate maximums are collected in a list whose index 0 always holds the
 * largest value seen so far. A value equal to the current maximum is also
 * placed at the front, so a repeated maximum ends up at indexes 0 and 1,
 * which is exactly what "validate()" inspects.
 *
 * @param {Array<Object>} field - Elements of the "field" array; an element is
 *   only inspected when it has a non-empty "subfield" array.
 * @returns {boolean} Whatever "validate()" returns for the candidate list.
 */
function verifySubfields(field){
    const list = [];
    field.forEach(fieldElement => {
        // Skip elements that have no "subfield" array or an empty one.
        if (fieldElement.subfield && fieldElement.subfield[0]){
            fieldElement.subfield.forEach(subfieldElement => {
                const val = subfieldElement.val;

                if (list.length === 0){
                    // First value seen becomes the initial maximum.
                    list.push(val);
                }
                // ">=" rather than ">" so that a repeated maximum is recorded
                // twice and can be detected by "validate()".
                else if (val >= list[0]){
                    list.unshift(val);
                }
            });
        }
    });

    return validate(list);
}

/**
 * Checks a whole document by delegating its "data.field" array to
 * "verifySubfields()".
 *
 * @param {Object} doc - Document that carries a "data.field" property.
 * @returns {*} Whatever "verifySubfields()" returns for that field.
 */
function verifyField(doc){
    const { field } = doc.data;
    return verifySubfields(field);
}

// Walk every document that has a "data.field.subfield" path, tallying how many
// were visited and how many of them pass "verifyField()", then print both totals.
let iterations = 0;
let positives = 0;

const cursor = db.myCollection.find({ "data.field.subfield": {$exists: true} });
cursor.forEach(doc => {
    iterations += 1;
    if (verifyField(doc)) {
        positives += 1;
    }
});

print(`\nTotal: ${iterations} \nPositives: ${positives} \n`);

注：使用 mongo < myFile.js 运行。

已采纳的解决方案的评论中提到的问题，可以通过多次调用 "verifySubfields()" 并在 "verifyField()" 中验证这些结果来解决；届时该函数也许可以改名为 "verifyFields()"。

考虑到给定字段中的每个子字段，如何计算具有最大唯一值的文档数量？

How to count number of documents that have a maximum unique value considering every subfield within a given field?

mongodb

nosql-aggregation

问题

案例