如何在 MongoDB 查询中展开(unwind)两个数组

How to unwind two arrays in mongoquery

我在 mongodb 中有这样一个集合:

{
"_id" : ObjectId("5490a00879dc6a138dcefb0f"),
"Date" : 20141012,
"Type" : "Twitter",
"Entities" : [ 
    {
        "ID" : 2,
        "Name" : "test1",
        "Sentiment" : {
            "Value" : 0.1,
            "Neutral" : 12
        }
     }
],
"Topics" : [ 
    {
        "ID" : 1,
        "Name" : "Test2",
        "Sentiment" : {
            "Value" : 0.5,
            "Neutral" : 1
        }
    }
]
}

现在我需要展开主题和实体数组,然后我想按日期分组并对情绪的所有值求和,所以我按如下方式进行:

    // Attempt to unwind both arrays, then group by Date and sum the Entities sentiment values.
    DBObject unwind = new BasicDBObject("$unwind", "$Entities"); 
    // NOTE(review): this reuses the "$unwind" key on the same object, so it presumably
    // replaces "$Entities" instead of adding a second $unwind stage — confirm.
    unwind.put("$unwind", "$Topics");
    collectionG = db.getCollection("GraphDataCollection");
    // $group spec: one output document per distinct Date.
    DBObject groupFields = new BasicDBObject( "_id", "$Date");
    groupFields.put("value", new BasicDBObject( "$sum", "$Entities.Sentiment.Value"));
    DBObject groupBy = new BasicDBObject("$group", groupFields );
    // `where` is defined outside this snippet and is passed as the first pipeline stage.
    AggregationOutput output = collectionG.aggregate(where,unwind, groupBy);

现在的问题是,对于情绪值的总和,只返回 0,但如果我删除以下行:

    unwind.put("$unwind", "$Topics");

它就能正常工作。所以我的问题是:如何在一次聚合中展开两个数组?

更新:

我更改了我的代码如下:

// Second attempt: one $unwind stage object per array, then group by Date and sum both value fields.
// NOTE(review): unwinding both arrays in sequence pairs every Entities element with every
// Topics element, so each sum may count values more than once — confirm with multi-element data.
DBObject unwind = new BasicDBObject("$unwind", "$Entities"); // "$unwind" converts object with array into many duplicate objects, each with one from array
    DBObject unwindT = new BasicDBObject("$unwind", "$Topics"); // "$unwind" converts object with array into many duplicate objects, each with one from array
    collectionG = db.getCollection("GraphDataCollection");
    // $group spec: one output document per distinct Date, with a separate sum per array.
    DBObject groupFields = new BasicDBObject( "_id", "$Date");
   groupFields.put("value", new BasicDBObject( "$sum", "$Entities.Sentiment.Value"));
    groupFields.put("value1", new BasicDBObject( "$sum", "$Topics.Sentiment.Value"));
    DBObject groupBy = new BasicDBObject("$group", groupFields );
    // This list is not passed to aggregate() below; the stages are passed individually.
    List<DBObject> pipeline = Arrays.asList(unwind, unwindT);
    // Sort the grouped results by ascending _id (the Date).
    DBObject sort = new BasicDBObject("$sort", new BasicDBObject("_id", 1));
    // `where` is defined outside this snippet and is passed as the first pipeline stage.
    AggregationOutput output = collectionG.aggregate(where,unwind,unwindT, groupBy,sort);

但问题是,一旦我添加以下行:

groupFields.put("value1", new BasicDBObject( "$sum", "$Topics.Sentiment.Value"));

返回的 value1 和 value 的数字不正确,我认为我没有正确展开。有人可以帮忙吗?

这是 mongo 查询(不是 java):

// Combined total: per Date, sum Entities and Topics sentiment values together.
// NOTE(review): two consecutive $unwind stages pair every Entities element with every
// Topics element; verify the totals when either array holds more than one item.
db.test.aggregate(
   [
     // One document per Entities element.
     {
         $unwind : '$Entities'
     },
     // One document per (Entities element, Topics element) pair.
     {
         $unwind : '$Topics'
     },     
     // Group by Date and accumulate the per-row sum of both sentiment values.
     {
       $group:
         {
           _id: {'Date' : '$Date'},
           sum: { $sum: { $add : ['$Entities.Sentiment.Value', '$Topics.Sentiment.Value']} }
         }
     }
   ]
)

// Separate totals: per Date, one sum for Entities and one for Topics.
// NOTE(review): two consecutive $unwind stages pair every Entities element with every
// Topics element; verify the totals when either array holds more than one item.
db.test.aggregate(
   [
     // One document per Entities element.
     {
         $unwind : '$Entities'
     },
     // One document per (Entities element, Topics element) pair.
     {
         $unwind : '$Topics'
     },     
     // Group by Date; value1 accumulates Entities values, value2 accumulates Topics values.
     {
       $group:
         {
           _id: {'Date' : '$Date'},
           value1: { $sum: '$Entities.Sentiment.Value'},
           value2: { $sum: '$Topics.Sentiment.Value'}
         }
     }
   ]
)

Java:

/**
 * Unwinds the {@code Entities} and {@code Topics} arrays, groups the resulting rows by
 * {@code Date}, and prints the sum of
 * {@code Entities.Sentiment.Value + Topics.Sentiment.Value} over those rows.
 *
 * @param coll collection to run the aggregation against
 */
private static void sumOfTopicsAndEntities(DBCollection coll) {
    DBObject unwind1 = new BasicDBObject("$unwind", "$Entities");
    DBObject unwind2 = new BasicDBObject("$unwind", "$Topics");

    // Operands of $add: the sentiment value taken from each unwound array element.
    ArrayList<String> fields = new ArrayList<String>();
    fields.add("$Entities.Sentiment.Value");
    fields.add("$Topics.Sentiment.Value");

    // $group spec: one output document per Date, summing the per-row $add result.
    DBObject groupFields = new BasicDBObject( "_id", "$Date");
    BasicDBObject add = new BasicDBObject( "$add", fields);

    groupFields.put("sum", new BasicDBObject( "$sum", add));
    DBObject group = new BasicDBObject("$group", groupFields);

    // run aggregation
    AggregationOutput output = coll.aggregate(unwind1, unwind2, group);

    // result: { "serverUsed" : "/127.0.0.1:27017" , "result" : [ { "_id" : 2.0141012E7 , "sum" : 0.6}] , "ok" : 1.0}
    System.out.println(output);
}

/**
 * Unwinds the {@code Entities} and {@code Topics} arrays, groups the resulting rows by
 * {@code Date}, and prints two independent totals: {@code value1} for the Entities
 * sentiment values and {@code value2} for the Topics sentiment values.
 *
 * @param coll collection to run the aggregation against
 */
private static void seperatedValues(DBCollection coll) {
    // $group spec: keyed by Date, with one $sum accumulator per array.
    BasicDBObject perDateTotals = new BasicDBObject("_id", "$Date")
            .append("value1", new BasicDBObject("$sum", "$Entities.Sentiment.Value"))
            .append("value2", new BasicDBObject("$sum", "$Topics.Sentiment.Value"));

    // Pipeline: unwind Entities, unwind Topics, then group.
    AggregationOutput output = coll.aggregate(
            new BasicDBObject("$unwind", "$Entities"),
            new BasicDBObject("$unwind", "$Topics"),
            new BasicDBObject("$group", perDateTotals));

    // result: { "serverUsed" : "/127.0.0.1:27017" , "result" : [ { "_id" : 2.0141012E7 , "value1" : 0.1 , "value2" : 0.5}] , "ok" : 1.0}
    System.out.println(output);
}

这是一个容易出错的查询,因为关键都在细节里,您应该彻底测试。好的测试用例往往来自不同条件下的不同数据;这里明显的疏漏在于,样本数据中每个数组都只有一个元素。

在现实世界中,这些字段之所以是数组,是因为您打算在其中包含多个条目。出于这个原因,简单地处理两个 $unwind 管道阶段是行不通的,因为它会将每个文档的第一个数组中的项目数乘以第二个数组中的项目数。

因此,考虑到这一点的更好的测试数据表示如下:

{
    "_id" : ObjectId("5490a00879dc6a138dcefb0f"),
    "Date" : 20141012,
    "Type" : "Twitter",
    "Entities" : [
            {
                    "ID" : 2,
                    "Name" : "test1",
                    "Sentiment" : {
                            "Value" : 0.1,
                            "Neutral" : 12
                    }
            }
    ],
    "Topics" : [
            {
                    "ID" : 1,
                    "Name" : "Test2",
                    "Sentiment" : {
                            "Value" : 0.5,
                            "Neutral" : 1
                    }
            },
            {
                    "ID" : 3,
                    "Name" : "Test3",
                    "Sentiment" : {
                            "Value" : 0.4,
                            "Neutral" : 1
                    }
            }
    ]
}

要对文档中的两个数组正确执行此操作,您需要按类型区分条目,并且只累加对应的成员。为便于阅读,先给出带注释的 JSON 序列化形式:

// Aggregation pipeline (annotated JSON form) that sums every Entities and Topics
// sentiment value once per Date, even when the arrays hold several items.
[
    // Unwind both arrays, produces duplicates
    { "$unwind": "$Entities" },
    { "$unwind": "$Topics" },

    // Add another field to discern type as an array
    { "$project": {
        "Date": 1,
        "Entities": 1,
        "Topics": 1,
        "select": { "$literal": [ "E", "T" ] }
    }},

    // Unwind that array as well
    { "$unwind": "$select" },


    // Group in documents by individual array ID values and per select condition
    // makes everything unique again
    { "$group": {
        "_id": {
            "_id": "$_id",
            "Date": "$Date",
            "innerId": {
               "$cond": [
                   { "$eq": [ "$select", "E" ] },
                   "$Entities.ID",
                   "$Topics.ID"
               ]
            }
        },
        "value": {
            "$first": {
                "$cond": [
                   { "$eq": [ "$select", "E" ] },
                   "$Entities.Sentiment.Value",
                   "$Topics.Sentiment.Value"
                ]
            }
        }
    }},

    //Now just sum the values per date grouping
    { "$group": {
        "_id": "$_id.Date",
        "value": { "$sum": "$value" }
    }}
]

还有另一种稍微冗长的方法,但我认为内部数组 "ID" 字段值是唯一的,至少在文档中是这样,这应该没问题。整个过程实际上是将两个单独的文档属性合并到一个单一字段中,并处理这些是数组的事实。

因此,您先把数组展开,给每个文档打上可选类型的标记,然后再次把它们复制展开。此时本质上是每个文档、每个数组成员各有一条记录,您检查匹配的类型并从相应的数组中取值。到这一步,每个数组成员都对应一个文档和一个单一的 "value" 字段,该字段根据所选类型保存 *.Sentiment.Value 中的相应值;总之,现在所有的值都在,而且没有重复。剩下要做的只是对得到的 value 字段求和。

事实上,这里要学习的主要课程是您应该首先将其记录为单个数组,结构如下:

{
    "_id" : ObjectId("5490a00879dc6a138dcefb0f"),
    "Date" : 20141012,
    "Type" : "Twitter",
    "Data" : [
            {
                    "ID" : 2,
                    "Name" : "test1",
                    "Sentiment" : {
                            "Value" : 0.1,
                            "Neutral" : 12
                    },
                    "Class": "Entity"
            },
            {
                    "ID" : 1,
                    "Name" : "Test2",
                    "Sentiment" : {
                            "Value" : 0.5,
                            "Neutral" : 1
                    },
                    "Class": "Topic"
            },
            {
                    "ID" : 3,
                    "Name" : "Test3",
                    "Sentiment" : {
                            "Value" : 0.4,
                            "Neutral" : 1
                    },
                    "Class": "Topic"
            }
    ]
}

这将是一个简单的问题,只需在单个数组上处理 $unwind 一次,然后将所有值相加即可。如果您想单独处理数据 "Class",那么您可以过滤它或使用条件语句。但是对于大多数操作来说,以这种方式简单地构建起来要容易得多。

将其翻译成 Java 很简单,但以防万一您在过程中迷失方向:

    // Stages 1-2: unwind both arrays; this pairs every Entities element with every Topics element.
    DBObject unwind1 = new BasicDBObject("$unwind", "$Entities");
    DBObject unwind2 = new BasicDBObject("$unwind", "$Topics");

    // Stage 3: keep the needed fields and add a constant ["E", "T"] marker array.
    DBObject project = new BasicDBObject("$project",
        new BasicDBObject( "Date", 1 )
            .append( "Entities", 1)
            .append( "Topics", 1)
            .append( "select", 
                new BasicDBObject( "$literal", new String[]{ "E", "T" })
            )
        );

    // Stage 4: unwind the marker array. The field path needs the "$" prefix,
    // matching { "$unwind": "$select" } in the JSON form of this pipeline.
    DBObject unwind3 = new BasicDBObject("$unwind", "$select");

    // Stage 5: group per document, Date and inner array ID (chosen by the marker),
    // keeping the first matching sentiment value so each array member counts once.
    DBObject group1 = new BasicDBObject("$group",
        new BasicDBObject("_id",
           new BasicDBObject("_id","$_id")
                .append("Date", "$Date")
                .append("innerId",
                    new BasicDBObject("$cond",
                        new Object[]{
                            new BasicDBObject("$eq", new String[]{"$select", "E"}),
                            "$Entities.ID",
                            "$Topics.ID"
                        }
                    )
                )
        )
        .append("value",
            new BasicDBObject("$first",
                new BasicDBObject("$cond",
                    new Object[]{
                        new BasicDBObject("$eq", new String[]{"$select", "E"}),
                        "$Entities.Sentiment.Value",
                        "$Topics.Sentiment.Value"
                    }
                )
            )
        )
    );

    // Stage 6: sum the de-duplicated values per Date.
    DBObject group2 = new BasicDBObject("$group",
        new BasicDBObject("_id", "$_id.Date")
            .append("value", new BasicDBObject("$sum","$value"))
    );

    AggregationOutput output = coll.aggregate(unwind1,unwind2,project,unwind3,group1,group2);

再补充一点。虽然你现在可能已经知道了,但 $literal 运算符是在 MongoDB 2.6 及更高版本中引入的。对于较早的服务器版本,有一个未记录的 $const 运算符,实际作用相同;如果代码可能需要在早期版本的 MongoDB 服务器上运行,可以将二者互换使用。

另一种方法,

  • 展开(Unwind)Entities 数组。
  • 按 _id 分组(Group),得到 Entities 的总和。
  • 展开(Unwind)Topics 数组。
  • 按 _id 分组(Group),得到 Topics 的总和。
  • 投影(Project)出所需字段,其中包含 Topics 与 Entities 情绪值之和。
  • 按 Date 分组(Group),得到净总和。

这样,每个管道中的文档数量最少,并且不会涉及太多的自连接。

聚合码:

// Alternative pipeline: unwind and re-group each array separately, then combine per Date.
db.collection.aggregate([
// One document per Entities element.
{$unwind:"$Entities"},
// Collapse back to one document per _id, summing the Entities values and
// carrying Date and the Topics array forward.
{$group:{"_id":"$_id",
         "Date":{$first:"$Date"},
         "Topics":{$first:"$Topics"},
         "EntitiesSum":{$sum:"$Entities.Sentiment.Value"}}},
// One document per Topics element.
{$unwind:"$Topics"},
// Collapse back to one document per _id, summing the Topics values and
// carrying Date and EntitiesSum forward.
{$group:{"_id":"$_id",
         "Date":{$first:"$Date"},
         "EntitiesSum":{$first:"$EntitiesSum"},
         "TopicsSum":{$sum:"$Topics.Sentiment.Value"}}},
// Drop _id and add indSum, the per-document total of both sums.
{$project:{"_id":0,"Date":1,"EntitiesSum":1,"TopicsSum":1,
           "indSum":{$add:["$EntitiesSum","$TopicsSum"]}}},
// Group by Date and total each of the three per-document sums.
{$group:{"_id":"$Date",
         "EntitiesSentimentSum":{$sum:"$EntitiesSum"},
         "TopicsSentimentSum":{$sum:"$TopicsSum"},
         "netSentimentSum":{$sum:"$indSum"}}}
])

Java等价物:

     DBObject unwindEntities = new BasicDBObject("$unwind","$Entities");

     DBObject groupSameIdEntities = new BasicDBObject("_id","$_id");
     groupSameIdEntities.put("Date", new BasicDBObject("$first","$Date"));
     groupSameIdEntities.put("Topics", new BasicDBObject("$first","$Topics"));
     groupSameIdEntities.put("EntitiesSum", 
                    new BasicDBObject("$sum","$Entities.Sentiment.Value"));


     DBObject unwindTopics = new BasicDBObject("$unwind","$Topics");

     DBObject groupSameIdTopics = new BasicDBObject("_id","$_id");
     groupSameIdTopics.put("Date", new BasicDBObject("$first","$Date"));
     groupSameIdTopics.put("EntitiesSum", 
                         new BasicDBObject("$first","$EntitiesSum"));
     groupSameIdTopics.put("TopicsSum",
                        new BasicDBObject("$sum","$Topics.Sentiment.Value"));

     DBObject project = new BasicDBObject("_id",0);
     project.put("Date",1);
     project.put("EntitiesSum",1);
     project.put("TopicsSum",1);
     project.put("netSumPerId",
             new BasicDBObject("$add",
                   new String[]{"$EntitiesSum","$TopicsSum"}));

     DBObject groupByDate = new BasicDBObject("_id","$Date");
     groupByDate.put("EntitiesSentimentSum", 
                     new BasicDBObject("$sum","$EntitiesSum"));
     groupByDate.put("TopicsSentimentSum", 
                     new BasicDBObject("$sum","$TopicsSum"));
     groupByDate.put("netSentimentSum", 
                      new BasicDBObject("$sum","$netSumPerId"));

     AggregationOutput output = col.aggregate(unwindEntities,
                                new BasicDBObject("$group",
                                             groupSameIdEntities),
                                unwindTopics,
                                new BasicDBObject("$group",groupSameIdTopics),
                                new BasicDBObject("$project",project),
                                new BasicDBObject("$group",groupByDate));

示例输出(基于两个文档):

{ "_id" : 2.0141012E7, 
"EntitiesSentimentSum" : 0.30000000000000004 ,
"TopicsSentimentSum" : 1.2 , 
"netSentimentSum" : 1.5}

将日期字段保存为 ISODate()