ElasticSearch:聚合一组收集的结果

ElasticSearch: Aggregate Over a Collected Set of Results

假设我有一套……汉堡……

对于每个汉堡,我都有一组与汉堡的每个成分相关的图像。

不幸的是,这些组件的结构没有任何一致性(这套结构不是我设计的)。

下面是两个文档的例子:

{
    "bunsResource": {
        "image": {
            "url": "./buns_1.png",
            "who": "Sam"
        },
        "buns": [
            {
                "image": {
                    "url": "./top-bun_1.png",
                    "who": "Jim"
                }
            },
            {
                "image": {
                    "url": "./bottom-bun_1.png",
                    "who": "Sarah"
                }
            }
        ]
    },
    "pattyResource": {
        "image": {
            "url": "./patties_1.png",
            "who": "Kathy"
        },
        "patties": [
            {
                "image": {
                    "url": "./patty_1.jpg",
                    "who": "Kathy"
                }
            }
        ]
    }
},
{
    "bunsResource": {
        "image": {
            "url": "./buns_2.png",
            "who": "Jim"
        },
        "buns": [
            {
                "image": {
                    "url": "./top-bun_2.png",
                    "who": "Jim"
                }
            },
            {
                "image": {
                    "url": "./bottom-bun_2.png",
                    "who": "Kathy"
                }
            }
        ]
    },
    "pattyResource": {
        "image": {
            "url": "./patties_1.png",
            "who": "Kathy"
        },
        "patties": [
            {
                "image": {
                    "url": "./patty_1.jpg",
                    "who": "Kathy"
                }
            }
        ]
    }
}

我需要的是一组 photographer / image count(摄影师 / 图片数量)的结果:

{
    "who": "Sam",
    "count": 1
},
{
    "who": "Jim",
    "count": 3
},
{
    "who": "Sarah",
    "count": 2
},
{
    "who": "Kathy",
    "count": 2
}

请注意,这里统计的是唯一(去重后)的图片数量!

我还没弄清楚如何实现这个...

我假设需要先把每个 burger 解析为一组唯一的 url / who,然后在此基础上进行聚合,但我不知道如何为每个汉堡得到 url / who 的扁平化列表。

这取决于 patties 和 buns 数组是否为 nested 类型。如果不是,那就很简单:您只需运行一个 terms 聚合,并使用脚本从文档中的各个位置收集所有 who 字段:

POST not-nested/_search 
{
  "size": 0,
  "aggs": {
    "script": {
      "terms": {
        "script": {
          "source": """
          // Gather every photographer name ("who") found at the four known
          // locations of the document into a single list; the terms
          // aggregation then buckets on each returned value.
          // Doc values are themselves lists, so they are handed to addAll
          // directly: the `.values` accessor was deprecated in Elasticsearch
          // 6.x and is no longer available from 7.0 on.
          def list = new ArrayList();
          list.addAll(doc['pattyResource.image.who.keyword']);
          list.addAll(doc['bunsResource.image.who.keyword']);
          list.addAll(doc['bunsResource.buns.image.who.keyword']);
          list.addAll(doc['pattyResource.patties.image.who.keyword']);
          return list;
          """
        }
      }
    }
  }
}

这将返回如下结果:

  "aggregations" : {
    "script" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "Jim",
          "doc_count" : 2
        },
        {
          "key" : "Kathy",
          "doc_count" : 2
        },
        {
          "key" : "Sam",
          "doc_count" : 1
        },
        {
          "key" : "Sarah",
          "doc_count" : 1
        }
      ]
    }
  }

但是,如果它是嵌套的,事情会变得更加复杂,因为您需要一些客户端工作来计算最终计数,但我们可以通过一些聚合来简化客户端工作:

POST nested/_search 
{
  "size": 0,
  "aggs": {
    "bunsWho": {
      "terms": {
        "field": "bunsResource.image.who.keyword"
      }
    },
    "bunsWhoNested": {
      "nested": {
        "path": "bunsResource.buns"
      },
      "aggs": {
        "who": {
          "terms": {
            "field": "bunsResource.buns.image.who.keyword"
          }
        }
      }
    },
    "pattiesWho": {
      "terms": {
        "field": "pattyResource.image.who.keyword"
      }
    },
    "pattiesWhoNested": {
      "nested": {
        "path": "pattyResource.patties"
      },
      "aggs": {
        "who": {
          "terms": {
            "field": "pattyResource.patties.image.who.keyword"
          }
        }
      }
    }
  }
}

这将返回如下结果:

  "aggregations" : {
    "pattiesWho" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "Kathy",
          "doc_count" : 2
        }
      ]
    },
    "bunsWhoNested" : {
      "doc_count" : 4,
      "who" : {
        "doc_count_error_upper_bound" : 0,
        "sum_other_doc_count" : 0,
        "buckets" : [
          {
            "key" : "Jim",
            "doc_count" : 2
          },
          {
            "key" : "Kathy",
            "doc_count" : 1
          },
          {
            "key" : "Sarah",
            "doc_count" : 1
          }
        ]
      }
    },
    "pattiesWhoNested" : {
      "doc_count" : 2,
      "who" : {
        "doc_count_error_upper_bound" : 0,
        "sum_other_doc_count" : 0,
        "buckets" : [
          {
            "key" : "Kathy",
            "doc_count" : 2
          }
        ]
      }
    },
    "bunsWho" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "Jim",
          "doc_count" : 1
        },
        {
          "key" : "Sam",
          "doc_count" : 1
        }
      ]
    }
  }

然后您可以编写一些简单的客户端逻辑(下面是 Node.js 的示例代码),把各个聚合的计数相加。注意:这样得到的是每位摄影师的图片出现次数,在多个文档中重复出现的同一张图片会被重复计数:

// Running total of bucket counts per photographer, keyed by the bucket key.
const whos = {};

/**
 * Adds `count` to the running total stored for `who`.
 *
 * Only own properties of `whos` are treated as an existing total. A bare
 * `whos[who] || 0` would pick up inherited members such as "constructor"
 * or "toString" and turn the sum into a string.
 *
 * @param {string} who - Bucket key (the photographer name).
 * @param {number} count - Bucket doc_count to add to the total.
 */
const recordWho = function (who, count) {
    const previous = Object.prototype.hasOwnProperty.call(whos, who) ? whos[who] : 0;
    whos[who] = previous + count;
};

// The two top-level terms aggregations expose `buckets` directly, while the
// two nested aggregations wrap their terms aggregation under `who`.
const aggs = resp.aggregations;
const bucketLists = [
    aggs.pattiesWho.buckets,
    aggs.pattiesWhoNested.who.buckets,
    aggs.bunsWho.buckets,
    aggs.bunsWhoNested.who.buckets
];

bucketLists.forEach(function (buckets) {
    buckets.forEach(function (b) {
        recordWho(b.key, b.doc_count);
    });
});

console.log(whos);

=>

{ Kathy: 5, Jim: 3, Sam: 1, Sarah: 1 }