Elastic Search 中的聚合和排序
Aggregation and Sorting in Elastic Search
我想在 Elasticsearch 中对查询的聚合结果进行排序。
等效的 SQL 查询: select col1, col2, sum(col3) from table group by col1, col2 order by sum(col3) desc;
我尝试了以下查询,它能返回结果,但排序顺序不是我期望的:
{
"from": 0,
"size": 0,
"_source": {
"includes": [
"col1",
"col2",
"SUM"
],
"excludes": []
},
"stored_fields": [
"col1",
"col2"
],
"aggregations": {
"col1": {
"terms": {
"field": "col1",
"size": 200,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false
},
"aggregations": {
"col2": {
"terms": {
"field": "col2",
"size": 10,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false
},
"aggregations": {
"SUM_0": {
"sum": {
"field": "col3"
}
},
"col3_bucket_sort": {
"bucket_sort": {
"sort": [
{ "SUM_0": { "order": "desc" } }
],
"size": 3
}
}
}
}
}
}
}
}
示例索引数据:
{
"_index": "my_index",
"_type": "products",
"_id": "OJfBSXUB0GzAt2o_zVdS",
"_score": 1.0,
"_source": {
"product_name": "car",
"product_type": "retail",
"qty": 5
}
}
{
"_index": "my_index",
"_type": "report",
"_id": "OpfBSXUB0GzAt2o_zVfG1",
"_score": 1.0,
"_source": {
"product_name": "bike",
"product_type": "retail",
"qty": 5
}
},
{
"_index": "my_index",
"_type": "report",
"_id": "OpfBSXUB0GzAt2o_zVfG",
"_score": 1.0,
"_source": {
"product_name": "car",
"product_type": "retail",
"qty": 3
}
},
{
"_index": "my_index",
"_type": "report",
"_id": "OpfBSXUB0GzAt2o_zVfG2",
"_score": 1.0,
"_source": {
"product_name": "bike",
"product_type": "retail",
"qty": 1
}
}
预期输出: 我想根据字段 product_name 和 product_type 对文档进行聚合(分组),并按 sum(qty) 降序排序。
等效的 SQL 查询: select product_name, product_type, sum(qty) from product_table group by product_name, product_type order by sum(qty) desc;
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"product_name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "car",
"doc_count": 2,
"product_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "retail",
"doc_count": 2,
"SUM_0": {
"value":8
}
}
]
}
},
{
"key": "bike",
"doc_count": 2,
"product_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "retail",
"doc_count": 2,
"SUM_0": {
"value": 6
}
}
]
}
}
]
}
}
}
但我实际得到的是下面的输出: 文档已成功聚合,但按 sum(qty) 的排序没有生效。
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"product_name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "bike",
"doc_count": 2,
"product_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "retail",
"doc_count": 2,
"SUM_0": {
"value": 6
}
}
]
}
},
{
"key": "car",
"doc_count": 2,
"product_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "retail",
"doc_count": 2,
"SUM_0": {
"value":8
}
}
]
}
}
]
}
}
}
以下摘自 Elasticsearch 官方文档:
The bucket_sort aggregation, like all pipeline aggregations, is
executed after all other non-pipeline aggregations. This means the
sorting only applies to whatever buckets are already returned from the
parent aggregation. For example, if the parent aggregation is terms
and its size is set to 10, the bucket_sort will only sort over those
10 returned term buckets.
以上就是您的查询未给出预期结果的原因。
由于您按 col1、col2 对数据进行分组(即使用了两层 terms 聚合),而 bucket_sort 放在内层 col2 聚合之下,它只会对每个 col1 桶内部的 col2 桶排序,不会改变外层 col1 桶的顺序,因此基于 sum 聚合的排序结果不符合预期。
您需要使用 max bucket 聚合: 这是一个兄弟(sibling)管道聚合,它会在兄弟聚合中找出指定指标值最大的桶,并输出该最大值以及对应桶的键。
然后再基于该结果执行 bucket sort 聚合。
下面给出一个可运行的示例,包括索引数据(与问题中使用的相同)、搜索查询和搜索结果。
搜索查询:
{
"size": 0,
"aggs": {
"agg1": {
"terms": {
"field": "product_name.keyword"
},
"aggs": {
"agg2": {
"terms": {
"field": "product_type.keyword"
},
"aggregations": {
"SUM_0": {
"sum": {
"field": "qty"
}
}
}
},
"sum_max_bucket": {
"max_bucket": {
"buckets_path": "agg2>SUM_0" <-- note this
}
},
"sum_bucket_sort": {
"bucket_sort": {
"sort": {
"sum_max_bucket": {
"order": "desc"
}
}
}
}
}
}
}
}
搜索结果:
"aggregations": {
"agg1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "car",
"doc_count": 2,
"agg2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "retail",
"doc_count": 2,
"SUM_0": {
"value": 8.0 <-- note this
}
}
]
},
"sum_max_bucket": {
"value": 8.0,
"keys": [
"retail"
]
}
},
{
"key": "bike",
"doc_count": 2,
"agg2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "retail",
"doc_count": 2,
"SUM_0": {
"value": 6.0 <-- note this
}
}
]
},
"sum_max_bucket": {
"value": 6.0,
"keys": [
"retail"
]
}
}
]
}
我想在 Elasticsearch 中对查询的聚合结果进行排序。
等效的 SQL 查询: select col1, col2, sum(col3) from table group by col1, col2 order by sum(col3) desc;
我尝试了以下查询,它能返回结果,但排序顺序不是我期望的:
{
"from": 0,
"size": 0,
"_source": {
"includes": [
"col1",
"col2",
"SUM"
],
"excludes": []
},
"stored_fields": [
"col1",
"col2"
],
"aggregations": {
"col1": {
"terms": {
"field": "col1",
"size": 200,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false
},
"aggregations": {
"col2": {
"terms": {
"field": "col2",
"size": 10,
"min_doc_count": 1,
"shard_min_doc_count": 0,
"show_term_doc_count_error": false
},
"aggregations": {
"SUM_0": {
"sum": {
"field": "col3"
}
},
"col3_bucket_sort": {
"bucket_sort": {
"sort": [
{ "SUM_0": { "order": "desc" } }
],
"size": 3
}
}
}
}
}
}
}
}
示例索引数据:
{
"_index": "my_index",
"_type": "products",
"_id": "OJfBSXUB0GzAt2o_zVdS",
"_score": 1.0,
"_source": {
"product_name": "car",
"product_type": "retail",
"qty": 5
}
}
{
"_index": "my_index",
"_type": "report",
"_id": "OpfBSXUB0GzAt2o_zVfG1",
"_score": 1.0,
"_source": {
"product_name": "bike",
"product_type": "retail",
"qty": 5
}
},
{
"_index": "my_index",
"_type": "report",
"_id": "OpfBSXUB0GzAt2o_zVfG",
"_score": 1.0,
"_source": {
"product_name": "car",
"product_type": "retail",
"qty": 3
}
},
{
"_index": "my_index",
"_type": "report",
"_id": "OpfBSXUB0GzAt2o_zVfG2",
"_score": 1.0,
"_source": {
"product_name": "bike",
"product_type": "retail",
"qty": 1
}
}
预期输出: 我想根据字段 product_name 和 product_type 对文档进行聚合(分组),并按 sum(qty) 降序排序。
等效的 SQL 查询: select product_name, product_type, sum(qty) from product_table group by product_name, product_type order by sum(qty) desc;
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"product_name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "car",
"doc_count": 2,
"product_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "retail",
"doc_count": 2,
"SUM_0": {
"value":8
}
}
]
}
},
{
"key": "bike",
"doc_count": 2,
"product_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "retail",
"doc_count": 2,
"SUM_0": {
"value": 6
}
}
]
}
}
]
}
}
}
但我实际得到的是下面的输出: 文档已成功聚合,但按 sum(qty) 的排序没有生效。
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"product_name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "bike",
"doc_count": 2,
"product_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "retail",
"doc_count": 2,
"SUM_0": {
"value": 6
}
}
]
}
},
{
"key": "car",
"doc_count": 2,
"product_type": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "retail",
"doc_count": 2,
"SUM_0": {
"value":8
}
}
]
}
}
]
}
}
}
以下摘自 Elasticsearch 官方文档:
The bucket_sort aggregation, like all pipeline aggregations, is executed after all other non-pipeline aggregations. This means the sorting only applies to whatever buckets are already returned from the parent aggregation. For example, if the parent aggregation is terms and its size is set to 10, the bucket_sort will only sort over those 10 returned term buckets.
以上就是您的查询未给出预期结果的原因。
由于您按 col1、col2 对数据进行分组(即使用了两层 terms 聚合),而 bucket_sort 放在内层 col2 聚合之下,它只会对每个 col1 桶内部的 col2 桶排序,不会改变外层 col1 桶的顺序,因此基于 sum 聚合的排序结果不符合预期。
您需要使用 max bucket 聚合: 这是一个兄弟(sibling)管道聚合,它会在兄弟聚合中找出指定指标值最大的桶,并输出该最大值以及对应桶的键。
然后再基于该结果执行 bucket sort 聚合。
下面给出一个可运行的示例,包括索引数据(与问题中使用的相同)、搜索查询和搜索结果。
搜索查询:
{
"size": 0,
"aggs": {
"agg1": {
"terms": {
"field": "product_name.keyword"
},
"aggs": {
"agg2": {
"terms": {
"field": "product_type.keyword"
},
"aggregations": {
"SUM_0": {
"sum": {
"field": "qty"
}
}
}
},
"sum_max_bucket": {
"max_bucket": {
"buckets_path": "agg2>SUM_0" <-- note this
}
},
"sum_bucket_sort": {
"bucket_sort": {
"sort": {
"sum_max_bucket": {
"order": "desc"
}
}
}
}
}
}
}
}
搜索结果:
"aggregations": {
"agg1": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "car",
"doc_count": 2,
"agg2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "retail",
"doc_count": 2,
"SUM_0": {
"value": 8.0 <-- note this
}
}
]
},
"sum_max_bucket": {
"value": 8.0,
"keys": [
"retail"
]
}
},
{
"key": "bike",
"doc_count": 2,
"agg2": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "retail",
"doc_count": 2,
"SUM_0": {
"value": 6.0 <-- note this
}
}
]
},
"sum_max_bucket": {
"value": 6.0,
"keys": [
"retail"
]
}
}
]
}