ElasticSearch 过滤聚合而不影响聚合计数
ElasticSearch filter on aggregations without affecting aggregation counts
我们使用 ElasticSearch 根据 5 个字段查找报价,例如一些 'free text'、报价状态和客户名称。我们还需要聚合客户名称和报价状态这两个字段。因此,当有人输入一些自由文本时,我们发现有 10 个状态为关闭的文档和 8 个状态为打开的文档时,'state filter' 应该包含 closed(10) 和 open(8)。
现在的问题是,当我选择把状态 'closed' 加入过滤器时,open 的聚合计数变成了 0,而我希望它保持为 8。那么,我该如何防止针对聚合字段的过滤器影响聚合本身?
这是第一个查询,搜索 'java':
{
"query": {
"bool": {
"filter": [
],
"must": {
"simple_query_string": {
"query" : "java"
}
}
}
},
"aggs": {
"OFFER_STATE_F": {
"terms": {
"size": 0,
"field": "offer_state_f",
"min_doc_count": 0
}
}
},
"from": 0,
"size": 1,
"fields": ["offer_id_ft", "offer_state_f"]
}
结果是这样的:
{
"hits": {
"total": 960,
"max_score": 0.89408284000000005,
"hits": [
{
"_type": "offer",
"_index": "select",
"_id": "40542",
"fields": {
"offer_id_ft": [
"40542"
],
"offer_state_f": [
"REJECTED"
]
},
"_score": 0.89408284000000005
}
]
},
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"timed_out": false,
"aggregations": {
"OFFER_STATE_F": {
"buckets": [
{
"key": "REJECTED",
"doc_count": 778
},
{
"key": "ACCEPTED",
"doc_count": 130
},
{
"key": "CANCELED",
"doc_count": 22
},
{
"key": "WITHDRAWN",
"doc_count": 13
},
{
"key": "LONGLIST",
"doc_count": 12
},
{
"key": "SHORTLIST",
"doc_count": 5
},
{
"key": "INTAKE",
"doc_count": 0
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
},
"took": 2
}
如您所见,client_state_f 各桶的计数之和等于总命中数 (960)。现在,我在查询中加入其中一个状态,比如 'ACCEPTED'。于是我的查询变成:
{
"query": {
"bool": {
"filter": [
{
"bool": {
"should": [
{
"term": {
"offer_state_f": "ACCEPTED"
}
}
]
}
}
],
"must": {
"simple_query_string": {
"query" : "java"
}
}
}
},
"aggs": {
"OFFER_STATE_F": {
"terms": {
"size": 0,
"field": "offer_state_f",
"min_doc_count": 0
}
}
},
"from": 0,
"size": 1,
"fields": ["offer_id_ft", "offer_state_f"]
}
我想要的是命中 130 条结果,同时 client_state_f 各桶的计数之和仍然是 960。但我实际得到的是:
{
"hits": {
"total": 130,
"max_score": 0.89408284000000005,
"hits": [
{
"_type": "offer",
"_index": "select",
"_id": "16884",
"fields": {
"offer_id_ft": [
"16884"
],
"offer_state_f": [
"ACCEPTED"
]
},
"_score": 0.89408284000000005
}
]
},
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"timed_out": false,
"aggregations": {
"OFFER_STATE_F": {
"buckets": [
{
"key": "ACCEPTED",
"doc_count": 130
},
{
"key": "CANCELED",
"doc_count": 0
},
{
"key": "INTAKE",
"doc_count": 0
},
{
"key": "LONGLIST",
"doc_count": 0
},
{
"key": "REJECTED",
"doc_count": 0
},
{
"key": "SHORTLIST",
"doc_count": 0
},
{
"key": "WITHDRAWN",
"doc_count": 0
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
},
"took": 10
}
可以看到,只有 ACCEPTED 桶有计数,其余桶的计数都是 0。
您需要将过滤器移至 post_filter 部分,而不是放在 query 部分。
这样,将在计算聚合后应用过滤,您将能够聚合整个数据集,但只会获得与您的过滤器匹配的结果。
好的,我在同事的帮助下找到了答案。事实上,Val 是对的,给他 +1。我之前的做法是把所有的查询过滤器都放进了 post_filter,这正是问题所在。其实只需要把针对要聚合字段的过滤器放进 post_filter 即可。因此:
{
"query": {
"bool": {
"filter": [
{
"term": {
"broker_f": "false"
}
}
],
"must": {
"simple_query_string": {
"query" : "java"
}
}
}
},
"aggs": {
"OFFER_STATE_F": {
"terms": {
"size": 0,
"field": "offer_state_f",
"min_doc_count": 0
}
}
},
"post_filter" : {
"bool": {
"should": [
{
"term": {
"offer_state_f": "SHORTLIST"
}
}
]
}
},
"from": 0,
"size": 1,
"fields": ["offer_id_ft", "offer_state_f"]
}
现在结果是正确的:
{
"hits": {
"total": 5,
"max_score": 0.76667790000000002,
"hits": [
{
"_type": "offer",
"_index": "select",
"_id": "24454",
"fields": {
"offer_id_ft": [
"24454"
],
"offer_state_f": [
"SHORTLIST"
]
},
"_score": 0.76667790000000002
}
]
},
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"timed_out": false,
"aggregations": {
"OFFER_STATE_F": {
"buckets": [
{
"key": "REJECTED",
"doc_count": 777
},
{
"key": "ACCEPTED",
"doc_count": 52
},
{
"key": "CANCELED",
"doc_count": 22
},
{
"key": "LONGLIST",
"doc_count": 12
},
{
"key": "WITHDRAWN",
"doc_count": 12
},
{
"key": "SHORTLIST",
"doc_count": 5
},
{
"key": "INTAKE",
"doc_count": 0
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
},
"took": 4
}
我们使用 ElasticSearch 根据 5 个字段查找报价,例如一些 'free text'、报价状态和客户名称。我们还需要聚合客户名称和报价状态这两个字段。因此,当有人输入一些自由文本时,我们发现有 10 个状态为关闭的文档和 8 个状态为打开的文档时,'state filter' 应该包含 closed(10) 和 open(8)。
现在的问题是,当我选择把状态 'closed' 加入过滤器时,open 的聚合计数变成了 0,而我希望它保持为 8。那么,我该如何防止针对聚合字段的过滤器影响聚合本身?
这是第一个查询,搜索 'java':
{
"query": {
"bool": {
"filter": [
],
"must": {
"simple_query_string": {
"query" : "java"
}
}
}
},
"aggs": {
"OFFER_STATE_F": {
"terms": {
"size": 0,
"field": "offer_state_f",
"min_doc_count": 0
}
}
},
"from": 0,
"size": 1,
"fields": ["offer_id_ft", "offer_state_f"]
}
结果是这样的:
{
"hits": {
"total": 960,
"max_score": 0.89408284000000005,
"hits": [
{
"_type": "offer",
"_index": "select",
"_id": "40542",
"fields": {
"offer_id_ft": [
"40542"
],
"offer_state_f": [
"REJECTED"
]
},
"_score": 0.89408284000000005
}
]
},
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"timed_out": false,
"aggregations": {
"OFFER_STATE_F": {
"buckets": [
{
"key": "REJECTED",
"doc_count": 778
},
{
"key": "ACCEPTED",
"doc_count": 130
},
{
"key": "CANCELED",
"doc_count": 22
},
{
"key": "WITHDRAWN",
"doc_count": 13
},
{
"key": "LONGLIST",
"doc_count": 12
},
{
"key": "SHORTLIST",
"doc_count": 5
},
{
"key": "INTAKE",
"doc_count": 0
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
},
"took": 2
}
如您所见,client_state_f 各桶的计数之和等于总命中数 (960)。现在,我在查询中加入其中一个状态,比如 'ACCEPTED'。于是我的查询变成:
{
"query": {
"bool": {
"filter": [
{
"bool": {
"should": [
{
"term": {
"offer_state_f": "ACCEPTED"
}
}
]
}
}
],
"must": {
"simple_query_string": {
"query" : "java"
}
}
}
},
"aggs": {
"OFFER_STATE_F": {
"terms": {
"size": 0,
"field": "offer_state_f",
"min_doc_count": 0
}
}
},
"from": 0,
"size": 1,
"fields": ["offer_id_ft", "offer_state_f"]
}
我想要的是命中 130 条结果,同时 client_state_f 各桶的计数之和仍然是 960。但我实际得到的是:
{
"hits": {
"total": 130,
"max_score": 0.89408284000000005,
"hits": [
{
"_type": "offer",
"_index": "select",
"_id": "16884",
"fields": {
"offer_id_ft": [
"16884"
],
"offer_state_f": [
"ACCEPTED"
]
},
"_score": 0.89408284000000005
}
]
},
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"timed_out": false,
"aggregations": {
"OFFER_STATE_F": {
"buckets": [
{
"key": "ACCEPTED",
"doc_count": 130
},
{
"key": "CANCELED",
"doc_count": 0
},
{
"key": "INTAKE",
"doc_count": 0
},
{
"key": "LONGLIST",
"doc_count": 0
},
{
"key": "REJECTED",
"doc_count": 0
},
{
"key": "SHORTLIST",
"doc_count": 0
},
{
"key": "WITHDRAWN",
"doc_count": 0
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
},
"took": 10
}
可以看到,只有 ACCEPTED 桶有计数,其余桶的计数都是 0。
您需要将过滤器移至 post_filter 部分,而不是放在 query 部分。
这样,将在计算聚合后应用过滤,您将能够聚合整个数据集,但只会获得与您的过滤器匹配的结果。
好的,我在同事的帮助下找到了答案。事实上,Val 是对的,给他 +1。我之前的做法是把所有的查询过滤器都放进了 post_filter,这正是问题所在。其实只需要把针对要聚合字段的过滤器放进 post_filter 即可。因此:
{
"query": {
"bool": {
"filter": [
{
"term": {
"broker_f": "false"
}
}
],
"must": {
"simple_query_string": {
"query" : "java"
}
}
}
},
"aggs": {
"OFFER_STATE_F": {
"terms": {
"size": 0,
"field": "offer_state_f",
"min_doc_count": 0
}
}
},
"post_filter" : {
"bool": {
"should": [
{
"term": {
"offer_state_f": "SHORTLIST"
}
}
]
}
},
"from": 0,
"size": 1,
"fields": ["offer_id_ft", "offer_state_f"]
}
现在结果是正确的:
{
"hits": {
"total": 5,
"max_score": 0.76667790000000002,
"hits": [
{
"_type": "offer",
"_index": "select",
"_id": "24454",
"fields": {
"offer_id_ft": [
"24454"
],
"offer_state_f": [
"SHORTLIST"
]
},
"_score": 0.76667790000000002
}
]
},
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"timed_out": false,
"aggregations": {
"OFFER_STATE_F": {
"buckets": [
{
"key": "REJECTED",
"doc_count": 777
},
{
"key": "ACCEPTED",
"doc_count": 52
},
{
"key": "CANCELED",
"doc_count": 22
},
{
"key": "LONGLIST",
"doc_count": 12
},
{
"key": "WITHDRAWN",
"doc_count": 12
},
{
"key": "SHORTLIST",
"doc_count": 5
},
{
"key": "INTAKE",
"doc_count": 0
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
},
"took": 4
}