ElasticSearch 过滤聚合而不影响聚合计数

Question

我们使用 ElasticSearch 根据 5 个字段查找报价，例如一些自由文本（free text）、报价状态和客户名称。我们还需要对客户名称和报价状态这两个字段做聚合。因此，当有人输入一些自由文本，并且我们找到 10 个状态为 closed 的文档和 8 个状态为 open 的文档时，'state filter' 应该包含 closed(10) 和 open(8)。

现在的问题是，当我选择将状态 'closed' 包含在过滤器中时，open 的聚合结果变为 0。我希望它保持为 8。那么，我如何才能防止作用于聚合字段的过滤器影响聚合本身？

这是第一个查询，搜索 'java':

{
    "query": {
        "bool": {
            "filter": [
            ],
            "must": {
                "simple_query_string": {
                    "query" : "java"
                }
            }
        }
    },
    "aggs": {
        "OFFER_STATE_F": {
            "terms": {
                "size": 0,
                "field": "offer_state_f",
                "min_doc_count": 0
            }
        }
    },
    "from": 0,
    "size": 1,
    "fields": ["offer_id_ft", "offer_state_f"]
}

结果是这样的：

{
  "hits": {
    "total": 960,
    "max_score": 0.89408284000000005,
    "hits": [
      {
        "_type": "offer",
        "_index": "select",
        "_id": "40542",
        "fields": {
          "offer_id_ft": [
            "40542"
          ],
          "offer_state_f": [
            "REJECTED"
          ]
        },
        "_score": 0.89408284000000005
      }
    ]
  },
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "timed_out": false,
  "aggregations": {
    "OFFER_STATE_F": {
      "buckets": [
        {
          "key": "REJECTED",
          "doc_count": 778
        },
        {
          "key": "ACCEPTED",
          "doc_count": 130
        },
        {
          "key": "CANCELED",
          "doc_count": 22
        },
        {
          "key": "WITHDRAWN",
          "doc_count": 13
        },
        {
          "key": "LONGLIST",
          "doc_count": 12
        },
        {
          "key": "SHORTLIST",
          "doc_count": 5
        },
        {
          "key": "INTAKE",
          "doc_count": 0
        }
      ],
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0
    }
  },
  "took": 2
}

如您所见，offer_state_f 各桶的总和等于总命中数 (960)。现在，我在查询中包含其中一个状态，比如 'ACCEPTED'。所以我的查询变成：

{
    "query": {
        "bool": {
            "filter": [
                {
                    "bool": {
                        "should": [
                            {
                                "term": {
                                    "offer_state_f": "ACCEPTED"
                                }
                            }
                        ]
                    }
                }            
            ],
            "must": {
                "simple_query_string": {
                    "query" : "java"
                }
            }
        }
    },
    "aggs": {
        "OFFER_STATE_F": {
            "terms": {
                "size": 0,
                "field": "offer_state_f",
                "min_doc_count": 0
            }
        }
    },
    "from": 0,
    "size": 1,
    "fields": ["offer_id_ft", "offer_state_f"]
}

我想要的是返回 130 条结果，同时 offer_state_f 各桶的总和仍然是 960。但是我得到的是：

{
  "hits": {
    "total": 130,
    "max_score": 0.89408284000000005,
    "hits": [
      {
        "_type": "offer",
        "_index": "select",
        "_id": "16884",
        "fields": {
          "offer_id_ft": [
            "16884"
          ],
          "offer_state_f": [
            "ACCEPTED"
          ]
        },
        "_score": 0.89408284000000005
      }
    ]
  },
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "timed_out": false,
  "aggregations": {
    "OFFER_STATE_F": {
      "buckets": [
        {
          "key": "ACCEPTED",
          "doc_count": 130
        },
        {
          "key": "CANCELED",
          "doc_count": 0
        },
        {
          "key": "INTAKE",
          "doc_count": 0
        },
        {
          "key": "LONGLIST",
          "doc_count": 0
        },
        {
          "key": "REJECTED",
          "doc_count": 0
        },
        {
          "key": "SHORTLIST",
          "doc_count": 0
        },
        {
          "key": "WITHDRAWN",
          "doc_count": 0
        }
      ],
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0
    }
  },
  "took": 10
}

可以看到，只有 ACCEPTED 桶有计数，其他桶都是 0。

Answer 1

您需要将过滤器移至 post_filter 部分而不是 query 部分。

这样，将在计算聚合后应用过滤，您将能够聚合整个数据集，但只会获得与您的过滤器匹配的结果。

Answer 2

好的，我在同事的帮助下找到了答案。事实上，Val 是对的，给他 +1。我之前把所有的查询过滤器都放进了 post_filter，这正是问题所在。其实只需要把针对要聚合的字段的过滤器放进 post_filter，其余过滤器仍保留在 query 中。因此：

{
    "query": {
        "bool": {
            "filter": [
            {
                "term": {
                    "broker_f": "false"
                }
            }
            ],
            "must": {
                "simple_query_string": {
                    "query" : "java"
                }
            }
        }
    },
    "aggs": {
        "OFFER_STATE_F": {
            "terms": {
                "size": 0,
                "field": "offer_state_f",
                "min_doc_count": 0
            }
        }
    },
    "post_filter" : {
        "bool": {
            "should": [
                {
                    "term": {
                        "offer_state_f": "SHORTLIST"
                    }
                }
            ]
        }
    },
    "from": 0,
    "size": 1,
    "fields": ["offer_id_ft", "offer_state_f"]
}

现在结果是正确的：

{
  "hits": {
    "total": 5,
    "max_score": 0.76667790000000002,
    "hits": [
      {
        "_type": "offer",
        "_index": "select",
        "_id": "24454",
        "fields": {
          "offer_id_ft": [
            "24454"
          ],
          "offer_state_f": [
            "SHORTLIST"
          ]
        },
        "_score": 0.76667790000000002
      }
    ]
  },
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "timed_out": false,
  "aggregations": {
    "OFFER_STATE_F": {
      "buckets": [
        {
          "key": "REJECTED",
          "doc_count": 777
        },
        {
          "key": "ACCEPTED",
          "doc_count": 52
        },
        {
          "key": "CANCELED",
          "doc_count": 22
        },
        {
          "key": "LONGLIST",
          "doc_count": 12
        },
        {
          "key": "WITHDRAWN",
          "doc_count": 12
        },
        {
          "key": "SHORTLIST",
          "doc_count": 5
        },
        {
          "key": "INTAKE",
          "doc_count": 0
        }
      ],
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0
    }
  },
  "took": 4
}

ElasticSearch 过滤聚合而不影响聚合计数

ElasticSearch filter on aggregations without affecting aggregation counts

filter

aggregation

elasticsearch