如何按字段过滤 top_hits 聚合

How to filter by field a top_hits aggregation

我在为一个包含 6 亿个文档的超大索引编写查询时遇到了一些问题。我已经接近解决,但还是卡住了。

我的文档结构如下:

{
    "first_name" : "John",
    "last_name" : "Doe",
    "company_domain" : "google",
    "provider_a_id" : "1234",
    "provider_b_id" : "14"
}

我需要为每个公司返回 2 个联系人,并且这些联系人的 provider_a_id 必须出现在我事先获得的 ID 列表中。

我已经写出了下面这个聚合,它会为每个公司返回 2 个联系人:

{
  "size": 0,
  "aggs": {
    "COMPANIES": {
      "terms": {
        "field": "company_domain.keyword",
        "order": { "_key": "asc" }, 
        "size": 2
      },
      "aggs": {
        "EMPLOYEES": {
          "top_hits": {
            "size": 2
          }
        }
      }
    }
  }
}

这很好,因为它解决了问题的一部分;但我现在还需要用 provider_a_id 来缩小搜索范围,也就是需要实现类似下面的效果:

        "EMPLOYEES": {
          "top_hits": {
            "size": 2
            // provider_a_id is in [.......] // list with 10K Ids
          }
        }

你知道我该如何解决这个问题吗?

您需要在 top_hits 之前使用 filter(过滤器)聚合。按单个值过滤可以使用 term 查询;要按 ID 列表过滤,则在 filter 聚合中使用 terms 查询,如下面的示例所示。

映射

PUT testindex7/_mappings
{
  "properties": {
    "first_name" :{
      "type": "text"
    },
    "last_name" : {
      "type": "text"
    },
    "company_domain" :{
      "type": "text",
      "fields": {
         "keyword":{
           "type": "keyword"
         }  
      }
    },
    "provider_a_id" : {
      "type": "integer"
    },
    "provider_b_id" : {
      "type": "integer"
    }
  }
}

数据:

 [
      {
        "_index" : "testindex7",
        "_type" : "_doc",
        "_id" : "OvU4OG0BCNyxVsPT3Xtn",
        "_score" : 1.0,
        "_source" : {
          "first_name" : "a",
          "last_name" : "b",
          "company_domain" : "google",
          "provider_a_id" : "100",
          "provider_b_id" : "1"
        }
      },
      {
        "_index" : "testindex7",
        "_type" : "_doc",
        "_id" : "O_U5OG0BCNyxVsPTAHsD",
        "_score" : 1.0,
        "_source" : {
          "first_name" : "c",
          "last_name" : "d",
          "company_domain" : "google",
          "provider_a_id" : "101",
          "provider_b_id" : "2"
        }
      },
      {
        "_index" : "testindex7",
        "_type" : "_doc",
        "_id" : "PPU5OG0BCNyxVsPTJ3tZ",
        "_score" : 1.0,
        "_source" : {
          "first_name" : "e",
          "last_name" : "f",
          "company_domain" : "google",
          "provider_a_id" : "102",
          "provider_b_id" : "3"
        }
      }
    ]

查询:

GET testindex7/_search
{
  "size": 0,
  "aggs": {
    "COMPANIES": {
      "terms": {
        "field": "company_domain.keyword",
        "order": {
          "_key": "asc"
        },
        "size": 2
      },
      "aggs": {
        "EMPLOYEES": {
          "filter": { 
            "terms": {
              "provider_a_id": [100,101]
            }
          },
          "aggs": {
            "top_emps": {
              "top_hits": {
                "size": 2
              }
            }
          }
        }
      }
    }
  }
}

结果:

"aggregations" : {
    "COMPANIES" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "google",
          "doc_count" : 3,
          "EMPLOYEES" : {
            "doc_count" : 2,
            "top_emps" : {
              "hits" : {
                "total" : {
                  "value" : 2,
                  "relation" : "eq"
                },
                "max_score" : 1.0,
                "hits" : [
                  {
                    "_index" : "testindex7",
                    "_type" : "_doc",
                    "_id" : "OvU4OG0BCNyxVsPT3Xtn",
                    "_score" : 1.0,
                    "_source" : {
                      "first_name" : "a",
                      "last_name" : "b",
                      "company_domain" : "google",
                      "provider_a_id" : "100",
                      "provider_b_id" : "1"
                    }
                  },
                  {
                    "_index" : "testindex7",
                    "_type" : "_doc",
                    "_id" : "O_U5OG0BCNyxVsPTAHsD",
                    "_score" : 1.0,
                    "_source" : {
                      "first_name" : "c",
                      "last_name" : "d",
                      "company_domain" : "google",
                      "provider_a_id" : "101",
                      "provider_b_id" : "2"
                    }
                  }
                ]
              }
            }
          }
        }
      ]
    }
  }

将 query 与 aggs 结合使用(先用 query 过滤文档,再对匹配的文档做聚合):

"query":{
        "term":{
          "provider_a_id":"1234"        
        }
    },
"aggs": {
    "COMPANIES": {
      "terms": {
        "field": "company_domain.keyword",
        "order": { "_key": "asc" }, 
        "size": 2
      },
      "aggs": {
        "EMPLOYEES": {
          "top_hits": {
            "size": 2
          }
        }
      }
    }
  }