查找重复项时指定 Elasticsearch 聚合字段
Specify Elasticsearch aggregation fields when finding duplicates
我在查找重复项时使用以下 ES 查询:
"aggs": {
"duplicates": {
"terms": {
"field": "phone",
"min_doc_count": 2,
"size": 99999,
"order": {
"_term": "asc"
}
},
"aggs": {
"_docs": {
"top_hits": {
"size": 99999
}
}
}
}
}
它运行良好:它返回键(在本例中是 phone),并在每个键下返回所有匹配的文档。主要问题在于,_source 会带回全部字段,而我的文档字段很多,我只想返回我需要的那些字段。返回内容示例:
"duplicates": {
"1": {
"key": "1",
"doc_count": 2,
"_docs": {
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "local:company_id:1:sync",
"_type": "leads",
"_id": "23",
"_score": 1,
"_source": {
"id": 23,
"phone": 123456,
"areacode_id": 426,
"areacode_state_id": 2,
"firstName": "Brayan",
"lastName": "Rastelli",
"state": "", // .... and so on
我想指定 _source 中返回的字段,可以吗?
我遇到的另一个问题是,我想按特定字段(例如 id)对聚合结果进行排序,但如果把 _term 换成任何字段名称,它就会报错。
谢谢!
在下面的示例中,id 为 29 和 23 的文档具有相同的 phone,因此它们是重复项。搜索查询只返回两个字段,即 id 和 phone(您可以根据自己的需要更改这些字段),并根据 id 对 top_hits 的结果进行排序。
下面是一个包含索引数据、搜索查询和搜索结果的可运行示例。
索引数据:
{
"id": 29,
"phone": 123456,
"areacode_id": 426,
"areacode_state_id": 2,
"firstName": "Brayan",
"lastName": "Rastelli",
"state": ""
}
{
"id": 23,
"phone": 123456,
"areacode_id": 426,
"areacode_state_id": 2,
"firstName": "Brayan",
"lastName": "Rastelli",
"state": ""
}
{
"id": 30,
"phone": 1235,
"areacode_id": 92,
"areacode_state_id": 10,
"firstName": "Mark",
"lastName": "Smith",
"state": ""
}
搜索查询:
{
"size": 0,
"aggs": {
"duplicates": {
"terms": {
"field": "phone",
"min_doc_count": 2,
"size": 99999
},
"aggs": {
"_docs": {
"top_hits": {
"_source": {
"includes": [
"phone",
"id"
]
},
"sort": [
{
"id": {
"order": "asc"
}
}
]
}
}
}
}
}
}
搜索结果:
"aggregations": {
"duplicates": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 123456,
"doc_count": 2,
"_docs": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "66896259",
"_type": "_doc",
"_id": "1",
"_score": null,
"_source": {
"phone": 123456,
"id": 23
},
"sort": [
23 // note this
]
},
{
"_index": "66896259",
"_type": "_doc",
"_id": "2",
"_score": null,
"_source": {
"phone": 123456,
"id": 29
},
"sort": [
29 // note this
]
}
]
}
}
}
]
}
}
我在查找重复项时使用以下 ES 查询:
"aggs": {
"duplicates": {
"terms": {
"field": "phone",
"min_doc_count": 2,
"size": 99999,
"order": {
"_term": "asc"
}
},
"aggs": {
"_docs": {
"top_hits": {
"size": 99999
}
}
}
}
}
它运行良好:它返回键(在本例中是 phone),并在每个键下返回所有匹配的文档。主要问题在于,_source 会带回全部字段,而我的文档字段很多,我只想返回我需要的那些字段。返回内容示例:
"duplicates": {
"1": {
"key": "1",
"doc_count": 2,
"_docs": {
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "local:company_id:1:sync",
"_type": "leads",
"_id": "23",
"_score": 1,
"_source": {
"id": 23,
"phone": 123456,
"areacode_id": 426,
"areacode_state_id": 2,
"firstName": "Brayan",
"lastName": "Rastelli",
"state": "", // .... and so on
我想指定 _source 中返回的字段,可以吗?
我遇到的另一个问题是,我想按特定字段(例如 id)对聚合结果进行排序,但如果把 _term 换成任何字段名称,它就会报错。
谢谢!
在下面的示例中,id 为 29 和 23 的文档具有相同的 phone,因此它们是重复项。搜索查询只返回两个字段,即 id 和 phone(您可以根据自己的需要更改这些字段),并根据 id 对 top_hits 的结果进行排序。
下面是一个包含索引数据、搜索查询和搜索结果的可运行示例。
索引数据:
{
"id": 29,
"phone": 123456,
"areacode_id": 426,
"areacode_state_id": 2,
"firstName": "Brayan",
"lastName": "Rastelli",
"state": ""
}
{
"id": 23,
"phone": 123456,
"areacode_id": 426,
"areacode_state_id": 2,
"firstName": "Brayan",
"lastName": "Rastelli",
"state": ""
}
{
"id": 30,
"phone": 1235,
"areacode_id": 92,
"areacode_state_id": 10,
"firstName": "Mark",
"lastName": "Smith",
"state": ""
}
搜索查询:
{
"size": 0,
"aggs": {
"duplicates": {
"terms": {
"field": "phone",
"min_doc_count": 2,
"size": 99999
},
"aggs": {
"_docs": {
"top_hits": {
"_source": {
"includes": [
"phone",
"id"
]
},
"sort": [
{
"id": {
"order": "asc"
}
}
]
}
}
}
}
}
}
搜索结果:
"aggregations": {
"duplicates": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 123456,
"doc_count": 2,
"_docs": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "66896259",
"_type": "_doc",
"_id": "1",
"_score": null,
"_source": {
"phone": 123456,
"id": 23
},
"sort": [
23 // note this
]
},
{
"_index": "66896259",
"_type": "_doc",
"_id": "2",
"_score": null,
"_source": {
"phone": 123456,
"id": 29
},
"sort": [
29 // note this
]
}
]
}
}
}
]
}
}