Elasticsearch 排序脚本仅对少数文档无法按预期工作
Elasticsearch sort script not working as expected for few documents only
考虑这样一个查询:
{
"size": 200,
"query": {
"bool" : {
....
}
},
"sort": {
"_script" : {
"script" : {
"source" : "params._source.participants[0].participantEmail",
"lang" : "painless"
},
"type" : "string",
"order" : "desc"
}
}
}
此查询对几乎所有文档都能正确排序,但其中少数文档没有排在正确的位置。这怎么可能?
最后几个文档的顺序是这样的(这里显示的是每个文档 participants 数组的第一项):
shiend@....
denys@...
Lynn@...
这怎么可能?我毫无头绪。是排序查询写错了吗?
设置:
"myindex" : {
"settings" : {
"index" : {
"refresh_interval" : "30s",
"number_of_shards" : "5",
"provided_name" : "myindex",
"creation_date" : "1600703588497",
"analysis" : {
"filter" : {
"english_keywords" : {
"keywords" : [
"example"
],
"type" : "keyword_marker"
},
"english_stemmer" : {
"type" : "stemmer",
"language" : "english"
},
"synonym" : {
"type" : "synonym",
"synonyms_path" : "analysis/UK_US_Sync_2.csv",
"updateable" : "true"
},
"english_possessive_stemmer" : {
"type" : "stemmer",
"language" : "possessive_english"
},
"english_stop" : {
"type" : "stop",
"stopwords" : "_english_"
},
"my_katakana_stemmer" : {
"type" : "kuromoji_stemmer",
"minimum_length" : "4"
}
},
"normalizer" : {
"custom_normalizer" : {
"filter" : [
"lowercase",
"asciifolding"
],
"type" : "custom",
"char_filter" : [ ]
}
},
"analyzer" : {
"somevar_english" : {
"filter" : [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_keywords",
"english_stemmer",
"asciifolding",
"synonym"
],
"tokenizer" : "standard"
},
"myvar_chinese" : {
"filter" : [
"porter_stem"
],
"tokenizer" : "smartcn_tokenizer"
},
"myvar" : {
"filter" : [
"my_katakana_stemmer"
],
"tokenizer" : "kuromoji_tokenizer"
}
}
},
"number_of_replicas" : "1",
"uuid" : "d0LlBVqIQGSk4afEWFD",
"version" : {
"created" : "6081099",
"upgraded" : "6081299"
}
}
}
}
映射:
{
"myindex": {
"mappings": {
"doc": {
"dynamic_date_formats": [
"yyyy-MM-dd HH:mm:ss.SSS"
],
"properties": {
"all_fields": {
"type": "text"
},
"participants": {
"type": "nested",
"include_in_root": true,
"properties": {
"participantEmail": {
"type": "keyword",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "custom_normalizer"
}
},
"copy_to": [
"all_fields"
]
},
"participantType": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "custom_normalizer"
}
},
"copy_to": [
"all_fields"
]
}
}
}
}
}
}
}
}
编辑:可能是因为电子邮件 Lynn@.. 以大写字母开头?
确实如此。字符串是按字典序(即字符编码顺序)排序的,大写字母排在小写字母之前(降序时则相反),因此在降序结果中 Lynn@... 会排在所有以小写字母开头的电子邮件之后。
您可以在脚本中先将电子邮件统一转换为小写,再进行排序:
"sort": {
"_script" : {
"script" : {
"source" : "params._source.participants[0].participantEmail.toLowerCase()",
"lang" : "painless"
},
"type" : "string",
"order" : "desc"
}
}
考虑这样一个查询:
{
"size": 200,
"query": {
"bool" : {
....
}
},
"sort": {
"_script" : {
"script" : {
"source" : "params._source.participants[0].participantEmail",
"lang" : "painless"
},
"type" : "string",
"order" : "desc"
}
}
}
此查询对几乎所有文档都能正确排序,但其中少数文档没有排在正确的位置。这怎么可能?
最后几个文档的顺序是这样的(这里显示的是每个文档 participants 数组的第一项):
shiend@....
denys@...
Lynn@...
这怎么可能?我毫无头绪。是排序查询写错了吗?
设置:
"myindex" : {
"settings" : {
"index" : {
"refresh_interval" : "30s",
"number_of_shards" : "5",
"provided_name" : "myindex",
"creation_date" : "1600703588497",
"analysis" : {
"filter" : {
"english_keywords" : {
"keywords" : [
"example"
],
"type" : "keyword_marker"
},
"english_stemmer" : {
"type" : "stemmer",
"language" : "english"
},
"synonym" : {
"type" : "synonym",
"synonyms_path" : "analysis/UK_US_Sync_2.csv",
"updateable" : "true"
},
"english_possessive_stemmer" : {
"type" : "stemmer",
"language" : "possessive_english"
},
"english_stop" : {
"type" : "stop",
"stopwords" : "_english_"
},
"my_katakana_stemmer" : {
"type" : "kuromoji_stemmer",
"minimum_length" : "4"
}
},
"normalizer" : {
"custom_normalizer" : {
"filter" : [
"lowercase",
"asciifolding"
],
"type" : "custom",
"char_filter" : [ ]
}
},
"analyzer" : {
"somevar_english" : {
"filter" : [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_keywords",
"english_stemmer",
"asciifolding",
"synonym"
],
"tokenizer" : "standard"
},
"myvar_chinese" : {
"filter" : [
"porter_stem"
],
"tokenizer" : "smartcn_tokenizer"
},
"myvar" : {
"filter" : [
"my_katakana_stemmer"
],
"tokenizer" : "kuromoji_tokenizer"
}
}
},
"number_of_replicas" : "1",
"uuid" : "d0LlBVqIQGSk4afEWFD",
"version" : {
"created" : "6081099",
"upgraded" : "6081299"
}
}
}
}
映射:
{
"myindex": {
"mappings": {
"doc": {
"dynamic_date_formats": [
"yyyy-MM-dd HH:mm:ss.SSS"
],
"properties": {
"all_fields": {
"type": "text"
},
"participants": {
"type": "nested",
"include_in_root": true,
"properties": {
"participantEmail": {
"type": "keyword",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "custom_normalizer"
}
},
"copy_to": [
"all_fields"
]
},
"participantType": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
"normalizer": "custom_normalizer"
}
},
"copy_to": [
"all_fields"
]
}
}
}
}
}
}
}
}
编辑:可能是因为电子邮件 Lynn@.. 以大写字母开头?
确实如此。字符串是按字典序(即字符编码顺序)排序的,大写字母排在小写字母之前(降序时则相反),因此在降序结果中 Lynn@... 会排在所有以小写字母开头的电子邮件之后。
您可以在脚本中先将电子邮件统一转换为小写,再进行排序:
"sort": {
"_script" : {
"script" : {
"source" : "params._source.participants[0].participantEmail.toLowerCase()",
"lang" : "painless"
},
"type" : "string",
"order" : "desc"
}
}