ElasticSearch - 带过滤器的自定义分析器 - 未应用过滤器

ElasticSearch - custom analyzer with filters - filters not applied

我有以下查询:

GET /nameofmyindex/_analyze
{
  "text" : "Limousinetesting",
  "explain": true,
  "analyzer": "default"
}

该查询返回如下结果:

{
  "detail" : {
    "custom_analyzer" : true,
    "charfilters" : [ ],
    "tokenizer" : {
      "name" : "standard",
      "tokens" : [
        {
          "token" : "Limousinetesting",
          "start_offset" : 0,
          "end_offset" : 16,
          "type" : "<ALPHANUM>",
          "position" : 0,
          "bytes" : "[4c 69 6d 6f 75 73 69 6e 65 74 65 73 74 69 6e 67]",
          "positionLength" : 1,
          "termFrequency" : 1
        }
      ]
    },
    "tokenfilters" : [ ]
  }
}

我的索引配置如下所示:

{
   "nameofmyindex":{
      "aliases":{

      },
      "mappings":{
         "properties":{
            "author":{
               "type":"integer"
            },
            "body:value":{
               "type":"text",
               "fields":{
                  "keyword":{
                     "type":"keyword",
                     "ignore_above":256
                  }
               }
            },
            "changed":{
               "type":"date",
               "format":"epoch_second"
            },
            "created":{
               "type":"date",
               "format":"epoch_second"
            },
            "id":{
               "type":"keyword"
            },
            "promote":{
               "type":"boolean"
            },
            "search_api_language":{
               "type":"keyword"
            },
            "sticky":{
               "type":"boolean"
            },
            "title":{
               "type":"text",
               "boost":5.0,
               "fields":{
                  "keyword":{
                     "type":"keyword",
                     "ignore_above":256
                  }
               }
            },
            "type":{
               "type":"keyword"
            }
         }
      },
      "settings":{
         "index":{
            "number_of_shards":"1",
            "provided_name":"nameofmyindex",
            "creation_date":"1579792687839",
            "analysis":{
               "filter":{
                  "stop":{
                     "type":"stop",
                     "stopwords":[
                        "i",
                        "me",
                        "my",
                        "myself"
                     ]
                  },
                  "synonym":{
                     "type":"synonym",
                     "lenient":"true",
                     "synonyms":[
                        "P-Card, P Card => P-Card",
                        "limousinetesting => limousine"
                     ]
                  }
               },
               "analyzer":{
                  "default":{
                     "type":"custom",
                     "filters":[
                        "lowercase",
                        "stop",
                        "synonym"
                     ],
                     "tokenizer":"standard"
                  }
               }
            },
            "number_of_replicas":"1",
            "uuid":"QTlVnyWVRLayEfPWTrcgdg",
            "version":{
               "created":"7050199"
            }
         }
      }
   }
}

如您所见,配置了过滤器的默认分析器并未生效:词 'Limousinetesting' 没有被替换为其同义词 'limousine'。

分析器应该如何配置,过滤器才会生效?在这种情况下,即使是最简单的过滤器(lowercase,小写转换)也没有被应用。

问题出在您创建索引设置时使用的语法上,我已经重现并修复了该问题。您在分析器定义中使用了键名 filters 来列出所有过滤器,而正确的键名应为 filter(单数)——尽管该数组中可以定义多个过滤器,正如 ES 官方示例(official example)中所示。

以下是创建索引的正确格式(注意 default 分析器中的键名已由 filters 改为 filter):

{
    "mappings": {
        "properties": {
            "author": {
                "type": "integer"
            },
            "body:value": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "changed": {
                "type": "date",
                "format": "epoch_second"
            },
            "created": {
                "type": "date",
                "format": "epoch_second"
            },
            "id": {
                "type": "keyword"
            },
            "promote": {
                "type": "boolean"
            },
            "search_api_language": {
                "type": "keyword"
            },
            "sticky": {
                "type": "boolean"
            },
            "title": {
                "type": "text",
                "boost": 5,
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            },
            "type": {
                "type": "keyword"
            }
        }
    },
    "settings": {
        "index": {
            "number_of_shards": "1",
            "analysis": {
                "filter": {
                    "stop": {
                        "type": "stop",
                        "stopwords": [
                            "i",
                            "me",
                            "my",
                            "myself"
                        ]
                    },
                    "synonym": {
                        "type": "synonym",
                        "lenient": "true",
                        "synonyms": [
                            "P-Card, P Card => P-Card",
                            "limousinetesting => limousine"
                        ]
                    }
                },
                "analyzer": {
                    "default": {
                        "type": "custom",
                        "filter": [
                            "lowercase",
                            "stop",
                            "synonym"
                        ],
                        "tokenizer": "standard"
                    }
                }
            },
            "number_of_replicas": "1"
        }
    }
}

现在,使用上述设置和映射创建索引,并用您的文本调用 _analyze API 后,我得到了其同义词词元 limousine,如下面的输出所示。

{
    "tokens": [
        {
            "token": "limousine",
            "start_offset": 0,
            "end_offset": 16,
            "type": "SYNONYM",
            "position": 0
        }
    ]
}