ElasticSearch - 带过滤器的自定义分析器 - 未应用过滤器
ElasticSearch - custom analyzer with filters - filters not applied
我有以下查询:
GET /nameofmyindex/_analyze
{
"text" : "Limousinetesting",
"explain": true,
"analyzer": "default"
}
返回结果如下:
{
"detail" : {
"custom_analyzer" : true,
"charfilters" : [ ],
"tokenizer" : {
"name" : "standard",
"tokens" : [
{
"token" : "Limousinetesting",
"start_offset" : 0,
"end_offset" : 16,
"type" : "<ALPHANUM>",
"position" : 0,
"bytes" : "[4c 69 6d 6f 75 73 69 6e 65 74 65 73 74 69 6e 67]",
"positionLength" : 1,
"termFrequency" : 1
}
]
},
"tokenfilters" : [ ]
}
}
我的索引配置如下所示:
{
"nameofmyindex":{
"aliases":{
},
"mappings":{
"properties":{
"author":{
"type":"integer"
},
"body:value":{
"type":"text",
"fields":{
"keyword":{
"type":"keyword",
"ignore_above":256
}
}
},
"changed":{
"type":"date",
"format":"epoch_second"
},
"created":{
"type":"date",
"format":"epoch_second"
},
"id":{
"type":"keyword"
},
"promote":{
"type":"boolean"
},
"search_api_language":{
"type":"keyword"
},
"sticky":{
"type":"boolean"
},
"title":{
"type":"text",
"boost":5.0,
"fields":{
"keyword":{
"type":"keyword",
"ignore_above":256
}
}
},
"type":{
"type":"keyword"
}
}
},
"settings":{
"index":{
"number_of_shards":"1",
"provided_name":"nameofmyindex",
"creation_date":"1579792687839",
"analysis":{
"filter":{
"stop":{
"type":"stop",
"stopwords":[
"i",
"me",
"my",
"myself"
]
},
"synonym":{
"type":"synonym",
"lenient":"true",
"synonyms":[
"P-Card, P Card => P-Card",
"limousinetesting => limousine"
]
}
},
"analyzer":{
"default":{
"type":"custom",
"filters":[
"lowercase",
"stop",
"synonym"
],
"tokenizer":"standard"
}
}
},
"number_of_replicas":"1",
"uuid":"QTlVnyWVRLayEfPWTrcgdg",
"version":{
"created":"7050199"
}
}
}
}
}
如您所见,默认分析器中配置的过滤器并未生效,'Limousinetesting' 一词没有被替换为其同义词 'limousine'。
分析器应该如何配置,过滤器才能生效?即使是最简单的过滤器(此处为 lowercase,即转小写)也没有被应用。
问题出在您创建索引设置时使用的语法上,我重现了您的问题并修复了它。问题在于您使用 filters 作为键名
来定义包含所有过滤器的 JSON 数组,而正确的键名应该是 filter(单数形式),
尽管您可以在该数组中定义多个过滤器,正如 ES 官方示例(ES official example)中所说明的那样。
请在下面找到创建索引的正确格式:
{
"mappings": {
"properties": {
"author": {
"type": "integer"
},
"body:value": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"changed": {
"type": "date",
"format": "epoch_second"
},
"created": {
"type": "date",
"format": "epoch_second"
},
"id": {
"type": "keyword"
},
"promote": {
"type": "boolean"
},
"search_api_language": {
"type": "keyword"
},
"sticky": {
"type": "boolean"
},
"title": {
"type": "text",
"boost": 5,
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"type": {
"type": "keyword"
}
}
},
"settings": {
"index": {
"number_of_shards": "1",
"analysis": {
"filter": {
"stop": {
"type": "stop",
"stopwords": [
"i",
"me",
"my",
"myself"
]
},
"synonym": {
"type": "synonym",
"lenient": "true",
"synonyms": [
"P-Card, P Card => P-Card",
"limousinetesting => limousine"
]
}
},
"analyzer": {
"default": {
"type": "custom",
"filter": [ --> Notice the change in filters to filter
"lowercase",
"stop",
"synonym"
],
"tokenizer": "standard"
}
}
},
"number_of_replicas": "1"
}
}
}
现在,当我使用上述设置和映射创建索引,并用您的文本调用 _analyze API 时,得到了它的同义词词元 limousine,
如下面的输出所示。
{
"tokens": [
{
"token": "limousine",
"start_offset": 0,
"end_offset": 16,
"type": "SYNONYM",
"position": 0
}
]
}
我有以下查询:
GET /nameofmyindex/_analyze
{
"text" : "Limousinetesting",
"explain": true,
"analyzer": "default"
}
返回结果如下:
{
"detail" : {
"custom_analyzer" : true,
"charfilters" : [ ],
"tokenizer" : {
"name" : "standard",
"tokens" : [
{
"token" : "Limousinetesting",
"start_offset" : 0,
"end_offset" : 16,
"type" : "<ALPHANUM>",
"position" : 0,
"bytes" : "[4c 69 6d 6f 75 73 69 6e 65 74 65 73 74 69 6e 67]",
"positionLength" : 1,
"termFrequency" : 1
}
]
},
"tokenfilters" : [ ]
}
}
我的索引配置如下所示:
{
"nameofmyindex":{
"aliases":{
},
"mappings":{
"properties":{
"author":{
"type":"integer"
},
"body:value":{
"type":"text",
"fields":{
"keyword":{
"type":"keyword",
"ignore_above":256
}
}
},
"changed":{
"type":"date",
"format":"epoch_second"
},
"created":{
"type":"date",
"format":"epoch_second"
},
"id":{
"type":"keyword"
},
"promote":{
"type":"boolean"
},
"search_api_language":{
"type":"keyword"
},
"sticky":{
"type":"boolean"
},
"title":{
"type":"text",
"boost":5.0,
"fields":{
"keyword":{
"type":"keyword",
"ignore_above":256
}
}
},
"type":{
"type":"keyword"
}
}
},
"settings":{
"index":{
"number_of_shards":"1",
"provided_name":"nameofmyindex",
"creation_date":"1579792687839",
"analysis":{
"filter":{
"stop":{
"type":"stop",
"stopwords":[
"i",
"me",
"my",
"myself"
]
},
"synonym":{
"type":"synonym",
"lenient":"true",
"synonyms":[
"P-Card, P Card => P-Card",
"limousinetesting => limousine"
]
}
},
"analyzer":{
"default":{
"type":"custom",
"filters":[
"lowercase",
"stop",
"synonym"
],
"tokenizer":"standard"
}
}
},
"number_of_replicas":"1",
"uuid":"QTlVnyWVRLayEfPWTrcgdg",
"version":{
"created":"7050199"
}
}
}
}
}
如您所见,默认分析器中配置的过滤器并未生效,'Limousinetesting' 一词没有被替换为其同义词 'limousine'。
分析器应该如何配置,过滤器才能生效?即使是最简单的过滤器(此处为 lowercase,即转小写)也没有被应用。
问题出在您创建索引设置时使用的语法上,我重现了您的问题并修复了它。问题在于您使用 filters 作为键名
来定义包含所有过滤器的 JSON 数组,而正确的键名应该是 filter(单数形式),
尽管您可以在该数组中定义多个过滤器,正如 ES 官方示例(ES official example)中所说明的那样。
请在下面找到创建索引的正确格式:
{
"mappings": {
"properties": {
"author": {
"type": "integer"
},
"body:value": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"changed": {
"type": "date",
"format": "epoch_second"
},
"created": {
"type": "date",
"format": "epoch_second"
},
"id": {
"type": "keyword"
},
"promote": {
"type": "boolean"
},
"search_api_language": {
"type": "keyword"
},
"sticky": {
"type": "boolean"
},
"title": {
"type": "text",
"boost": 5,
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"type": {
"type": "keyword"
}
}
},
"settings": {
"index": {
"number_of_shards": "1",
"analysis": {
"filter": {
"stop": {
"type": "stop",
"stopwords": [
"i",
"me",
"my",
"myself"
]
},
"synonym": {
"type": "synonym",
"lenient": "true",
"synonyms": [
"P-Card, P Card => P-Card",
"limousinetesting => limousine"
]
}
},
"analyzer": {
"default": {
"type": "custom",
"filter": [ --> Notice the change in filters to filter
"lowercase",
"stop",
"synonym"
],
"tokenizer": "standard"
}
}
},
"number_of_replicas": "1"
}
}
}
现在,当我使用上述设置和映射创建索引,并用您的文本调用 _analyze API 时,得到了它的同义词词元 limousine,
如下面的输出所示。
{
"tokens": [
{
"token": "limousine",
"start_offset": 0,
"end_offset": 16,
"type": "SYNONYM",
"position": 0
}
]
}