How to find capitalized words with an ngram tokenizer in Elasticsearch 7
I need to be able to search with either temp or TEMP.
Here is my index, which uses an ngram tokenizer, along with some sample documents:
# index
PUT /demo
{
  "settings": {
    "index": {
      "max_ngram_diff": "20",
      "analysis": {
        "analyzer": {
          "account_analyzer": {
            "tokenizer": "account_tokenizer"
          }
        },
        "tokenizer": {
          "account_tokenizer": {
            "token_chars": [
              "letter",
              "digit"
            ],
            "min_gram": "1",
            "type": "ngram",
            "max_gram": "15"
          }
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "account": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        },
        "analyzer": "account_analyzer",
        "search_analyzer": "standard"
      }
    }
  }
}
# docs
PUT /demo/_doc/1
{
  "account": "temp123"
}

PUT /demo/_doc/2
{
  "account": "TEMP456"
}
With the queries below I expected to get both documents back, but I only get doc 1.
It seems I cannot retrieve documents whose value is in capital letters.
How can I get both documents back when searching for either temp or TEMP?
POST /demo/_search/
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "account": {
              "query": "temp",
              "fuzziness": "AUTO"
            }
          }
        }
      ]
    }
  }
}

POST /demo/_search/
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "account": {
              "query": "TEMP",
              "fuzziness": "AUTO"
            }
          }
        }
      ]
    }
  }
}
You can use the _analyze API to inspect the tokens your analyzer produces:

GET demo/_analyze
{
  "analyzer": "account_analyzer",
  "text": ["TEMP123"]
}
"tokens" : [
{
"token" : "T",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "TE",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "TEM",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "TEMP",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 3
},
{
"token" : "TEMP1",
"start_offset" : 0,
"end_offset" : 5,
"type" : "word",
"position" : 4
},
{
"token" : "TEMP12",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 5
},
{
"token" : "TEMP123",
"start_offset" : 0,
"end_offset" : 7,
"type" : "word",
"position" : 6
},
{
"token" : "E",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 7
},
{
"token" : "EM",
"start_offset" : 1,
"end_offset" : 3,
"type" : "word",
"position" : 8
},
{
"token" : "EMP",
"start_offset" : 1,
"end_offset" : 4,
"type" : "word",
"position" : 9
},
{
"token" : "EMP1",
"start_offset" : 1,
"end_offset" : 5,
"type" : "word",
"position" : 10
},
{
"token" : "EMP12",
"start_offset" : 1,
"end_offset" : 6,
"type" : "word",
"position" : 11
},
{
"token" : "EMP123",
"start_offset" : 1,
"end_offset" : 7,
"type" : "word",
"position" : 12
},
{
"token" : "M",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 13
},
{
"token" : "MP",
"start_offset" : 2,
"end_offset" : 4,
"type" : "word",
"position" : 14
},
{
"token" : "MP1",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 15
},
{
"token" : "MP12",
"start_offset" : 2,
"end_offset" : 6,
"type" : "word",
"position" : 16
},
{
"token" : "MP123",
"start_offset" : 2,
"end_offset" : 7,
"type" : "word",
"position" : 17
},
{
"token" : "P",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 18
},
{
"token" : "P1",
"start_offset" : 3,
"end_offset" : 5,
"type" : "word",
"position" : 19
},
{
"token" : "P12",
"start_offset" : 3,
"end_offset" : 6,
"type" : "word",
"position" : 20
},
{
"token" : "P123",
"start_offset" : 3,
"end_offset" : 7,
"type" : "word",
"position" : 21
},
{
"token" : "1",
"start_offset" : 4,
"end_offset" : 5,
"type" : "word",
"position" : 22
},
{
"token" : "12",
"start_offset" : 4,
"end_offset" : 6,
"type" : "word",
"position" : 23
},
{
"token" : "123",
"start_offset" : 4,
"end_offset" : 7,
"type" : "word",
"position" : 24
},
{
"token" : "2",
"start_offset" : 5,
"end_offset" : 6,
"type" : "word",
"position" : 25
},
{
"token" : "23",
"start_offset" : 5,
"end_offset" : 7,
"type" : "word",
"position" : 26
},
{
"token" : "3",
"start_offset" : 6,
"end_offset" : 7,
"type" : "word",
"position" : 27
}
]
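
For comparison, you can run the query text through the standard analyzer, which is what your search_analyzer applies at query time (a quick check against the same demo index; the response below is abbreviated to the relevant field):

GET demo/_analyze
{
  "analyzer": "standard",
  "text": ["TEMP"]
}

# returns a single lowercased token:
# "tokens" : [ { "token" : "temp", ... } ]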
So the index contains only uppercase grams for TEMP456 (TEMP, EMP, MP, ...), while the query string TEMP is lowercased to temp by the standard search analyzer, and the two never match. You need to add a lowercase token filter to your analyzer so that every token it produces is lowercased at index time:
{
  "settings": {
    "index": {
      "max_ngram_diff": "20",
      "analysis": {
        "analyzer": {
          "account_analyzer": {
            "tokenizer": "account_tokenizer",
            "filter": [            <-- add the lowercase token filter here
              "lowercase"
            ]
          }
        },
        "tokenizer": {
          "account_tokenizer": {
            "token_chars": [
              "letter",
              "digit"
            ],
            "min_gram": "1",
            "type": "ngram",
            "max_gram": "15"
          }
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "account": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        },
        "analyzer": "account_analyzer",
        "search_analyzer": "standard"
      }
    }
  }
}
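
Note that analyzers cannot be changed on an existing field, so you would have to delete the demo index, recreate it with the settings above, and index the two documents again. After that, both of your queries should return doc 1 and doc 2. A quick way to verify (a sketch, assuming the index is still named demo and the documents have been reindexed):

GET demo/_analyze
{
  "analyzer": "account_analyzer",
  "text": ["TEMP123"]
}

# the tokens now come back lowercased: t, te, tem, temp, temp1, ...

POST /demo/_search
{
  "query": {
    "match": {
      "account": "TEMP"
    }
  }
}

Since the ngram tokens already cover partial matches (min_gram is 1), the plain match query above is usually enough for this case and fuzziness isn't required, though your original bool/must queries with fuzziness should also return both documents once the indexed tokens are lowercased.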