Elasticsearch 6.8 match_phrase 搜索 N-gram 分词器效果不佳
Elasticsearch 6.8 match_phrase search N-gram tokenizer works not well
我使用 Elasticsearch N-gram tokenizer
并使用 match_phrase
进行模糊匹配
我的索引和测试数据如下:
DELETE /m8
PUT m8
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 1,
"max_gram": 3,
"custom_token_chars":"_."
}
}
},
"max_ngram_diff": 10
},
"mappings": {
"table": {
"properties": {
"dataSourceId": {
"type": "long"
},
"dataSourceType": {
"type": "integer"
},
"dbName": {
"type": "text",
"analyzer": "my_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
PUT /m8/table/1
{
"dataSourceId":1,
"dataSourceType":2,
"dbName":"rm.rf"
}
PUT /m8/table/2
{
"dataSourceId":1,
"dataSourceType":2,
"dbName":"rm_rf"
}
PUT /m8/table/3
{
"dataSourceId":1,
"dataSourceType":2,
"dbName":"rmrf"
}
检查_analyze:
POST m8/_analyze
{
"tokenizer": "my_tokenizer",
"text": "rm.rf"
}
_分析结果:
{
"tokens" : [
{
"token" : "r",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "rm",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "rm.",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "m",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 3
},
{
"token" : "m.",
"start_offset" : 1,
"end_offset" : 3,
"type" : "word",
"position" : 4
},
{
"token" : "m.r",
"start_offset" : 1,
"end_offset" : 4,
"type" : "word",
"position" : 5
},
{
"token" : ".",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 6
},
{
"token" : ".r",
"start_offset" : 2,
"end_offset" : 4,
"type" : "word",
"position" : 7
},
{
"token" : ".rf",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 8
},
{
"token" : "r",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 9
},
{
"token" : "rf",
"start_offset" : 3,
"end_offset" : 5,
"type" : "word",
"position" : 10
},
{
"token" : "f",
"start_offset" : 4,
"end_offset" : 5,
"type" : "word",
"position" : 11
}
]
}
当我搜索 'rm' 时,没有找到:
GET /m8/table/_search
{
"query": {
"bool": {
"must": [
{
"match_phrase": {
"dbName": "rm"
}
}
]
}
}
}
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 0,
"max_score" : null,
"hits" : [ ]
}
}
但是可以找到“.rf”:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 1.7260926,
"hits" : [
{
"_index" : "m8",
"_type" : "table",
"_id" : "1",
"_score" : 1.7260926,
"_source" : {
"dataSourceId" : 1,
"dataSourceType" : 2,
"dbName" : "rm.rf"
}
}
]
}
}
我的问题:
为什么 _analyze 明明已经拆分出了 'rm' 这个词元,用 match_phrase 搜索 'rm' 却找不到?
my_analyzer 也将在搜索期间使用。
"mapping":{
"dbName": {
"type": "text",
"analyzer": "my_analyzer"
"search_analyzer":"my_analyzer" // <==== If you don't provide a search analyzer then what you defined in analyzer will be used during search time as well.
match_phrase 查询根据分析后词元的位置来匹配短语。例如,搜索 "Kal ho" 会匹配分析文本中位置 X 处有 "Kal"、且位置 X+1 处有 "ho" 的文档。
当您搜索 'rm' 时,查询文本同样会先被 my_analyzer 分析成 n-gram 词元(例如 "r" 位置 0、"rm" 位置 1、"m" 位置 2),然后在这些词元之上执行短语匹配,要求它们在索引文档中以连续的位置出现。而索引文档中这些词元的位置并不满足该顺序,因此结果不符合预期。
解法:
使用带有简单匹配查询的标准分析器
GET /m8/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"dbName": {
"query": "rm",
"analyzer": "standard" // <=========
}
}
}
]
}
}
}
OR 在映射期间定义并使用匹配查询(不是 match_phrase)
"mapping":{
"dbName": {
"type": "text",
"analyzer": "my_analyzer"
"search_analyzer":"standard" //<==========
后续问题:为什么要将 match_phrase 查询与 n-gram 分词器一起使用?
我使用 Elasticsearch N-gram tokenizer
并使用 match_phrase
进行模糊匹配
我的索引和测试数据如下:
DELETE /m8
PUT m8
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 1,
"max_gram": 3,
"custom_token_chars":"_."
}
}
},
"max_ngram_diff": 10
},
"mappings": {
"table": {
"properties": {
"dataSourceId": {
"type": "long"
},
"dataSourceType": {
"type": "integer"
},
"dbName": {
"type": "text",
"analyzer": "my_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
PUT /m8/table/1
{
"dataSourceId":1,
"dataSourceType":2,
"dbName":"rm.rf"
}
PUT /m8/table/2
{
"dataSourceId":1,
"dataSourceType":2,
"dbName":"rm_rf"
}
PUT /m8/table/3
{
"dataSourceId":1,
"dataSourceType":2,
"dbName":"rmrf"
}
检查_analyze:
POST m8/_analyze
{
"tokenizer": "my_tokenizer",
"text": "rm.rf"
}
_分析结果:
{
"tokens" : [
{
"token" : "r",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "rm",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "rm.",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "m",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 3
},
{
"token" : "m.",
"start_offset" : 1,
"end_offset" : 3,
"type" : "word",
"position" : 4
},
{
"token" : "m.r",
"start_offset" : 1,
"end_offset" : 4,
"type" : "word",
"position" : 5
},
{
"token" : ".",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 6
},
{
"token" : ".r",
"start_offset" : 2,
"end_offset" : 4,
"type" : "word",
"position" : 7
},
{
"token" : ".rf",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 8
},
{
"token" : "r",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 9
},
{
"token" : "rf",
"start_offset" : 3,
"end_offset" : 5,
"type" : "word",
"position" : 10
},
{
"token" : "f",
"start_offset" : 4,
"end_offset" : 5,
"type" : "word",
"position" : 11
}
]
}
当我搜索 'rm' 时,没有找到:
GET /m8/table/_search
{
"query": {
"bool": {
"must": [
{
"match_phrase": {
"dbName": "rm"
}
}
]
}
}
}
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 0,
"max_score" : null,
"hits" : [ ]
}
}
但是可以找到“.rf”:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 1.7260926,
"hits" : [
{
"_index" : "m8",
"_type" : "table",
"_id" : "1",
"_score" : 1.7260926,
"_source" : {
"dataSourceId" : 1,
"dataSourceType" : 2,
"dbName" : "rm.rf"
}
}
]
}
}
我的问题: 为什么 _analyze 明明已经拆分出了 'rm' 这个词元,用 match_phrase 搜索 'rm' 却找不到?
my_analyzer 也将在搜索期间使用。
"mapping":{ "dbName": { "type": "text", "analyzer": "my_analyzer" "search_analyzer":"my_analyzer" // <==== If you don't provide a search analyzer then what you defined in analyzer will be used during search time as well.
match_phrase 查询根据分析后词元的位置来匹配短语。例如,搜索 "Kal ho" 会匹配分析文本中位置 X 处有 "Kal"、且位置 X+1 处有 "ho" 的文档。
当您搜索 'rm' 时,查询文本同样会先被 my_analyzer 分析成 n-gram 词元(例如 "r" 位置 0、"rm" 位置 1、"m" 位置 2),然后在这些词元之上执行短语匹配,要求它们在索引文档中以连续的位置出现。而索引文档中这些词元的位置并不满足该顺序,因此结果不符合预期。
解法:
使用带有简单匹配查询的标准分析器
GET /m8/_search { "query": { "bool": { "must": [ { "match": { "dbName": { "query": "rm", "analyzer": "standard" // <========= } } } ] } } }
OR 在映射期间定义并使用匹配查询(不是 match_phrase)
"mapping":{ "dbName": { "type": "text", "analyzer": "my_analyzer" "search_analyzer":"standard" //<==========
后续问题:为什么要将 match_phrase 查询与 n-gram 分词器一起使用?