Elastic Search 多词同义词无法按预期工作
Elastic Search multi word synonyms not working as I'd expected
我有一个 ElasticSearch 搜索引擎,我正在为其添加同义词支持。 unigram 同义词一切顺利,但是当开始处理多词同义词时,一切都搞砸了。
例如,我希望查询 "ice cream" 能返回每一个谈论 "ice cream"、"gelato" 或 "icecream" 的文档。
我的映射设置如下
PUT stam_test_1
{
"settings": {
"analysis": {
"filter": {
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"plural_stemmer": {
"name": "minimal_english",
"type": "stemmer"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
},
"english_graph_synonyms": {
"type": "synonym_graph",
"tokenizer": "standard",
"expand": true,
"synonyms": [
"ice cream, icecream, creamery, gelato",
"dim sum, dim sim, dimsim",
"ube, purple yam",
"sf, san francisco"
]
},
"english_synonyms": {
"type": "synonym",
"expand": true,
"tokenizer": "standard",
"synonyms": [
"burger, hamburger, slider",
"chicken, pollo",
"pork, pig, porc",
"barbeque, bbq, barbecue",
"sauce, dressing"
]
}
},
"analyzer": {
"english": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"plural_stemmer",
"english_stop",
"english_stemmer",
"asciifolding",
"english_synonyms"
]
},
"english_search": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"plural_stemmer",
"english_stop",
"english_stemmer",
"asciifolding",
"english_graph_synonyms"
]
}
}
}
},
"mappings": {
"properties": {
"text_field": {
"type": "text",
"fields": {
"post_text": {
"type": "text",
"analyzer": "english",
"search_analyzer": "english_search"
}
}
}
}
}
}
我正在添加一些文档
POST _bulk
{ "index" : { "_index" : "stam_test_1", "_id" : "1" } }
{ "post_text" : "Love this ice cream so much!!!"}
{ "index" : { "_index" : "stam_test_1", "_id" : "2" } }
{ "post_text" : "Great gelato and a tasty burger"}
{ "index" : { "_index" : "stam_test_1", "_id" : "3" } }
{ "post_text" : "I bought coke but did not get any ice with it" }
{ "index" : { "_index" : "stam_test_1", "_id" : "4" } }
{ "post_text" : "ic cream" }
当我查询 "ice cream" 时
GET /stam_test_1/_search
{
"query": {
"match": {
"post_text": {
"query": "ice cream",
"analyzer": "english_search"}
}
}
}
我得到以下结果
{
"took" : 4,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 2.6678555,
"hits" : [
{
"_index" : "stam_test_1",
"_type" : "_doc",
"_id" : "10",
"_score" : 2.6678555,
"_source" : {
"post_text" : "ic cream"
}
},
{
"_index" : "stam_test_1",
"_type" : "_doc",
"_id" : "2",
"_score" : 0.6931472,
"_source" : {
"post_text" : "Great gelato and a tasty burger"
}
}
]
}
}
您可以看到,我有意添加了一个已经词干化过的文档——"ic cream",它如我所料被返回了,但我没有得到第一个文档 "Love this ice cream so much!!!"。
当我在 "ice cream"
上直接测试分析器时
GET stam_test_1/_analyze?
{
"analyzer": "english_search",
"text" : "ice cream"
}
它返回
{
"tokens" : [
{
"token" : "icecream",
"start_offset" : 0,
"end_offset" : 9,
"type" : "SYNONYM",
"position" : 0,
"positionLength" : 2
},
{
"token" : "softserv",
"start_offset" : 0,
"end_offset" : 9,
"type" : "SYNONYM",
"position" : 0,
"positionLength" : 2
},
{
"token" : "icream",
"start_offset" : 0,
"end_offset" : 9,
"type" : "SYNONYM",
"position" : 0,
"positionLength" : 2
},
{
"token" : "creameri",
"start_offset" : 0,
"end_offset" : 9,
"type" : "SYNONYM",
"position" : 0,
"positionLength" : 2
},
{
"token" : "gelato",
"start_offset" : 0,
"end_offset" : 9,
"type" : "SYNONYM",
"position" : 0,
"positionLength" : 2
},
{
"token" : "ic",
"start_offset" : 0,
"end_offset" : 3,
"type" : "<ALPHANUM>",
"position" : 0
},
{
"token" : "cream",
"start_offset" : 4,
"end_offset" : 9,
"type" : "<ALPHANUM>",
"position" : 1
}
]
}
单词同义词返回正确,但多词同义词被(每个标记单独)词干化,而且似乎实际文档并没有被词干化(这就是为什么我得到了 'ic cream' 那个文档)。
我确定这只是错误设置的定义。我试图用 "keyword" 而不是 "standard" 替换 english_search 分析器的分词器,但也没有成功。
关于如何处理这个问题有什么建议吗? synonyms_graph 功能的文档和 Google 结果非常少。
所以我的错误是映射定义。我不应该定义字段,我所要做的就是使用以下映射,这样就可以正常工作了
"mappings": {
"properties": {
"post_text": {
"type": "text",
"analyzer": "english",
"search_analyzer": "english_search"
}
}
}
我有一个 ElasticSearch 搜索引擎,我正在为其添加同义词支持。 unigram 同义词一切顺利,但是当开始处理多词同义词时,一切都搞砸了。
例如,我希望查询 "ice cream" 能返回每一个谈论 "ice cream"、"gelato" 或 "icecream" 的文档。
我的映射设置如下
PUT stam_test_1
{
"settings": {
"analysis": {
"filter": {
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"plural_stemmer": {
"name": "minimal_english",
"type": "stemmer"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
},
"english_graph_synonyms": {
"type": "synonym_graph",
"tokenizer": "standard",
"expand": true,
"synonyms": [
"ice cream, icecream, creamery, gelato",
"dim sum, dim sim, dimsim",
"ube, purple yam",
"sf, san francisco"
]
},
"english_synonyms": {
"type": "synonym",
"expand": true,
"tokenizer": "standard",
"synonyms": [
"burger, hamburger, slider",
"chicken, pollo",
"pork, pig, porc",
"barbeque, bbq, barbecue",
"sauce, dressing"
]
}
},
"analyzer": {
"english": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"plural_stemmer",
"english_stop",
"english_stemmer",
"asciifolding",
"english_synonyms"
]
},
"english_search": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"plural_stemmer",
"english_stop",
"english_stemmer",
"asciifolding",
"english_graph_synonyms"
]
}
}
}
},
"mappings": {
"properties": {
"text_field": {
"type": "text",
"fields": {
"post_text": {
"type": "text",
"analyzer": "english",
"search_analyzer": "english_search"
}
}
}
}
}
}
我正在添加一些文档
POST _bulk
{ "index" : { "_index" : "stam_test_1", "_id" : "1" } }
{ "post_text" : "Love this ice cream so much!!!"}
{ "index" : { "_index" : "stam_test_1", "_id" : "2" } }
{ "post_text" : "Great gelato and a tasty burger"}
{ "index" : { "_index" : "stam_test_1", "_id" : "3" } }
{ "post_text" : "I bought coke but did not get any ice with it" }
{ "index" : { "_index" : "stam_test_1", "_id" : "4" } }
{ "post_text" : "ic cream" }
当我查询 "ice cream" 时
GET /stam_test_1/_search
{
"query": {
"match": {
"post_text": {
"query": "ice cream",
"analyzer": "english_search"}
}
}
}
我得到以下结果
{
"took" : 4,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 2,
"relation" : "eq"
},
"max_score" : 2.6678555,
"hits" : [
{
"_index" : "stam_test_1",
"_type" : "_doc",
"_id" : "10",
"_score" : 2.6678555,
"_source" : {
"post_text" : "ic cream"
}
},
{
"_index" : "stam_test_1",
"_type" : "_doc",
"_id" : "2",
"_score" : 0.6931472,
"_source" : {
"post_text" : "Great gelato and a tasty burger"
}
}
]
}
}
您可以看到,我有意添加了一个已经词干化过的文档——"ic cream",它如我所料被返回了,但我没有得到第一个文档 "Love this ice cream so much!!!"。
当我在 "ice cream" 上直接测试分析器时
GET stam_test_1/_analyze
{
"analyzer": "english_search",
"text" : "ice cream"
}
它返回
{
"tokens" : [
{
"token" : "icecream",
"start_offset" : 0,
"end_offset" : 9,
"type" : "SYNONYM",
"position" : 0,
"positionLength" : 2
},
{
"token" : "softserv",
"start_offset" : 0,
"end_offset" : 9,
"type" : "SYNONYM",
"position" : 0,
"positionLength" : 2
},
{
"token" : "icream",
"start_offset" : 0,
"end_offset" : 9,
"type" : "SYNONYM",
"position" : 0,
"positionLength" : 2
},
{
"token" : "creameri",
"start_offset" : 0,
"end_offset" : 9,
"type" : "SYNONYM",
"position" : 0,
"positionLength" : 2
},
{
"token" : "gelato",
"start_offset" : 0,
"end_offset" : 9,
"type" : "SYNONYM",
"position" : 0,
"positionLength" : 2
},
{
"token" : "ic",
"start_offset" : 0,
"end_offset" : 3,
"type" : "<ALPHANUM>",
"position" : 0
},
{
"token" : "cream",
"start_offset" : 4,
"end_offset" : 9,
"type" : "<ALPHANUM>",
"position" : 1
}
]
}
单词同义词返回正确,但多词同义词被(每个标记单独)词干化,而且似乎实际文档并没有被词干化(这就是为什么我得到了 'ic cream' 那个文档)。
我确定这只是错误设置的定义。我试图用 "keyword" 而不是 "standard" 替换 english_search 分析器的分词器,但也没有成功。
关于如何处理这个问题有什么建议吗? synonyms_graph 功能的文档和 Google 结果非常少。
所以我的错误是映射定义。我不应该定义字段,我所要做的就是使用以下映射,这样就可以正常工作了
"mappings": {
"properties": {
"post_text": {
"type": "text",
"analyzer": "english",
"search_analyzer": "english_search"
}
}
}