Elasticsearch 自定义分析器,使用 ngram 且不在连字符处拆分单词
Elasticsearch custom analyzer with ngram and without word delimiter on hyphens
我正在尝试索引包含连字符但不包含空格、句点或任何其他标点符号的字符串。我不想根据连字符拆分单词,而是希望连字符成为索引文本的一部分。
例如,我的 6 个文本字符串是:
- magazineplayon
- magazineofhorses
- online-magazine
- best-magazine
- friend-of-magazines
- magazineplaygames
我希望能够在这些字符串中搜索包含 "play" 的文本,或以 "magazine" 开头的文本。
我已经能够使用 ngram 让包含 "play" 的搜索正常工作。但是,连字符会导致文本被拆分,结果中还包含了 "magazine" 出现在连字符之后的字符串。我只希望返回以 "magazine" 开头的字符串。
根据上面的示例,搜索以 "magazine" 开头的文本时,应该只出现这 3 个:
- magazineplayon
- magazineofhorses
- magazineplaygames
请帮忙看看。下面是我的 Elasticsearch 索引示例:
DELETE /sample
PUT /sample
{
"settings": {
"index.number_of_shards":5,
"index.number_of_replicas": 0,
"analysis": {
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit"
]
},
"word_delimiter_filter": {
"type": "word_delimiter",
"preserve_original": true,
"catenate_all" : true
}
},
"analyzer": {
"ngram_index_analyzer": {
"type" : "custom",
"tokenizer": "lowercase",
"filter" : ["nGram_filter", "word_delimiter_filter"]
}
}
}
}
}
PUT /sample/1/_create
{
"name" : "magazineplayon"
}
PUT /sample/3/_create
{
"name" : "magazineofhorses"
}
PUT /sample/4/_create
{
"name" : "online-magazine"
}
PUT /sample/5/_create
{
"name" : "best-magazine"
}
PUT /sample/6/_create
{
"name" : "friend-of-magazines"
}
PUT /sample/7/_create
{
"name" : "magazineplaygames"
}
GET /sample/_search
{
"query": {
"wildcard": {
"name": "*play*"
}
}
}
GET /sample/_search
{
"query": {
"wildcard": {
"name": "magazine*"
}
}
}
更新 1
我更新了所有创建语句,在路径中的 sample 之后加上了 test 类型:
PUT /sample/test/7/_create
{
"name" : "magazinefairplay"
}
然后我运行以下命令,只返回包含单词 "play" 的名称,而不再使用通配符搜索。这工作正常,只返回了两条记录。
POST /sample/test/_search
{
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{"match": { "name.substrings": "play" }}
]
}
}
}
我运行以下命令,希望只返回以 "magazine" 开头的名称。我的预期是 "online-magazine"、"best-magazine" 和 "friend-of-magazines" 不会出现。然而,包括这三条在内的全部七条记录都被返回了。
POST /sample/test/_search
{
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{"match": { "name.prefixes": "magazine" }}
]
}
}
}
有没有办法过滤掉使用连字符的前缀?
您的思路是对的,但还需要再添加一个使用 edge-ngram token filter 的分析器,才能让 "starts with"(以……开头)的约束生效。您可以保留 ngram 来检查字段是否 "contain"(包含)给定的词,但要检查字段是否以某个词开头,就需要 edge-ngram。
PUT /sample
{
"settings": {
"index.number_of_shards": 5,
"index.number_of_replicas": 0,
"analysis": {
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit"
]
},
"edgenGram_filter": {
"type": "edgeNGram",
"min_gram": 2,
"max_gram": 20
}
},
"analyzer": {
"ngram_index_analyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"lowercase",
"nGram_filter"
]
},
"edge_ngram_index_analyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"lowercase",
"edgenGram_filter"
]
}
}
}
},
"mappings": {
"test": {
"properties": {
"name": {
"type": "string",
"fields": {
"prefixes": {
"type": "string",
"analyzer": "edge_ngram_index_analyzer",
"search_analyzer": "standard"
},
"substrings": {
"type": "string",
"analyzer": "ngram_index_analyzer",
"search_analyzer": "standard"
}
}
}
}
}
}
}
这样您的查询就变为如下形式(即搜索 name 字段包含 play 或以 magazine 开头的所有文档):
POST /sample/test/_search
{
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{"match": { "name.substrings": "play" }},
{"match": { "name.prefixes": "magazine" }}
]
}
}
}
我正在尝试索引包含连字符但不包含空格、句点或任何其他标点符号的字符串。我不想根据连字符拆分单词,而是希望连字符成为索引文本的一部分。
例如,我的 6 个文本字符串是:
- magazineplayon
- magazineofhorses
- online-magazine
- best-magazine
- friend-of-magazines
- magazineplaygames
我希望能够在这些字符串中搜索包含 "play" 的文本,或以 "magazine" 开头的文本。
我已经能够使用 ngram 让包含 "play" 的搜索正常工作。但是,连字符会导致文本被拆分,结果中还包含了 "magazine" 出现在连字符之后的字符串。我只希望返回以 "magazine" 开头的字符串。
根据上面的示例,搜索以 "magazine" 开头的文本时,应该只出现这 3 个:
- magazineplayon
- magazineofhorses
- magazineplaygames
请帮忙看看。下面是我的 Elasticsearch 索引示例:
DELETE /sample
PUT /sample
{
"settings": {
"index.number_of_shards":5,
"index.number_of_replicas": 0,
"analysis": {
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit"
]
},
"word_delimiter_filter": {
"type": "word_delimiter",
"preserve_original": true,
"catenate_all" : true
}
},
"analyzer": {
"ngram_index_analyzer": {
"type" : "custom",
"tokenizer": "lowercase",
"filter" : ["nGram_filter", "word_delimiter_filter"]
}
}
}
}
}
PUT /sample/1/_create
{
"name" : "magazineplayon"
}
PUT /sample/3/_create
{
"name" : "magazineofhorses"
}
PUT /sample/4/_create
{
"name" : "online-magazine"
}
PUT /sample/5/_create
{
"name" : "best-magazine"
}
PUT /sample/6/_create
{
"name" : "friend-of-magazines"
}
PUT /sample/7/_create
{
"name" : "magazineplaygames"
}
GET /sample/_search
{
"query": {
"wildcard": {
"name": "*play*"
}
}
}
GET /sample/_search
{
"query": {
"wildcard": {
"name": "magazine*"
}
}
}
更新 1:我更新了所有创建语句,在路径中的 sample 之后加上了 test 类型:
PUT /sample/test/7/_create
{
"name" : "magazinefairplay"
}
然后我运行以下命令,只返回包含单词 "play" 的名称,而不再使用通配符搜索。这工作正常,只返回了两条记录。
POST /sample/test/_search
{
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{"match": { "name.substrings": "play" }}
]
}
}
}
我运行以下命令,希望只返回以 "magazine" 开头的名称。我的预期是 "online-magazine"、"best-magazine" 和 "friend-of-magazines" 不会出现。然而,包括这三条在内的全部七条记录都被返回了。
POST /sample/test/_search
{
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{"match": { "name.prefixes": "magazine" }}
]
}
}
}
有没有办法过滤掉使用连字符的前缀?
您的思路是对的,但还需要再添加一个使用 edge-ngram token filter 的分析器,才能让 "starts with"(以……开头)的约束生效。您可以保留 ngram 来检查字段是否 "contain"(包含)给定的词,但要检查字段是否以某个词开头,就需要 edge-ngram。
PUT /sample
{
"settings": {
"index.number_of_shards": 5,
"index.number_of_replicas": 0,
"analysis": {
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit"
]
},
"edgenGram_filter": {
"type": "edgeNGram",
"min_gram": 2,
"max_gram": 20
}
},
"analyzer": {
"ngram_index_analyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"lowercase",
"nGram_filter"
]
},
"edge_ngram_index_analyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"lowercase",
"edgenGram_filter"
]
}
}
}
},
"mappings": {
"test": {
"properties": {
"name": {
"type": "string",
"fields": {
"prefixes": {
"type": "string",
"analyzer": "edge_ngram_index_analyzer",
"search_analyzer": "standard"
},
"substrings": {
"type": "string",
"analyzer": "ngram_index_analyzer",
"search_analyzer": "standard"
}
}
}
}
}
}
}
这样您的查询就变为如下形式(即搜索 name 字段包含 play 或以 magazine 开头的所有文档):
POST /sample/test/_search
{
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{"match": { "name.substrings": "play" }},
{"match": { "name.prefixes": "magazine" }}
]
}
}
}