testing an elasticsearch custom analyzer - pipe delimited keywords
I have this index with pipe as a custom analyzer. When I try to test it, it returns every single character rather than the pipe-delimited words.

I'm trying to build a use case where my input line keywords looks like:

crockpot refried beans|corningware replacement|crockpot lids|recipe refried beans

and Elasticsearch will return the matches after splitting it on the pipes.
{
  "keywords": {
    "aliases": {},
    "mappings": {
      "cloud": {
        "properties": {
          "keywords": {
            "type": "text",
            "analyzer": "pipe"
          }
        }
      }
    },
    "settings": {
      "index": {
        "number_of_shards": "5",
        "provided_name": "keywords",
        "creation_date": "1513890909384",
        "analysis": {
          "analyzer": {
            "pipe": {
              "type": "custom",
              "tokenizer": "pipe"
            }
          },
          "tokenizer": {
            "pipe": {
              "pattern": "|",
              "type": "pattern"
            }
          }
        },
        "number_of_replicas": "1",
        "uuid": "DOLV_FBbSC2CBU4p7oT3yw",
        "version": {
          "created": "6000099"
        }
      }
    }
  }
}
When I try to test it following this guide:
curl -XPOST 'http://localhost:9200/keywords/_analyze' -H 'Content-Type: application/json' -d '{
  "analyzer": "pipe",
  "text": "pipe|pipe2"
}'
I get character-by-character results:
{
  "tokens": [
    {
      "token": "p",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "i",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "p",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    },
    {
      "token": "e",
      "start_offset": 3,
      "end_offset": 4,
      "type": "word",
      "position": 3
    },
    ...
Good work, you're almost there. Since the pipe character | is a reserved character in regular expressions, it needs to be escaped like this:
"tokenizer": {
"pipe": {
"pattern": "\|", <--- change this
"type": "pattern"
}
}
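One caveat before re-testing: analysis settings can't be changed on an open index, so the fix has to be applied either by closing the index, updating its settings, and reopening it, or by recreating the index. Here is a minimal sketch of the recreate route, assuming the documents can be reindexed afterwards (the settings and mappings are copied from the question):

# drop the old index (assumes its data can be rebuilt from source)
curl -XDELETE 'http://localhost:9200/keywords'

# recreate it with the escaped pipe pattern
curl -XPUT 'http://localhost:9200/keywords' -H 'Content-Type: application/json' -d '{
  "settings": {
    "analysis": {
      "analyzer": {
        "pipe": {
          "type": "custom",
          "tokenizer": "pipe"
        }
      },
      "tokenizer": {
        "pipe": {
          "type": "pattern",
          "pattern": "\\|"
        }
      }
    }
  },
  "mappings": {
    "cloud": {
      "properties": {
        "keywords": {
          "type": "text",
          "analyzer": "pipe"
        }
      }
    }
  }
}'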
Then your analyzer will work and produce this:
{
  "tokens": [
    {
      "token": "pipe",
      "start_offset": 0,
      "end_offset": 4,
      "type": "word",
      "position": 0
    },
    {
      "token": "pipe2",
      "start_offset": 5,
      "end_offset": 10,
      "type": "word",
      "position": 1
    }
  ]
}
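To check the original use case end to end, here is a sketch that indexes the sample line and then searches for one of its segments (the document id and query text are just for illustration). Since the match query analyzes its input with the field's pipe analyzer, "crockpot lids" stays a single token and matches the indexed token exactly:

# index the sample keywords line (?refresh makes it searchable immediately)
curl -XPUT 'http://localhost:9200/keywords/cloud/1?refresh' -H 'Content-Type: application/json' -d '{
  "keywords": "crockpot refried beans|corningware replacement|crockpot lids|recipe refried beans"
}'

# search for one pipe-delimited segment; this should return the document above
curl -XPOST 'http://localhost:9200/keywords/_search' -H 'Content-Type: application/json' -d '{
  "query": {
    "match": {
      "keywords": "crockpot lids"
    }
  }
}'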