ElasticSearch - Problems with edgeNGram tokenizer
I am using ElasticSearch to index a database. I am trying to use the edgeNGram tokenizer to cut strings down to strings satisfying the requirement "new string must be longer than 4 chars".
I create the index with the following code:
PUT test
POST /test/_close
PUT /test/_settings
{
  "analysis": {
    "analyzer": {
      "index_edge_ngram": {
        "type": "custom",
        "filter": ["custom_word_delimiter"],
        "tokenizer": "left_tokenizer"
      }
    },
    "filter": {
      "custom_word_delimiter": {
        "type": "word_delimiter",
        "generate_word_parts": "true",
        "generate_number_parts": "true",
        "catenate_words": "false",
        "catenate_numbers": "false",
        "catenate_all": "false",
        "split_on_case_change": "false",
        "preserve_original": "false",
        "split_on_numerics": "true",
        "ignore_case": "true"
      }
    },
    "tokenizer": {
      "left_tokenizer": {
        "max_gram": 30,
        "min_gram": 5,
        "type": "edgeNGram"
      }
    }
  }
}
POST /test/_open
Now I run a test to check the results:
GET /test/_analyze?analyzer=index_edge_ngram&text=please pay for multiple wins with only one payment
and get this result:
{
"tokens": [
{
"token": "pleas",
"start_offset": 0,
"end_offset": 5,
"type": "word",
"position": 1
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 2
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 3
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 4
},
{
"token": "p",
"start_offset": 7,
"end_offset": 8,
"type": "word",
"position": 5
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 6
},
{
"token": "pa",
"start_offset": 7,
"end_offset": 9,
"type": "word",
"position": 7
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 8
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 9
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 10
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 11
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 12
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 13
},
{
"token": "f",
"start_offset": 11,
"end_offset": 12,
"type": "word",
"position": 14
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 15
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 16
},
{
"token": "fo",
"start_offset": 11,
"end_offset": 13,
"type": "word",
"position": 17
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 18
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 19
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 20
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 21
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 22
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 23
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 24
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 25
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 26
},
{
"token": "m",
"start_offset": 15,
"end_offset": 16,
"type": "word",
"position": 27
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 28
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 29
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 30
},
{
"token": "mu",
"start_offset": 15,
"end_offset": 17,
"type": "word",
"position": 31
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 32
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 33
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 34
},
{
"token": "mul",
"start_offset": 15,
"end_offset": 18,
"type": "word",
"position": 35
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 36
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 37
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 38
},
{
"token": "mult",
"start_offset": 15,
"end_offset": 19,
"type": "word",
"position": 39
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 40
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 41
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 42
},
{
"token": "multi",
"start_offset": 15,
"end_offset": 20,
"type": "word",
"position": 43
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 44
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 45
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 46
},
{
"token": "multip",
"start_offset": 15,
"end_offset": 21,
"type": "word",
"position": 47
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 48
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 49
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 50
},
{
"token": "multipl",
"start_offset": 15,
"end_offset": 22,
"type": "word",
"position": 51
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 52
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 53
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 54
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 55
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 56
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 57
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 58
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 59
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 60
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 61
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 62
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 63
},
{
"token": "w",
"start_offset": 24,
"end_offset": 25,
"type": "word",
"position": 64
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 65
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 66
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 67
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 68
},
{
"token": "wi",
"start_offset": 24,
"end_offset": 26,
"type": "word",
"position": 69
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 70
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 71
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 72
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 73
},
{
"token": "win",
"start_offset": 24,
"end_offset": 27,
"type": "word",
"position": 74
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 75
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 76
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 77
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 78
},
{
"token": "wins",
"start_offset": 24,
"end_offset": 28,
"type": "word",
"position": 79
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 80
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 81
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 82
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 83
},
{
"token": "wins",
"start_offset": 24,
"end_offset": 28,
"type": "word",
"position": 84
},
{
"token": "please",
"start_offset": 0,
"end_offset": 6,
"type": "word",
"position": 85
},
{
"token": "pay",
"start_offset": 7,
"end_offset": 10,
"type": "word",
"position": 86
},
{
"token": "for",
"start_offset": 11,
"end_offset": 14,
"type": "word",
"position": 87
},
{
"token": "multiple",
"start_offset": 15,
"end_offset": 23,
"type": "word",
"position": 88
},
{
"token": "wins",
"start_offset": 24,
"end_offset": 28,
"type": "word",
"position": 89
},
{
"token": "w",
"start_offset": 29,
"end_offset": 30,
"type": "word",
"position": 90
}
]
}
Here are my questions:
1. Why are there tokens shorter than 5 characters?
2. Why does the "position" property show the position of the token rather than the position of the word in the text? Other tokenizers seem to work that way.
3. Why doesn't the output contain all the words? It seems to stop at "wins".
4. Why is the same token repeated so many times?
When building a custom analyzer, it is worth checking step by step what each stage of the analysis chain produces:
- first, any character filters pre-process the raw input
- then the tokenizer slices and dices that input into tokens
- finally, the token filters take the tokens produced by the tokenizer as input and do their thing
In your case, if you check the result of the tokenizer stage alone, this is what it looks like. Note that we only specify the tokenizer (i.e. left_tokenizer) as a parameter:
curl -XGET 'localhost:9201/test/_analyze?tokenizer=left_tokenizer&pretty' -d 'please pay for multiple wins with only one payment'
The result is:
{
"tokens" : [ {
"token" : "pleas",
"start_offset" : 0,
"end_offset" : 5,
"type" : "word",
"position" : 1
}, {
"token" : "please",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 2
}, {
"token" : "please ",
"start_offset" : 0,
"end_offset" : 7,
"type" : "word",
"position" : 3
}, {
"token" : "please p",
"start_offset" : 0,
"end_offset" : 8,
"type" : "word",
"position" : 4
}, {
"token" : "please pa",
"start_offset" : 0,
"end_offset" : 9,
"type" : "word",
"position" : 5
}, {
"token" : "please pay",
"start_offset" : 0,
"end_offset" : 10,
"type" : "word",
"position" : 6
}, {
"token" : "please pay ",
"start_offset" : 0,
"end_offset" : 11,
"type" : "word",
"position" : 7
}, {
"token" : "please pay f",
"start_offset" : 0,
"end_offset" : 12,
"type" : "word",
"position" : 8
}, {
"token" : "please pay fo",
"start_offset" : 0,
"end_offset" : 13,
"type" : "word",
"position" : 9
}, {
"token" : "please pay for",
"start_offset" : 0,
"end_offset" : 14,
"type" : "word",
"position" : 10
}, {
"token" : "please pay for ",
"start_offset" : 0,
"end_offset" : 15,
"type" : "word",
"position" : 11
}, {
"token" : "please pay for m",
"start_offset" : 0,
"end_offset" : 16,
"type" : "word",
"position" : 12
}, {
"token" : "please pay for mu",
"start_offset" : 0,
"end_offset" : 17,
"type" : "word",
"position" : 13
}, {
"token" : "please pay for mul",
"start_offset" : 0,
"end_offset" : 18,
"type" : "word",
"position" : 14
}, {
"token" : "please pay for mult",
"start_offset" : 0,
"end_offset" : 19,
"type" : "word",
"position" : 15
}, {
"token" : "please pay for multi",
"start_offset" : 0,
"end_offset" : 20,
"type" : "word",
"position" : 16
}, {
"token" : "please pay for multip",
"start_offset" : 0,
"end_offset" : 21,
"type" : "word",
"position" : 17
}, {
"token" : "please pay for multipl",
"start_offset" : 0,
"end_offset" : 22,
"type" : "word",
"position" : 18
}, {
"token" : "please pay for multiple",
"start_offset" : 0,
"end_offset" : 23,
"type" : "word",
"position" : 19
"position" : 20
}, {
"token" : "please pay for multiple w",
"start_offset" : 0,
"end_offset" : 25,
"type" : "word",
"position" : 21
}, {
"token" : "please pay for multiple wi",
"start_offset" : 0,
"end_offset" : 26,
"type" : "word",
"position" : 22
}, {
"token" : "please pay for multiple win",
"start_offset" : 0,
"end_offset" : 27,
"type" : "word",
"position" : 23
}, {
"token" : "please pay for multiple wins",
"start_offset" : 0,
"end_offset" : 28,
"type" : "word",
"position" : 24
}, {
"token" : "please pay for multiple wins ",
"start_offset" : 0,
"end_offset" : 29,
"type" : "word",
"position" : 25
}, {
"token" : "please pay for multiple wins w",
"start_offset" : 0,
"end_offset" : 30,
"type" : "word",
"position" : 26
} ]
}
Then your token filters take each of the tokens above and do their work. For example (you can verify this with the request shown after this list):
- the first token "pleas" comes out as "pleas"
- the second token "please" comes out as "please"
- the third token "please " (note the trailing space) comes out as "please"
- the fourth token "please p" comes out as two tokens, "please" and "p"
- the fifth token "please pa" comes out as two tokens, "please" and "pa"
- and so on
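To check a single token, you can run the custom_word_delimiter filter on its own over a keyword-tokenized input. This is only a quick sketch, assuming the same 1.x-style _analyze query-string API used above (a tokenizer plus a comma-separated filters parameter):
# keyword keeps 'please pa' as one token; custom_word_delimiter then splits it on the space
curl -XGET 'localhost:9201/test/_analyze?tokenizer=keyword&filters=custom_word_delimiter&pretty' -d 'please pa'
This should come back with the two tokens "please" and "pa", mirroring the fifth bullet above.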
So your left_tokenizer treats the whole sentence as a single input and edge-ngrams it from 5 characters up to 30 characters, which is why it stops at "wins" (that answers question 3).
As you can see above, some tokens are repeated because the word_delimiter token filter processes each token coming out of the tokenizer in isolation, hence the "duplicates" (that answers question 4) and the tokens shorter than 5 characters (that answers question 1).
I don't think this is how you want it to work, but it is not clear from your question how you do want it to work, i.e. what kind of searches you want to be able to run. What I have given here is just an explanation of what you are seeing.
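If the intent behind "longer than 4 chars" is to index per-word prefixes of at least 5 characters, one common pattern is to keep a word-level tokenizer and move the edge-ngramming into a token filter. Treat the following as a sketch under that assumption; the analyzer and filter names (word_edge_ngram, word_edge_ngram_filter) are made up for illustration:
# hypothetical analyzer/filter names, shown only as an example
POST /test/_close
PUT /test/_settings
{
  "analysis": {
    "analyzer": {
      "word_edge_ngram": {
        "type": "custom",
        "tokenizer": "standard",
        "filter": ["lowercase", "word_edge_ngram_filter"]
      }
    },
    "filter": {
      "word_edge_ngram_filter": {
        "type": "edgeNGram",
        "min_gram": 5,
        "max_gram": 30
      }
    }
  }
}
POST /test/_open
Analyzing the sample sentence with this analyzer would give one group of prefixes per word, e.g. pleas/please for "please", multi/multip/multipl/multiple for "multiple" and payme/paymen/payment for "payment". With these settings, words shorter than 5 characters such as "pay", "for" or "wins" produce no ngrams at all, so whether that is acceptable depends on the searches you need.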