Elasticsearch 映射:如何分析或映射到数字字段?
Elasticsearch mapping: How to analyze or map to numeric fields?
我想将 bibtex 条目的 month
字段索引到 elasticsearch 中,并使其可通过 range
查询进行搜索。这要求基础字段类型是某种数字数据类型。在我的情况下 short
就足够了。
规范形式的 bibtex month
字段需要三个字符的缩写,所以我尝试像这样使用 char_filter
:
...
"char_filter": {
"month_char_filter": {
"type": "mapping",
"mappings": [
"jan => 1",
"feb => 2",
"mar => 3",
...
"nov => 11",
"dec => 12"
]
}
...
"normalizer": {
"month_normalizer": {
"type": "custom",
"char_filter": [ "month_char_filter" ],
},
并建立这样的映射:
...
"month": {
"type": "short",
"normalizer": "month_normalizer"
},
...
但它似乎不起作用,因为 short
类型的字段既不支持这样的规范化器,也不支持分析器。
那么,要怎样才能实现 char_filter
部分所示的映射,从而可以对该字段进行范围查询?
您的方法在直觉上是有道理的,但是,规范化器只能应用于 keyword
字段,而分析器只能应用于 text
字段。
另一种方法是利用 ingest 处理器(ingest processors),使用其中的 script
处理器在索引时进行映射。
您可以在下面找到这样一个 script
处理器的模拟,它将根据 month
字段中出现的月份创建一个名为 monthNum
的新字段。
POST _ingest/pipeline/_simulate
{
"pipeline": {
"processors": [
{
"script": {
"source": """
def mapping = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
ctx.monthNum = mapping.indexOf(ctx.month) + 1;
"""
}
}
]
},
"docs": [
{
"_source": {
"month": "feb"
}
},
{
"_source": {
"month": "mar"
}
},
{
"_source": {
"month": "jul"
}
},
{
"_source": {
"month": "aug"
}
},
{
"_source": {
"month": "nov"
}
},
{
"_source": {
"month": "dec"
}
},
{
"_source": {
"month": "xyz"
}
}
]
}
生成的文档(注意:无法识别的月份,例如 xyz,会得到 monthNum 为 0):
{
"docs" : [
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 2,
"month" : "feb"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
},
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 3,
"month" : "mar"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
},
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 7,
"month" : "jul"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
},
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 8,
"month" : "aug"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
},
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 11,
"month" : "nov"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
},
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 12,
"month" : "dec"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
},
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 0,
"month" : "xyz"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
}
]
}
我想将 bibtex 条目的 month
字段索引到 elasticsearch 中,并使其可通过 range
查询进行搜索。这要求基础字段类型是某种数字数据类型。在我的情况下 short
就足够了。
规范形式的 bibtex month
字段需要三个字符的缩写,所以我尝试像这样使用 char_filter
:
...
"char_filter": {
"month_char_filter": {
"type": "mapping",
"mappings": [
"jan => 1",
"feb => 2",
"mar => 3",
...
"nov => 11",
"dec => 12"
]
}
...
"normalizer": {
"month_normalizer": {
"type": "custom",
"char_filter": [ "month_char_filter" ],
},
并建立这样的映射:
...
"month": {
"type": "short",
"normalizer": "month_normalizer"
},
...
但它似乎不起作用,因为 short
类型的字段既不支持这样的规范化器,也不支持分析器。
那么,要怎样才能实现 char_filter
部分所示的映射,从而可以对该字段进行范围查询?
您的方法在直觉上是有道理的,但是,规范化器只能应用于 keyword
字段,而分析器只能应用于 text
字段。
另一种方法是利用 ingest 处理器(ingest processors),使用其中的 script
处理器在索引时进行映射。
您可以在下面找到这样一个 script
处理器的模拟,它将根据 month
字段中出现的月份创建一个名为 monthNum
的新字段。
POST _ingest/pipeline/_simulate
{
"pipeline": {
"processors": [
{
"script": {
"source": """
def mapping = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
ctx.monthNum = mapping.indexOf(ctx.month) + 1;
"""
}
}
]
},
"docs": [
{
"_source": {
"month": "feb"
}
},
{
"_source": {
"month": "mar"
}
},
{
"_source": {
"month": "jul"
}
},
{
"_source": {
"month": "aug"
}
},
{
"_source": {
"month": "nov"
}
},
{
"_source": {
"month": "dec"
}
},
{
"_source": {
"month": "xyz"
}
}
]
}
生成的文档(注意:无法识别的月份,例如 xyz,会得到 monthNum 为 0):
{
"docs" : [
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 2,
"month" : "feb"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
},
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 3,
"month" : "mar"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
},
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 7,
"month" : "jul"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
},
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 8,
"month" : "aug"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
},
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 11,
"month" : "nov"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
},
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 12,
"month" : "dec"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
},
{
"doc" : {
"_index" : "_index",
"_type" : "_type",
"_id" : "_id",
"_source" : {
"monthNum" : 0,
"month" : "xyz"
},
"_ingest" : {
"timestamp" : "2019-05-08T12:28:27.006Z"
}
}
}
]
}