通过(painless)脚本进行 ElasticSearch 排序 - 数组值被神秘地去重了
ElasticSearch sort via a (painless) script - array values are mysteriously deduped
我通过 PUT 向 ElasticSearch 写入了以下文档:
{
"_rootId": "327d3aba-4f7c-4abb-9ff3-b1608c354c7c",
"_docId": "ID_3",
"_ver": 0,
"val_labels": [
"x1",
"x1",
"x1"
]
}
然后,我通过 GET 发送以下查询,该查询使用 painless 脚本进行排序:
{
"query": {
"bool": {
"must": [
{
"term": {
"_rootId": "77394e08-32be-4611-bbf7-818dfe4bc853"
}
}
]
}
},
"sort": [
{
"_script": {
"order": "desc",
"type": "string",
"script": {
"lang": "painless",
"source": "return doc['val_labels'].toString()"
}
}
}
]
}
这是我收到的回复:
{
"took": 30,
"timed_out": false,
"_shards": {
"total": 12,
"successful": 12,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": null,
"hits": [
{
"_index": "my-index",
"_type": "views",
"_id": "77394e08-32be-4611-bbf7-818dfe4bc853.ID_3",
"_score": null,
"_source": {
"_rootId": "77394e08-32be-4611-bbf7-818dfe4bc853",
"_docId": "ID_3",
"_ver": 0,
"val_labels": [
"x1",
"x1",
"x1"
]
},
"sort": [
"[x1]"
]
}
]
}
}
奇怪的是,响应中的 val_labels 字段显示 ["x1", "x1", "x1"](符合预期,参见上面写入的文档),而 sort 字段只显示了一个 x1 值。
对此有什么解释吗?
结果中的 _source 字段是原始的、未经修改的文档,而排序脚本访问的是 doc values(即 doc['val_labels']),也就是经过处理后的字段。这一点可以通过显式获取 docvalue_fields 来调试:
{
"query": {
"match_all": {}
},
"docvalue_fields": [
"val_labels"
]
}
产生以下命中(我只索引了一个文档)
{
"hits": [
{
"_index": "test",
"_type": "_doc",
"_id": "ID_3",
"_score": 1,
"_source": {
"val_labels": [
"x1",
"x1",
"x1"
]
},
"fields": {
"val_labels": [
"x1"
]
}
}
]
}
注意结果中的值已被去重。这是因为多个相同的值只会使对应词项的词频(term_freq)增加,而不会被重复保存,这一点可以通过 term vectors 接口查看:
GET /test/_doc/ID_3/_termvectors?fields=val_labels
{
"term_vectors": {
"val_labels": {
"field_statistics": {
"sum_doc_freq": 1,
"doc_count": 1,
"sum_ttf": -1
},
"terms": {
"x1": {
"term_freq": 3,
"tokens": [
{
"position": 0,
"start_offset": 0,
"end_offset": 2
},
{
"position": 1,
"start_offset": 3,
"end_offset": 5
},
{
"position": 2,
"start_offset": 6,
"end_offset": 8
}
]
}
}
}
}
}
我通过 PUT 向 ElasticSearch 写入了以下文档:
{
"_rootId": "327d3aba-4f7c-4abb-9ff3-b1608c354c7c",
"_docId": "ID_3",
"_ver": 0,
"val_labels": [
"x1",
"x1",
"x1"
]
}
然后,我通过 GET 发送以下查询,该查询使用 painless 脚本进行排序:
{
"query": {
"bool": {
"must": [
{
"term": {
"_rootId": "77394e08-32be-4611-bbf7-818dfe4bc853"
}
}
]
}
},
"sort": [
{
"_script": {
"order": "desc",
"type": "string",
"script": {
"lang": "painless",
"source": "return doc['val_labels'].toString()"
}
}
}
]
}
这是我收到的回复:
{
"took": 30,
"timed_out": false,
"_shards": {
"total": 12,
"successful": 12,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": null,
"hits": [
{
"_index": "my-index",
"_type": "views",
"_id": "77394e08-32be-4611-bbf7-818dfe4bc853.ID_3",
"_score": null,
"_source": {
"_rootId": "77394e08-32be-4611-bbf7-818dfe4bc853",
"_docId": "ID_3",
"_ver": 0,
"val_labels": [
"x1",
"x1",
"x1"
]
},
"sort": [
"[x1]"
]
}
]
}
}
奇怪的是,响应中的 val_labels 字段显示 ["x1", "x1", "x1"](符合预期,参见上面写入的文档),而 sort 字段只显示了一个 x1 值。
对此有什么解释吗?
_source 字段是原始的、未经修改的文档,而排序脚本访问的是 doc values(即 doc['val_labels']),也就是经过处理后的字段。这一点可以通过显式获取 docvalue_fields 来调试:
{
"query": {
"match_all": {}
},
"docvalue_fields": [
"val_labels"
]
}
产生以下命中(我只索引了一个文档)
{
"hits": [
{
"_index": "test",
"_type": "_doc",
"_id": "ID_3",
"_score": 1,
"_source": {
"val_labels": [
"x1",
"x1",
"x1"
]
},
"fields": {
"val_labels": [
"x1"
]
}
}
]
}
注意结果中的值已被去重。这是因为多个相同的值只会使对应词项的词频(term_freq)增加,而不会被重复保存,这一点可以通过 term vectors 接口查看:
GET /test/_doc/ID_3/_termvectors?fields=val_labels
{
"term_vectors": {
"val_labels": {
"field_statistics": {
"sum_doc_freq": 1,
"doc_count": 1,
"sum_ttf": -1
},
"terms": {
"x1": {
"term_freq": 3,
"tokens": [
{
"position": 0,
"start_offset": 0,
"end_offset": 2
},
{
"position": 1,
"start_offset": 3,
"end_offset": 5
},
{
"position": 2,
"start_offset": 6,
"end_offset": 8
}
]
}
}
}
}
}