ElasticSearch 词项(terms)聚合在使用自定义分析器和模式分词器时不起作用
ElasticSearch Terms Aggregation not working with custom Analyzer and Pattern Tokenizer
我是第一次尝试术语聚合,我使用的自定义模式分词器似乎有问题。
这是映射:
{
"mappings": {
"properties": {
"contentItemType": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "patternAnalyzer"
},
"theme": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "patternAnalyzer"
}
}
},
"settings": {
"analysis": {
"analyzer": {
"patternAnalyzer": {
"tokenizer": "patternTokenizer"
}
},
"tokenizer": {
"patternTokenizer": {
"type": "pattern",
"pattern": ";"
}
}
}
}
}
当我尝试使用聚合 API http://my_server/index_name/_search 进行搜索时,结果如下:
{
"aggregations": {
"group_by_contentItemType": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Correspondence; Reports",
"doc_count": 3
},
{
"key": "Correspondence",
"doc_count": 2
},
{
"key": "Meeting Minutes; Administrative Records; Reports",
"doc_count": 2
},
{
"key": "Correspondence; Legal and Treaty Material; Reports",
"doc_count": 1
},
{
"key": "Correspondence; Memoranda",
"doc_count": 1
},
{
"key": "Memoranda",
"doc_count": 1
},
{
"key": "Reports",
"doc_count": 1
}
]
},
"group_by_theme": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "International Relations",
"doc_count": 2
},
{
"key": "Key Events; Dissent; Dissent; Resistance; Human Rights",
"doc_count": 2
},
{
"key": "Border Security and Migration; Key Events",
"doc_count": 1
},
{
"key": "Border Security and Migration; Second World War Aftermath",
"doc_count": 1
},
{
"key": "Domestic Politics",
"doc_count": 1
},
{
"key": "Domestic Politics; Border Security and Migration",
"doc_count": 1
},
{
"key": "Economics and Trade; International Relations",
"doc_count": 1
},
{
"key": "Embassy and Consulate Administration; Industry and Agriculture; International Relations",
"doc_count": 1
},
{
"key": "Populations and Social Policy; Second World War Aftermath; International Relations",
"doc_count": 1
}
]
}
}
}
如您所见,聚合存在问题。我已经在这个问题上停留了好几天了。我看过很多例子,但仍然无法解决这个问题。
请帮忙。提前致谢!!!
编辑!!!
这是@CatalinM 回答后的完整映射:
{
"local_cwee": {
"mappings": {
"dynamic": "false",
"properties": {
"author": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"commentaries": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"contentDateEndMonth": {
"type": "integer"
},
"contentDateEndSpecified": {
"type": "boolean"
},
"contentDateEndYear": {
"type": "integer"
},
"contentDateMonth": {
"type": "integer"
},
"contentDateMonthSpecified": {
"type": "boolean"
},
"contentDateStartMonth": {
"type": "integer"
},
"contentDateStartSpecified": {
"type": "boolean"
},
"contentDateStartYear": {
"type": "integer"
},
"contentDateYear": {
"type": "integer"
},
"contentDoi": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"contentItemType": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"contentItemTypeFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"contentTitle": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"copyrightNotices": {
"type": "nested",
"properties": {
"imageName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"text": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"countries": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"country": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"coverDateEndMonth": {
"type": "integer"
},
"coverDateEndSpecified": {
"type": "boolean"
},
"coverDateEndYear": {
"type": "integer"
},
"coverDateMonth": {
"type": "integer"
},
"coverDateMonthSpecified": {
"type": "boolean"
},
"coverDateStartMonth": {
"type": "integer"
},
"coverDateStartSpecified": {
"type": "boolean"
},
"coverDateStartYear": {
"type": "integer"
},
"coverDateYear": {
"type": "integer"
},
"displayName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"documentDoi": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"documentLevel": {
"type": "integer"
},
"keyEvents": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"language": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"languageFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"languages": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"languagesFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"moduleNumber": {
"type": "integer"
},
"notes": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"pageTranscript": {
"type": "text",
"term_vector": "with_positions",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "whiteSpaceAnalyzer"
},
"people": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"publicationDate": {
"type": "integer"
},
"publicationDateEndMonth": {
"type": "integer"
},
"publicationDateEndSpecified": {
"type": "boolean"
},
"publicationDateEndYear": {
"type": "integer"
},
"publicationDateMonth": {
"type": "integer"
},
"publicationDateMonthSpecified": {
"type": "boolean"
},
"publicationDateStartMonth": {
"type": "integer"
},
"publicationDateStartSpecified": {
"type": "boolean"
},
"publicationDateStartYear": {
"type": "integer"
},
"publicationDateYear": {
"type": "integer"
},
"publicationDoi": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"publicationId": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publicationIdFacet": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publicationTitle": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"publicationType": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publicationTypeFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publicationYear": {
"type": "integer"
},
"publisherName": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publisherNameFacet": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subject": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectAreas": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectAreasFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectCountries": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectCountriesFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectKeyword": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectKeywordFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subthemeFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subthemes": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"theme": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"themeFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"themes": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
}
}
}
}
}
使用您的自定义分词器,文本字段中的词元(token)为 "Correspondence"、"Meeting Minutes"、"Administrative Records" 等,所以我认为您不需要 keyword 子字段。
要使聚合在文本字段上起作用,您必须在映射中添加 "fielddata": true。该选项默认是禁用的,因为通常不需要对大文本字段进行聚合,但在您的情况下,这些词元正是您要聚合的值。
这里是简化配置
{
"mappings": {
"properties": {
"contentItemType": {
"type": "text",
"fielddata": true,
"analyzer": "patternAnalyzer"
}
}
},
"settings": {
"analysis": {
"analyzer": {
"patternAnalyzer": {
"tokenizer": "patternTokenizer"
}
},
"tokenizer": {
"patternTokenizer": {
"type": "pattern",
"pattern": ";"
}
}
}
}
}
查询:
{
"aggregations" : {
"test" : {
"terms" : { "field" : "contentItemType" }
}
}
}
结果(注意:部分键带有前导空格,因为模式 ";" 只匹配分号本身,不会去掉其后的空格;可将模式改为 ";\\s*",或在分析器中添加 trim 词元过滤器来去除):
"aggregations": {
"test": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": " Administrative Records",
"doc_count": 1
},
{
"key": "Meeting Minutes",
"doc_count": 1
},
{
"key": " Reports",
"doc_count": 1
}
]
}
}
我是第一次尝试术语聚合,我使用的自定义模式分词器似乎有问题。
这是映射:
{
"mappings": {
"properties": {
"contentItemType": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "patternAnalyzer"
},
"theme": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "patternAnalyzer"
}
}
},
"settings": {
"analysis": {
"analyzer": {
"patternAnalyzer": {
"tokenizer": "patternTokenizer"
}
},
"tokenizer": {
"patternTokenizer": {
"type": "pattern",
"pattern": ";"
}
}
}
}
}
当我尝试使用聚合 API http://my_server/index_name/_search 进行搜索时,结果如下:
{
"aggregations": {
"group_by_contentItemType": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Correspondence; Reports",
"doc_count": 3
},
{
"key": "Correspondence",
"doc_count": 2
},
{
"key": "Meeting Minutes; Administrative Records; Reports",
"doc_count": 2
},
{
"key": "Correspondence; Legal and Treaty Material; Reports",
"doc_count": 1
},
{
"key": "Correspondence; Memoranda",
"doc_count": 1
},
{
"key": "Memoranda",
"doc_count": 1
},
{
"key": "Reports",
"doc_count": 1
}
]
},
"group_by_theme": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "International Relations",
"doc_count": 2
},
{
"key": "Key Events; Dissent; Dissent; Resistance; Human Rights",
"doc_count": 2
},
{
"key": "Border Security and Migration; Key Events",
"doc_count": 1
},
{
"key": "Border Security and Migration; Second World War Aftermath",
"doc_count": 1
},
{
"key": "Domestic Politics",
"doc_count": 1
},
{
"key": "Domestic Politics; Border Security and Migration",
"doc_count": 1
},
{
"key": "Economics and Trade; International Relations",
"doc_count": 1
},
{
"key": "Embassy and Consulate Administration; Industry and Agriculture; International Relations",
"doc_count": 1
},
{
"key": "Populations and Social Policy; Second World War Aftermath; International Relations",
"doc_count": 1
}
]
}
}
}
如您所见,聚合存在问题。我已经在这个问题上停留了好几天了。我看过很多例子,但仍然无法解决这个问题。 请帮忙。提前致谢!!!
编辑!!! 这是@CatalinM 回答后的完整映射:
{
"local_cwee": {
"mappings": {
"dynamic": "false",
"properties": {
"author": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"commentaries": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"contentDateEndMonth": {
"type": "integer"
},
"contentDateEndSpecified": {
"type": "boolean"
},
"contentDateEndYear": {
"type": "integer"
},
"contentDateMonth": {
"type": "integer"
},
"contentDateMonthSpecified": {
"type": "boolean"
},
"contentDateStartMonth": {
"type": "integer"
},
"contentDateStartSpecified": {
"type": "boolean"
},
"contentDateStartYear": {
"type": "integer"
},
"contentDateYear": {
"type": "integer"
},
"contentDoi": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"contentItemType": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"contentItemTypeFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"contentTitle": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"copyrightNotices": {
"type": "nested",
"properties": {
"imageName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"text": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"countries": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"country": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"coverDateEndMonth": {
"type": "integer"
},
"coverDateEndSpecified": {
"type": "boolean"
},
"coverDateEndYear": {
"type": "integer"
},
"coverDateMonth": {
"type": "integer"
},
"coverDateMonthSpecified": {
"type": "boolean"
},
"coverDateStartMonth": {
"type": "integer"
},
"coverDateStartSpecified": {
"type": "boolean"
},
"coverDateStartYear": {
"type": "integer"
},
"coverDateYear": {
"type": "integer"
},
"displayName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"documentDoi": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"documentLevel": {
"type": "integer"
},
"keyEvents": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"language": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"languageFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"languages": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"languagesFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"moduleNumber": {
"type": "integer"
},
"notes": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"pageTranscript": {
"type": "text",
"term_vector": "with_positions",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "whiteSpaceAnalyzer"
},
"people": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"publicationDate": {
"type": "integer"
},
"publicationDateEndMonth": {
"type": "integer"
},
"publicationDateEndSpecified": {
"type": "boolean"
},
"publicationDateEndYear": {
"type": "integer"
},
"publicationDateMonth": {
"type": "integer"
},
"publicationDateMonthSpecified": {
"type": "boolean"
},
"publicationDateStartMonth": {
"type": "integer"
},
"publicationDateStartSpecified": {
"type": "boolean"
},
"publicationDateStartYear": {
"type": "integer"
},
"publicationDateYear": {
"type": "integer"
},
"publicationDoi": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"publicationId": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publicationIdFacet": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publicationTitle": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"publicationType": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publicationTypeFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publicationYear": {
"type": "integer"
},
"publisherName": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publisherNameFacet": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subject": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectAreas": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectAreasFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectCountries": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectCountriesFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectKeyword": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectKeywordFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subthemeFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subthemes": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"theme": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"themeFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"themes": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
}
}
}
}
}
使用您的自定义分词器,文本字段中的词元(token)为 "Correspondence"、"Meeting Minutes"、"Administrative Records" 等,所以我认为您不需要 keyword 子字段。
要使聚合在文本字段上起作用,您必须在映射中添加 "fielddata": true。该选项默认是禁用的,因为通常不需要对大文本字段进行聚合,但在您的情况下,这些词元正是您要聚合的值。
这里是简化配置
{
"mappings": {
"properties": {
"contentItemType": {
"type": "text",
"fielddata": true,
"analyzer": "patternAnalyzer"
}
}
},
"settings": {
"analysis": {
"analyzer": {
"patternAnalyzer": {
"tokenizer": "patternTokenizer"
}
},
"tokenizer": {
"patternTokenizer": {
"type": "pattern",
"pattern": ";"
}
}
}
}
}
查询:
{
"aggregations" : {
"test" : {
"terms" : { "field" : "contentItemType" }
}
}
}
结果(注意:部分键带有前导空格,因为模式 ";" 只匹配分号本身,不会去掉其后的空格;可将模式改为 ";\\s*",或在分析器中添加 trim 词元过滤器来去除):
"aggregations": {
"test": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": " Administrative Records",
"doc_count": 1
},
{
"key": "Meeting Minutes",
"doc_count": 1
},
{
"key": " Reports",
"doc_count": 1
}
]
}
}