ElasticSearch 7 索引与 ElasticSearch 5 相比太大了

ElasticSearch 7 Index way too big vs ElasticSearch 5

我们正在从 ElasticSearch 5.6 迁移到 7.9。在 5.6 上,我们有 2 个索引,一个包含 3.4k 个文档,占 111.2 MB,另一个包含 81.6k 个文档,占 845.6 MB。在 7.9 上,我们有相同的 2 个索引(由相同的进程写入),映射也相似,但它们分别占用了 14.3 GB 和 15.6 GB。

我不明白是什么让这些索引在 7.9 上比在 5.6 上大这么多。

如果你很好奇,下面是映射(我混淆了许多字段的名称以保护我们的数据): ES 5.6

{
  "blah-state-37c088aea98d4b60ad58fb04abe55aa7": {
    "mappings": {
      "blahblah": {
        "properties": {
          "blahStatus": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword",
                "ignore_above": 256
              }
            }
          },
          "blah": {
            "type": "integer"
          },
          "blahblah": {
            "type": "long"
          },
          "blahblahblah": {
            "type": "text"
          },
          "blahblahblahblah": {
            "type": "integer"
          },
          "blahblahblahzzz": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword"
              }
            },
            "analyzer": "ngram_suggest"
          },
          "blahblahblahhh": {
            "type": "text",
            "index": false,
            "store": true
          },
          "blahblahblaaaa": {
            "type": "keyword"
          },
          "created": {
            "type": "text"
          },
          "ended": {
            "type": "text"
          },
          "blaaaaah": {
            "type": "boolean"
          },
          "blaahaah": {
            "type": "integer"
          },
          "bloop": {
            "type": "boolean"
          },
          "bloopibob": {
            "type": "integer"
          },
          "blabiba": {
            "type": "keyword"
          },
          "blah": {
            "type": "long"
          },
          "bleeeep": {
            "type": "boolean"
          },
          "blahhh": {
            "type": "boolean"
          },
          "blahah": {
            "type": "text"
          },
          "hidden": {
            "type": "boolean"
          },
          "blah1": {
            "type": "boolean"
          },
          "blah2": {
            "type": "boolean"
          },
          "blah3": {
            "type": "boolean"
          },
          "blah4": {
            "type": "boolean"
          },
          "blah5": {
            "type": "boolean"
          },
          "blah6": {
            "type": "boolean"
          },
          "blah7": {
            "type": "boolean"
          },
          "blah8": {
            "type": "boolean"
          },
          "blah9": {
            "type": "boolean"
          },
          "blah10": {
            "type": "boolean"
          },
          "blah11": {
            "type": "boolean"
          },
          "blah12": {
            "type": "boolean"
          },
          "blah13": {
            "type": "boolean"
          },
          "isInvalid": {
            "type": "boolean"
          },
          "blah14": {
            "type": "boolean"
          },
          "isNew": {
            "type": "boolean"
          },
          "blah15": {
            "type": "boolean"
          },
          "keywords": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword"
              }
            },
            "analyzer": "normalized"
          },
          "languages": {
            "type": "keyword"
          },
          "blah16": {
            "type": "integer"
          },
          "blah17": {
            "type": "integer"
          },
          "blah18": {
            "type": "keyword"
          },
          "maxWait": {
            "type": "integer"
          },
          "minBuyIn": {
            "type": "float"
          },
          "nickname": {
            "type": "text",
            "fields": {
              "raw": {
                "type": "keyword"
              }
            },
            "analyzer": "normalized"
          },
          "nicknamePartial": {
            "type": "text",
            "analyzer": "ngram_partial"
          },
          "nicknameSuggest": {
            "type": "text",
            "analyzer": "ngram_suggest"
          },
          "blah19": {
            "type": "text"
          },
          "blah20": {
            "type": "boolean"
          },
          "DocumentID": {
            "type": "keyword"
          },
          "pledgedAmt": {
            "type": "float"
          },
          "preferredLanguage": {
            "type": "text"
          },
          "blah21": {
            "type": "integer"
          },
          "blah22": {
            "type": "integer"
          },
          "rating": {
            "type": "integer"
          },
          "region": {
            "type": "keyword"
          },
          "requestedAmt": {
            "type": "float"
          },
          "showInFreeAreas": {
            "type": "boolean"
          },
          "blah23": {
            "type": "boolean"
          },
          "blah24": {
            "type": "text"
          },
          "blah25": {
            "type": "scaled_float",
            "scaling_factor": 100000
          },
          "sortScore": {
            "type": "long"
          },
          "started": {
            "type": "text"
          },
          "statusKey": {
            "type": "text"
          },
          "blah26": {
            "type": "long"
          },
          "blah27": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword",
                "ignore_above": 256
              }
            }
          },
          "tagName": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword"
              }
            },
            "analyzer": "normalized"
          },
          "tagNameRaw": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword"
              }
            },
            "analyzer": "normalized"
          },
          "tagNameSuggest": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword"
              }
            },
            "analyzer": "ngram_suggest"
          },
          "blah28": {
            "type": "boolean"
          },
          "traceId": {
            "type": "object",
            "enabled": false
          },
          "updated": {
            "type": "long"
          },
          "blah29": {
            "type": "boolean"
          }
        }
      }
    }
  }
}

和 7.9

{
  "blah-state-37c088aea98d4b60ad58fb04abe55aa7" : {
    "mappings" : {
      "properties" : {
        "accountStatus" : {
          "type" : "keyword"
        },
        "boost" : {
          "type" : "integer"
        },
        "age" : {
          "type" : "integer"
        },
        "bleeeeeep" : {
          "type" : "keyword"
        },
        "bleeeep" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword"
            }
          },
          "analyzer" : "ngram_suggest"
        },
        "bleeep" : {
          "type" : "keyword"
        },
        "bleep" : {
          "type" : "keyword"
        },
        "blah0" : {
          "type" : "boolean"
        },
        "blah1" : {
          "type" : "boolean"
        },
        "blah2" : {
          "type" : "text"
        },
        "hidden" : {
          "type" : "boolean"
        },
        "blah3" : {
          "type" : "boolean"
        },
        "blah4" : {
          "type" : "boolean"
        },
        "blah5" : {
          "type" : "boolean"
        },
        "blah6" : {
          "type" : "boolean"
        },
        "blah7" : {
          "type" : "boolean"
        },
        "blah8" : {
          "type" : "boolean"
        },
        "blah9" : {
          "type" : "boolean"
        },
        "blah10" : {
          "type" : "boolean"
        },
        "blah11" : {
          "type" : "boolean"
        },
        "blah12" : {
          "type" : "boolean"
        },
        "blah13" : {
          "type" : "boolean"
        },
        "blah14" : {
          "type" : "boolean"
        },
        "blah15" : {
          "type" : "boolean"
        },
        "blah16" : {
          "type" : "boolean"
        },
        "isNew" : {
          "type" : "boolean"
        },
        "blah17" : {
          "type" : "boolean"
        },
        "keywords" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword"
            }
          },
          "analyzer" : "normalized"
        },
        "languages" : {
          "type" : "keyword"
        },
        "blah18" : {
          "type" : "integer"
        },
        "blah19" : {
          "type" : "integer"
        },
        "nickname" : {
          "type" : "text",
          "fields" : {
            "raw" : {
              "type" : "keyword"
            }
          },
          "analyzer" : "normalized"
        },
        "nicknamePartial" : {
          "type" : "text",
          "analyzer" : "ngram_partial"
        },
        "nicknameSuggest" : {
          "type" : "text",
          "analyzer" : "ngram_suggest"
        },
        "blah20" : {
          "type" : "boolean"
        },
        "blah21" : {
          "type" : "boolean"
        },
        "DocumentId" : {
          "type" : "keyword"
        },
        "preferredLanguage" : {
          "type" : "keyword"
        },
        "rating" : {
          "type" : "integer"
        },
        "region" : {
          "type" : "keyword"
        },
        "blah22" : {
          "type" : "boolean"
        },
        "blah23" : {
          "type" : "boolean"
        },
        "blah24" : {
          "type" : "scaled_float",
          "scaling_factor" : 100000.0
        },
        "sortScore" : {
          "type" : "integer"
        },
        "blah25" : {
          "type" : "keyword"
        },
        "tagName" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword"
            }
          },
          "analyzer" : "normalized"
        },
        "tagNameRaw" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword"
            }
          },
          "analyzer" : "normalized"
        },
        "tagNameSuggest" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword"
            }
          },
          "analyzer" : "ngram_suggest"
        },
        "blah26" : {
          "type" : "boolean"
        },
        "traceId" : {
          "type" : "object",
          "enabled" : false
        },
        "updated" : {
          "type" : "long"
        },
        "blah27" : {
          "type" : "boolean"
        }
      } 
    }
  }
}

编辑:这里是设置: 5.6:

"settings": {
      "index": {
        "analysis": {
          "filter": {
            "english_stemmer": {
              "type": "stemmer",
              "language": "english"
            }
          },
          "analyzer": {
            "ngram_partial": {
              "filter": [
                "standard",
                "asciifolding",
                "lowercase"
              ],
              "tokenizer": "ngram"
            },
            "ngram_suggest": {
              "filter": [
                "standard",
                "asciifolding",
                "lowercase"
              ],
              "tokenizer": "edge_ngram"
            },
            "normalized": {
              "filter": [
                "standard",
                "asciifolding",
                "lowercase",
                "english_stemmer"
              ],
              "type": "custom",
              "tokenizer": "standard"
            }
          },
          "tokenizer": {
            "edge_ngram": {
              "token_chars": [
                "letter",
                "digit",
                "punctuation"
              ],
              "min_gram": "1",
              "type": "edge_ngram",
              "max_gram": "20"
            },
            "ngram": {
              "token_chars": [
                "letter",
                "digit",
                "punctuation"
              ],
              "min_gram": "2",
              "type": "ngram",
              "max_gram": "20"
            }
          }
        },
        "number_of_shards": "12"
      }
    }

和 7.9:

"settings" : {
      "index" : {
        "analysis" : {
          "filter" : {
            "english_stemmer" : {
              "type" : "stemmer",
              "language" : "english"
            }
          },
          "analyzer" : {
            "ngram_partial" : {
              "filter" : [
                "asciifolding",
                "lowercase"
              ],
              "tokenizer" : "ngram"
            },
            "ngram_suggest" : {
              "filter" : [
                "asciifolding",
                "lowercase"
              ],
              "tokenizer" : "edge_ngram"
            },
            "normalized" : {
              "filter" : [
                "asciifolding",
                "lowercase",
                "english_stemmer"
              ],
              "type" : "custom",
              "tokenizer" : "standard"
            }
          },
          "tokenizer" : {
            "edge_ngram" : {
              "token_chars" : [
                "letter",
                "digit",
                "punctuation"
              ],
              "min_gram" : "1",
              "type" : "edge_ngram",
              "max_gram" : "20"
            },
            "ngram" : {
              "token_chars" : [
                "letter",
                "digit",
                "punctuation"
              ],
              "min_gram" : "3",
              "type" : "ngram",
              "max_gram" : "3"
            }
          }
        },
        "number_of_shards" : "12"
      }
    }

_cat/shards 的结果 5.6:

redacted-state-37c088aea98d4b60ad58fb04abe55aa7 1     p      STARTED  960   8mb 000.00.000.84 host5
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 1     r      STARTED  960   8mb 000.00.000.89 host10
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 1     r      STARTED  960 8.1mb 000.00.000.80 host1
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 1     r      STARTED  960 7.7mb 000.00.000.86 host7
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 2     r      STARTED  978 9.2mb 000.00.000.90 host11
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 2     r      STARTED  978 8.9mb 000.00.000.81 host2
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 2     p      STARTED  978 8.7mb 000.00.000.87 host8
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 2     r      STARTED  978 8.6mb 000.00.000.83 host4
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 0     p      STARTED  990 8.1mb 000.00.000.85 host6
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 0     r      STARTED  990 7.6mb 000.00.000.91 host12
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 0     r      STARTED  990 8.5mb 000.00.000.88 host9
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 0     r      STARTED  990 7.9mb 000.00.000.82 host3

和 7.9:

redacted-state-37c088aea98d4b60ad58fb04abe55aa7 8     p      STARTED  262 673.4mb 000.00.000.126 host12
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 8     r      STARTED  286 667.8mb 000.00.000.124 host10
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 9     p      STARTED  278 754.9mb 000.00.000.124 host10
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 9     r      STARTED  196 729.7mb 000.00.000.123 host9
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 7     p      STARTED  247 654.2mb 000.00.000.119 host5
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 7     r      STARTED  262 645.1mb 000.00.000.126 host12
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 4     p      STARTED  225 719.8mb 000.00.000.121 host7
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 4     r      STARTED  282 660.9mb 000.00.000.122 host8
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 6     p      STARTED  274 715.6mb 000.00.000.125 host11
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 6     r      STARTED  334 706.3mb 000.00.000.119 host5
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 11    r      STARTED  194 691.6mb 000.00.000.120 host6
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 11    p      STARTED  255 713.1mb 000.00.000.115 host1
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 3     p      STARTED  212 716.6mb 000.00.000.117 host3
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 3     r      STARTED  292 709.3mb 000.00.000.121 host7
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 1     p      STARTED  249 749.5mb 000.00.000.118 host4
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 1     r      STARTED  289 695.5mb 000.00.000.116 host2
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 5     p      STARTED  243 701.4mb 000.00.000.122 host8
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 5     r      STARTED  204 680.9mb 000.00.000.125 host11
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 2     p      STARTED  246 685.8mb 000.00.000.116 host2
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 2     r      STARTED  305 676.7mb 000.00.000.117 host3
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 10    p      STARTED  235 701.2mb 000.00.000.123 host9
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 10    r      STARTED  276 690.5mb 000.00.000.115 host1
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 0     p      STARTED  245 674.7mb 000.00.000.120 host6
redacted-state-37c088aea98d4b60ad58fb04abe55aa7 0     r      STARTED  301 623.5mb 000.00.000.118 host4

经过多次试验,我们确定原因是软删除(soft deletes)。不幸的是,禁用软删除的做法已被弃用,因此这在将来仍会是我们的一个问题。