针对 10 亿个文档的 Couchbase N1QL 索引

Question

我有以下查询，在 Couchbase 企业版（6.0.2 build 2413）中针对大约 10 亿个文档运行。针对此查询，可以创建的性能最佳的索引是什么？（报告需要在特定时间内完成，因此首要目标是让索引带来尽可能快的查询速度）

/* Report query: counts rows per LoggingType / LogJobID.
   Inner query: restricted to LoggingType 3001, 4004 and 6002, grouped by
   (LogFileID, RowKey). For each group, MAX over the array
   [CreateDate, SequenceID, a] picks the entry with the greatest CreateDate,
   then SequenceID; element [2] of that array is the whole document `a`,
   from which LoggingType and LogJobID are projected.
   Outer query: groups those per-(LogFileID, RowKey) rows by LoggingType and
   LogJobID and counts them as AffectedLineCount. */
select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount
from (
    select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID
    from `LogBucket` a
    where LoggingType in [3001, 4004, 6002]
    group by LogFileID, RowKey) as a
group by a.LoggingType, a.LogJobID

我尝试创建了以下索引：

/* Partial index attempted for the report query: limited by its WHERE clause
   to LoggingType 3001, 4004 and 6002, and hash-partitioned on the document id.
   NOTE(review): the leading index key is LogFileID, on which the report query
   has no predicate (it filters only on LoggingType); presumably this is why
   the planner does not select this index -- confirm against the EXPLAIN plan. */
CREATE INDEX `data_job_productivity_index1` 
ON `LogBucket`(`LogFileID`,`RowKey`,`LoggingType`,`LogJobID`,`CreateDate`,`SequenceID`) 
PARTITION BY hash((meta().`id`)) WHERE (`LoggingType` in [3001, 4004, 6002])

但是当我查看 EXPLAIN 的结果时，发现它使用了另一个索引（一个专门为其他报告查询创建的索引）。

{
  "plan": {
    "#operator": "Sequence",
    "~children": [
      {
        "#operator": "Sequence",
        "~children": [
          {
            "#operator": "IndexScan3",
            "as": "a",
            "index": "analyst_log_LogJob_activity",
            "index_id": "f85999b9b7cc0d3f",
            "index_projection": {
              "primary_key": true
            },
            "keyspace": "LogBucket",
            "namespace": "default",
            "spans": [
              {
                "exact": true,
                "range": [
                  {
                    "high": "3001",
                    "inclusion": 3,
                    "low": "3001"
                  }
                ]
              },
              {
                "exact": true,
                "range": [
                  {
                    "high": "4004",
                    "inclusion": 3,
                    "low": "4004"
                  }
                ]
              },
              {
                "exact": true,
                "range": [
                  {
                    "high": "6002",
                    "inclusion": 3,
                    "low": "6002"
                  }
                ]
              }
            ],
            "using": "gsi"
          },
          {
            "#operator": "Fetch",
            "as": "a",
            "keyspace": "LogBucket",
            "namespace": "default"
          },
          {
            "#operator": "Parallel",
            "~child": {
              "#operator": "Sequence",
              "~children": [
                {
                  "#operator": "Filter",
                  "condition": "((`a`.`LoggingType`) in [3001, 4004, 6002])"
                },
                {
                  "#operator": "InitialGroup",
                  "aggregates": [
                    "max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
                  ],
                  "group_keys": [
                    "(`a`.`LogFileID`)",
                    "(`a`.`RowKey`)"
                  ]
                }
              ]
            }
          },
          {
            "#operator": "IntermediateGroup",
            "aggregates": [
              "max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
            ],
            "group_keys": [
              "(`a`.`LogFileID`)",
              "(`a`.`RowKey`)"
            ]
          },
          {
            "#operator": "FinalGroup",
            "aggregates": [
              "max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
            ],
            "group_keys": [
              "(`a`.`LogFileID`)",
              "(`a`.`RowKey`)"
            ]
          },
          {
            "#operator": "Parallel",
            "~child": {
              "#operator": "Sequence",
              "~children": [
                {
                  "#operator": "InitialProject",
                  "result_terms": [
                    {
                      "expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LoggingType`)"
                    },
                    {
                      "expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LogJobID`)"
                    }
                  ]
                },
                {
                  "#operator": "FinalProject"
                }
              ]
            }
          }
        ]
      },
      {
        "#operator": "Alias",
        "as": "a"
      },
      {
        "#operator": "Parallel",
        "~child": {
          "#operator": "Sequence",
          "~children": [
            {
              "#operator": "InitialGroup",
              "aggregates": [
                "count(*)"
              ],
              "group_keys": [
                "(`a`.`LoggingType`)",
                "(`a`.`LogJobID`)"
              ]
            }
          ]
        }
      },
      {
        "#operator": "IntermediateGroup",
        "aggregates": [
          "count(*)"
        ],
        "group_keys": [
          "(`a`.`LoggingType`)",
          "(`a`.`LogJobID`)"
        ]
      },
      {
        "#operator": "FinalGroup",
        "aggregates": [
          "count(*)"
        ],
        "group_keys": [
          "(`a`.`LoggingType`)",
          "(`a`.`LogJobID`)"
        ]
      },
      {
        "#operator": "Parallel",
        "~child": {
          "#operator": "Sequence",
          "~children": [
            {
              "#operator": "InitialProject",
              "result_terms": [
                {
                  "expr": "(`a`.`LogJobID`)"
                },
                {
                  "as": "LoggingTypeID",
                  "expr": "(`a`.`LoggingType`)"
                },
                {
                  "as": "AffectedLineCount",
                  "expr": "count(*)"
                }
              ]
            },
            {
              "#operator": "FinalProject"
            }
          ]
        }
      }
    ]
  },
  "text": "select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount\nfrom (\n    select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID\n    from `LogBucket` a\n    where LoggingType in [3001, 4004, 6002]\n    group by LogFileID, RowKey) as a\ngroup by a.LoggingType, a.LogJobID"
}

它选择使用的索引是这样创建的：

/* Index named in the EXPLAIN plan's IndexScan3 operator. Its leading key is
   LoggingType and it has no WHERE clause, so it is not restricted to the
   three LoggingType values used by the report query. */
CREATE INDEX `analyst_log_LogJob_activity` ON `LogBucket`(`LoggingType`,`LogJobID`) PARTITION BY hash((meta().`id`))

第二个索引的问题在于，它索引了全部 10 亿个文档；而我尝试为这份新报告专门创建的索引，由于带有针对 LoggingType 的 WHERE 子句，所包含的文档会少得多。

Answer 1

您可以按如下方式创建覆盖索引。仅当所有查询都使用相同的 LoggingType 值时，才在索引定义中使用 WHERE 子句。

/* Proposed index for the report query. LoggingType (the field the query
   filters on) is the leading key, followed by the inner query's grouping
   keys (LogFileID, RowKey), the keys compared inside MAX (CreateDate,
   SequenceID) and the projected LogJobID. The WHERE clause limits the index
   to LoggingType 3001, 4004 and 6002; it is hash-partitioned on the
   document id. */
CREATE INDEX `data_job_productivity_index1` ON `LogBucket`
           (`LoggingType`, `LogFileID`,`RowKey`,`CreateDate`,`SequenceID`, `LogJobID`)
PARTITION BY HASH(META().`id`) WHERE LoggingType IN [3001, 4004, 6002];

/* Rewritten report query. Instead of carrying the whole document in the MAX
   array, the inner query carries an object holding only LoggingType (named
   LoggingTypeID) and LogJobID, so every field it references is a key of the
   index data_job_productivity_index1. `[2].*` expands that object's fields
   into the inner result; the outer query then groups by them and counts rows
   as AffectedLineCount. */
SELECT LogJobID, LoggingTypeID, COUNT(1) AS AffectedLineCount
FROM (
    SELECT MAX([CreateDate, SequenceID, {LoggingTypeID:LoggingType,LogJobID} ])[2].*
    FROM `LogBucket` AS a
    WHERE LoggingType IN [3001, 4004, 6002]
    GROUP BY LogFileID, RowKey) AS a
GROUP BY LoggingTypeID, LogJobID;

确保内部子查询被索引覆盖，并且使用了索引聚合：https://blog.couchbase.com/understanding-index-grouping-aggregation-couchbase-n1ql-query/
考虑使用索引副本（index replication）以获得高可用性和更好的性能：https://docs.couchbase.com/server/current/learn/services-and-indexes/indexes/index-replication.html
如果 LoggingType、LogFileID、RowKey 是不可变的，可以考虑用它们代替 META().id 作为分区键：https://blog.couchbase.com/couchbase-gsi-index-partitioning/
https://blog.couchbase.com/create-right-index-get-right-performance/
试用索引顾问（Index Advisor）：https://index-advisor.couchbase.com

针对 10 亿个文档的 Couchbase N1QL 索引

Couchbase N1QL Index for 1 billion documents

couchbase

n1ql