针对 10 亿个文档的 Couchbase N1QL 索引
Couchbase N1QL Index for 1 billion documents
我有以下查询,需要在 Couchbase 企业版(6.0.2 build 2413)中针对大约 10 亿个文档运行。针对这个查询,应该创建怎样的索引才能获得最高的性能?(希望报告能在限定的时间内完成,因此以查询速度最快为首要目标)
/* For every (LogFileID, RowKey) group, pick the document with the greatest
   [CreateDate, SequenceID] pair, then count those picks per LoggingType / LogJobID. */
SELECT
    a.LogJobID,
    a.LoggingType AS LoggingTypeID,
    COUNT(*) AS AffectedLineCount
FROM (
    /* The third array element is the whole document `a`, so slot [2] of the
       winning array exposes that document's LoggingType and LogJobID. */
    SELECT
        MAX([a.CreateDate, a.SequenceID, a])[2].LoggingType,
        MAX([a.CreateDate, a.SequenceID, a])[2].LogJobID
    FROM `LogBucket` AS a
    WHERE a.LoggingType IN [3001, 4004, 6002]
    GROUP BY a.LogFileID, a.RowKey
) AS a
GROUP BY a.LoggingType, a.LogJobID
我尝试创建了以下索引:
/* Partial, hash-partitioned index restricted to the three LoggingType values
   the report filters on.
   NOTE(review): the leading key is LogFileID, while the report query only has
   a predicate on LoggingType -- likely why EXPLAIN picks another index; confirm. */
CREATE INDEX `data_job_productivity_index1` ON `LogBucket` (
    `LogFileID`,
    `RowKey`,
    `LoggingType`,
    `LogJobID`,
    `CreateDate`,
    `SequenceID`
)
PARTITION BY HASH(META().`id`)
WHERE `LoggingType` IN [3001, 4004, 6002]
但是当我查看 EXPLAIN 的输出时,它使用的却是另一个索引(一个专门为其他报告查询创建的索引)。
{
"plan": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Sequence",
"~children": [
{
"#operator": "IndexScan3",
"as": "a",
"index": "analyst_log_LogJob_activity",
"index_id": "f85999b9b7cc0d3f",
"index_projection": {
"primary_key": true
},
"keyspace": "LogBucket",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"high": "3001",
"inclusion": 3,
"low": "3001"
}
]
},
{
"exact": true,
"range": [
{
"high": "4004",
"inclusion": 3,
"low": "4004"
}
]
},
{
"exact": true,
"range": [
{
"high": "6002",
"inclusion": 3,
"low": "6002"
}
]
}
],
"using": "gsi"
},
{
"#operator": "Fetch",
"as": "a",
"keyspace": "LogBucket",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Filter",
"condition": "((`a`.`LoggingType`) in [3001, 4004, 6002])"
},
{
"#operator": "InitialGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
}
]
}
},
{
"#operator": "IntermediateGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
},
{
"#operator": "FinalGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LoggingType`)"
},
{
"expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LogJobID`)"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
{
"#operator": "Alias",
"as": "a"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
}
]
}
},
{
"#operator": "IntermediateGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
},
{
"#operator": "FinalGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "(`a`.`LogJobID`)"
},
{
"as": "LoggingTypeID",
"expr": "(`a`.`LoggingType`)"
},
{
"as": "AffectedLineCount",
"expr": "count(*)"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
"text": "select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount\nfrom (\n select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID\n from `LogBucket` a\n where LoggingType in [3001, 4004, 6002]\n group by LogFileID, RowKey) as a\ngroup by a.LoggingType, a.LogJobID"
}
它选择使用的索引是这样创建的:
/* Index the planner actually chose: keyed on LoggingType then LogJobID,
   hash-partitioned by document id, with no WHERE clause (not a partial index). */
CREATE INDEX `analyst_log_LogJob_activity` ON `LogBucket` (
    `LoggingType`,
    `LogJobID`
)
PARTITION BY HASH(META().`id`)
第二个索引的问题在于,它索引了全部 10 亿个文档;而我为这份新报告专门创建的索引,由于带有针对 LoggingType 的 WHERE 子句,所包含的文档会明显少得多。
您可以按如下方式创建覆盖索引。只有当所有查询都使用相同的 LoggingType 值时,才应在索引定义中加上 WHERE 子句。
/* Suggested covering index: LoggingType leads so the query's IN predicate can
   drive the scan; the remaining keys are the grouping fields (LogFileID, RowKey)
   and the values read by the subquery (CreateDate, SequenceID, LogJobID).
   The WHERE clause keeps only the three LoggingType values in the index. */
CREATE INDEX `data_job_productivity_index1` ON `LogBucket` (
    `LoggingType`,
    `LogFileID`,
    `RowKey`,
    `CreateDate`,
    `SequenceID`,
    `LogJobID`
)
PARTITION BY HASH(META().`id`)
WHERE `LoggingType` IN [3001, 4004, 6002];
/* Count, per (LoggingTypeID, LogJobID), the (LogFileID, RowKey) groups whose
   latest entry carries that type and job. "Latest" means the greatest
   [CreateDate, SequenceID] pair within the group. */
SELECT
    a.LogJobID,
    a.LoggingTypeID,
    COUNT(1) AS AffectedLineCount
FROM (
    /* The third array element is a small payload object built only from
       indexed fields, so the subquery never needs the full document.
       The key "LoggingTypeID" is a quoted string literal on purpose: in an
       object constructor the name is an expression, so an unquoted
       LoggingTypeID would be resolved as a document field instead of being
       used as the attribute name. */
    SELECT
        MAX([
            a.CreateDate,
            a.SequenceID,
            {"LoggingTypeID": a.LoggingType, "LogJobID": a.LogJobID}
        ])[2].*
    FROM `LogBucket` AS a
    WHERE a.LoggingType IN [3001, 4004, 6002]
    GROUP BY a.LogFileID, a.RowKey
) AS a
GROUP BY a.LoggingTypeID, a.LogJobID;
请通过 EXPLAIN 确认内部子查询已被索引覆盖,并且使用了索引聚合(index aggregation)
https://blog.couchbase.com/understanding-index-grouping-aggregation-couchbase-n1ql-query/
可以考虑使用索引副本(index replication),以获得高可用性和更好的性能
https://docs.couchbase.com/server/current/learn/services-and-indexes/indexes/index-replication.html
如果 LoggingType、LogFileID、RowKey 的值不会改变(不可变),可以考虑用它们作为分区键,
而不是 META().id
https://blog.couchbase.com/couchbase-gsi-index-partitioning/
https://blog.couchbase.com/create-right-index-get-right-performance/
我有以下查询,需要在 Couchbase 企业版(6.0.2 build 2413)中针对大约 10 亿个文档运行。针对这个查询,应该创建怎样的索引才能获得最高的性能?(希望报告能在限定的时间内完成,因此以查询速度最快为首要目标)
/* For every (LogFileID, RowKey) group, pick the document with the greatest
   [CreateDate, SequenceID] pair, then count those picks per LoggingType / LogJobID. */
SELECT
    a.LogJobID,
    a.LoggingType AS LoggingTypeID,
    COUNT(*) AS AffectedLineCount
FROM (
    /* The third array element is the whole document `a`, so slot [2] of the
       winning array exposes that document's LoggingType and LogJobID. */
    SELECT
        MAX([a.CreateDate, a.SequenceID, a])[2].LoggingType,
        MAX([a.CreateDate, a.SequenceID, a])[2].LogJobID
    FROM `LogBucket` AS a
    WHERE a.LoggingType IN [3001, 4004, 6002]
    GROUP BY a.LogFileID, a.RowKey
) AS a
GROUP BY a.LoggingType, a.LogJobID
我尝试创建了以下索引:
/* Partial, hash-partitioned index restricted to the three LoggingType values
   the report filters on.
   NOTE(review): the leading key is LogFileID, while the report query only has
   a predicate on LoggingType -- likely why EXPLAIN picks another index; confirm. */
CREATE INDEX `data_job_productivity_index1` ON `LogBucket` (
    `LogFileID`,
    `RowKey`,
    `LoggingType`,
    `LogJobID`,
    `CreateDate`,
    `SequenceID`
)
PARTITION BY HASH(META().`id`)
WHERE `LoggingType` IN [3001, 4004, 6002]
但是当我查看 EXPLAIN 的输出时,它使用的却是另一个索引(一个专门为其他报告查询创建的索引)。
{
"plan": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Sequence",
"~children": [
{
"#operator": "IndexScan3",
"as": "a",
"index": "analyst_log_LogJob_activity",
"index_id": "f85999b9b7cc0d3f",
"index_projection": {
"primary_key": true
},
"keyspace": "LogBucket",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"high": "3001",
"inclusion": 3,
"low": "3001"
}
]
},
{
"exact": true,
"range": [
{
"high": "4004",
"inclusion": 3,
"low": "4004"
}
]
},
{
"exact": true,
"range": [
{
"high": "6002",
"inclusion": 3,
"low": "6002"
}
]
}
],
"using": "gsi"
},
{
"#operator": "Fetch",
"as": "a",
"keyspace": "LogBucket",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Filter",
"condition": "((`a`.`LoggingType`) in [3001, 4004, 6002])"
},
{
"#operator": "InitialGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
}
]
}
},
{
"#operator": "IntermediateGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
},
{
"#operator": "FinalGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LoggingType`)"
},
{
"expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LogJobID`)"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
{
"#operator": "Alias",
"as": "a"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
}
]
}
},
{
"#operator": "IntermediateGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
},
{
"#operator": "FinalGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "(`a`.`LogJobID`)"
},
{
"as": "LoggingTypeID",
"expr": "(`a`.`LoggingType`)"
},
{
"as": "AffectedLineCount",
"expr": "count(*)"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
"text": "select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount\nfrom (\n select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID\n from `LogBucket` a\n where LoggingType in [3001, 4004, 6002]\n group by LogFileID, RowKey) as a\ngroup by a.LoggingType, a.LogJobID"
}
它选择使用的索引是这样创建的:
/* Index the planner actually chose: keyed on LoggingType then LogJobID,
   hash-partitioned by document id, with no WHERE clause (not a partial index). */
CREATE INDEX `analyst_log_LogJob_activity` ON `LogBucket` (
    `LoggingType`,
    `LogJobID`
)
PARTITION BY HASH(META().`id`)
第二个索引的问题在于,它索引了全部 10 亿个文档;而我为这份新报告专门创建的索引,由于带有针对 LoggingType 的 WHERE 子句,所包含的文档会明显少得多。
您可以按如下方式创建覆盖索引。只有当所有查询都使用相同的 LoggingType 值时,才应在索引定义中加上 WHERE 子句。
/* Suggested covering index: LoggingType leads so the query's IN predicate can
   drive the scan; the remaining keys are the grouping fields (LogFileID, RowKey)
   and the values read by the subquery (CreateDate, SequenceID, LogJobID).
   The WHERE clause keeps only the three LoggingType values in the index. */
CREATE INDEX `data_job_productivity_index1` ON `LogBucket` (
    `LoggingType`,
    `LogFileID`,
    `RowKey`,
    `CreateDate`,
    `SequenceID`,
    `LogJobID`
)
PARTITION BY HASH(META().`id`)
WHERE `LoggingType` IN [3001, 4004, 6002];
/* Count, per (LoggingTypeID, LogJobID), the (LogFileID, RowKey) groups whose
   latest entry carries that type and job. "Latest" means the greatest
   [CreateDate, SequenceID] pair within the group. */
SELECT
    a.LogJobID,
    a.LoggingTypeID,
    COUNT(1) AS AffectedLineCount
FROM (
    /* The third array element is a small payload object built only from
       indexed fields, so the subquery never needs the full document.
       The key "LoggingTypeID" is a quoted string literal on purpose: in an
       object constructor the name is an expression, so an unquoted
       LoggingTypeID would be resolved as a document field instead of being
       used as the attribute name. */
    SELECT
        MAX([
            a.CreateDate,
            a.SequenceID,
            {"LoggingTypeID": a.LoggingType, "LogJobID": a.LogJobID}
        ])[2].*
    FROM `LogBucket` AS a
    WHERE a.LoggingType IN [3001, 4004, 6002]
    GROUP BY a.LogFileID, a.RowKey
) AS a
GROUP BY a.LoggingTypeID, a.LogJobID;
请通过 EXPLAIN 确认内部子查询已被索引覆盖,并且使用了索引聚合(index aggregation):https://blog.couchbase.com/understanding-index-grouping-aggregation-couchbase-n1ql-query/
可以考虑使用索引副本(index replication),以获得高可用性和更好的性能:https://docs.couchbase.com/server/current/learn/services-and-indexes/indexes/index-replication.html
如果 LoggingType、LogFileID、RowKey 的值不会改变(不可变),可以考虑用它们(而不是 META().id)作为分区键:https://blog.couchbase.com/couchbase-gsi-index-partitioning/
https://blog.couchbase.com/create-right-index-get-right-performance/