尽管有索引,N1QL 连接查询仍然很慢
N1QL slow join despite indexes
我正在为一个大学项目试用 Couchbase(CB)。我加载了一个名为 tweets、包含 5000000 个文档的存储桶,以及一个名为 users、包含 2000000 个文档的存储桶。
每个推文文档都具有相同的结构,包含若干属性,例如“is_retweet”(取值为 "true" 或 "false")、“text”(推文的文本)、“timestamp”和“user_id”(发布该推文的用户的 ID)。
每个用户都有一个id并且可能有一个关注者列表(其他用户的id)。
我想创建一个查询来检索 20 条最热门的推文,也就是字段“is_retweet”为 "true"、且发布者拥有大量关注者的推文。
我已经在 followers 数组的长度(ARRAY_LENGTH)、“is_retweet”属性和“user_id”字段上创建了索引:
-- Partial index on the length of the `followers` array in `users`;
-- only documents whose array has more than 100 entries are indexed.
CREATE INDEX `idx_followers_length` ON `users`(array_length(`followers`)) WHERE (100 < array_length(`followers`))
-- Partial index on `tweets`, restricted to documents whose `is_retweet` equals the string "true".
CREATE INDEX `idx_retweet` ON `tweets`(`is_retweet`) WHERE (`is_retweet` = "true")
-- Index on the `user_id` field of `tweets`.
CREATE INDEX `idx_users_on_tweets` ON `tweets`(`user_id`)
多亏了这些索引,部分查询的执行速度相当快。 “部分查询”是:
根据“followers”数组的长度,获取关注者最多的前 20 个用户:
-- Returns the 20 users with the longest `followers` array, considering only
-- users with more than 100 followers.
SELECT id, ARRAY_LENGTH(followers) AS followers_num
FROM users
-- Same predicate as the WHERE clause of `idx_followers_length`.
WHERE ARRAY_LENGTH(followers) > 100
ORDER BY ARRAY_LENGTH(followers) DESC
LIMIT 20
查询已转发推文的文本和时间戳(月/日):
-- Returns the month, the day and the text of every tweet flagged as a retweet.
-- `timestamp` is converted with TONUMBER and then passed to MILLIS_TO_STR
-- (presumably it holds epoch milliseconds stored as a string — TODO confirm).
SELECT DATE_PART_STR(MILLIS_TO_STR(TONUMBER(timestamp)), 'month') AS month,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(timestamp)), 'day') AS day,
text
FROM tweets
-- `is_retweet` is compared with the string "true", not a boolean.
WHERE is_retweet = "true"
问题是,当我尝试把它们连接(JOIN)起来时,查询会一直运行(超过 30 分钟)。下面是该查询(可能写错了)及其 EXPLAIN 执行计划(查询使用了上面提到的所有索引):
-- Joins retweets (left side) to the users that have more than 100 followers
-- (right side, a subquery) and keeps the 20 joined rows with the highest
-- follower count.
-- NOTE(review): `u` is the result of a subquery that projects only `id` and
-- `followers_num`; META(u).id may not yield a document key for such rows, in
-- which case the ON condition never matches — confirm; `u.id` may be intended.
SELECT u.id,
u.followers_num,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'month') AS month,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'day') AS day,
t.text
FROM tweets AS t
JOIN (SELECT id, ARRAY_LENGTH(followers) AS followers_num
FROM users
WHERE ARRAY_LENGTH(followers) > 100)
AS u ON t.user_id = META(u).id
WHERE t.is_retweet = "true"
-- The sort key comes from the right side of the join, so the ordering is
-- applied to the joined rows before LIMIT cuts them down to 20.
ORDER BY u.followers_num DESC
LIMIT 20
{
"#operator": "Sequence",
"~children": [
{
"#operator": "Sequence",
"~children": [
{
"#operator": "IntersectScan",
"scans": [
{
"#operator": "IndexScan3",
"as": "t",
"index": "idx_retweet",
"index_id": "437d590a2e220ed4",
"index_projection": {
"primary_key": true
},
"keyspace": "tweets",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"high": "\"true\"",
"inclusion": 3,
"low": "\"true\""
}
]
}
],
"using": "gsi"
},
{
"#operator": "IndexScan3",
"as": "t",
"index": "idx_users_on_tweets",
"index_id": "c93f6f0be887553",
"index_projection": {
"primary_key": true
},
"keyspace": "tweets",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"inclusion": 0,
"low": "null"
}
]
}
],
"using": "gsi"
}
]
},
{
"#operator": "Fetch",
"as": "t",
"keyspace": "tweets",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "NestedLoopJoin",
"alias": "u",
"on_clause": "((`t`.`user_id`) = (meta(`u`).`id`))",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Sequence",
"~children": [
{
"#operator": "IndexScan3",
"index": "idx_followers_length_2",
"index_id": "b5cc45b51847b40f",
"index_projection": {
"primary_key": true
},
"keyspace": "users",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"inclusion": 0,
"low": "100"
}
]
}
],
"using": "gsi"
},
{
"#operator": "Fetch",
"keyspace": "users",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Filter",
"condition": "(100 < array_length((`users`.`followers`)))"
},
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "(`users`.`id`)"
},
{
"as": "followers_num",
"expr": "array_length((`users`.`followers`))"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
{
"#operator": "Alias",
"as": "u"
}
]
}
},
{
"#operator": "Filter",
"condition": "((`t`.`is_retweet`) = \"true\")"
},
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "(`u`.`id`)"
},
{
"expr": "(`u`.`followers_num`)"
},
{
"as": "month",
"expr": "date_part_str(millis_to_str(to_number((`t`.`timestamp`))), \"month\")"
},
{
"as": "day",
"expr": "date_part_str(millis_to_str(to_number((`t`.`timestamp`))), \"day\")"
},
{
"expr": "(`t`.`text`)"
}
]
}
]
}
}
]
},
{
"#operator": "Order",
"limit": "20",
"sort_terms": [
{
"desc": true,
"expr": "(`u`.`followers_num`)"
}
]
},
{
"#operator": "Limit",
"expr": "20"
},
{
"#operator": "FinalProject"
}
]
}
我在 Couchbase Server 7 的本地实例中尝试了索引顾问,其中包含您已经创建的索引。它推荐了另一个索引:
-- Composite index on `tweets`: `is_retweet` is the leading key, followed by `user_id`.
CREATE INDEX adv_is_retweet_user_id ON `tweets`(`is_retweet`,`user_id`)
所以,您可能想尝试一下。 (我没有你的数据集,所以我不知道它是否有帮助,或者有多大帮助)。
但是,如果您计划在同一个数据集上创建更复杂的查询,尤其是临时查询,您可能需要考虑使用 Analytics service。它也支持 N1QL 和索引,但它通常会为您提供相当不错的性能,而无需预先创建索引。
N1QL 的联接按从左到右的顺序执行。而且您的查询同时带有 ORDER BY 和 LIMIT,因此即使只需要 20 条结果,它也必须先生成所有可能的连接结果再排序。
-- Partial index on the follower count of `users`, keyed in descending order
-- and limited to users with more than 100 followers.
CREATE INDEX `ix1` ON `users`(array_length(`followers`) DESC) WHERE (100 < array_length(`followers`));
-- Partial index over retweets only, keyed by `user_id` and also carrying
-- `timestamp` and `text`, the other tweet fields the query below reads.
CREATE INDEX `ix2` ON `tweets`(user_id, timestamp, text) WHERE (`is_retweet` = "true");
-- Drives the join from the users subquery (ordered by follower count,
-- descending) into `tweets`, and returns the first 20 joined rows.
SELECT u.id,
u.followers_num,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'month') AS month,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'day') AS day,
t.text
-- The subquery projects the document key (META().id) as `id`, so the ON
-- clause below can compare it with `t.user_id` directly.
FROM (SELECT META().id, ARRAY_LENGTH(followers) AS followers_num
FROM users
WHERE ARRAY_LENGTH(followers) > 100
ORDER BY ARRAY_LENGTH(followers) DESC) AS u
JOIN tweets AS t ON u.id = t.user_id
WHERE t.is_retweet = "true"
-- No outer ORDER BY: the row order comes from the ordered subquery.
LIMIT 20;
子查询使用覆盖索引并利用索引本身的顺序(避免了排序),按降序依次产生关注者最多的用户;然后与 tweets 连接,得到 20 条推文。
注意:这会产生 20 条转推。JOIN 可能产生一对多的结果,因此这些结果不一定对应关注者最多的前 20 位用户。如果需要那样的结果,您可以根据需求使用 GROUP BY 或 NEST(示例 17 https://blog.couchbase.com/ansi-join-support-n1ql/)。在 CB 7.0 中,您还可以使用通用的相关子查询。
我正在为一个大学项目试用 Couchbase(CB)。我加载了一个名为 tweets、包含 5000000 个文档的存储桶,以及一个名为 users、包含 2000000 个文档的存储桶。
每个推文文档都具有相同的结构,包含若干属性,例如“is_retweet”(取值为 "true" 或 "false")、“text”(推文的文本)、“timestamp”和“user_id”(发布该推文的用户的 ID)。
每个用户都有一个id并且可能有一个关注者列表(其他用户的id)。
我想创建一个查询来检索 20 条最热门的推文,也就是字段“is_retweet”为 "true"、且发布者拥有大量关注者的推文。
我已经在 followers 数组的长度(ARRAY_LENGTH)、“is_retweet”属性和“user_id”字段上创建了索引:
-- Partial index on the length of the `followers` array in `users`;
-- only documents whose array has more than 100 entries are indexed.
CREATE INDEX `idx_followers_length` ON `users`(array_length(`followers`)) WHERE (100 < array_length(`followers`))
-- Partial index on `tweets`, restricted to documents whose `is_retweet` equals the string "true".
CREATE INDEX `idx_retweet` ON `tweets`(`is_retweet`) WHERE (`is_retweet` = "true")
-- Index on the `user_id` field of `tweets`.
CREATE INDEX `idx_users_on_tweets` ON `tweets`(`user_id`)
多亏了这些索引,部分查询的执行速度相当快。 “部分查询”是:
根据“followers”数组的长度,获取关注者最多的前 20 个用户:
-- Returns the 20 users with the longest `followers` array, considering only
-- users with more than 100 followers (same predicate as `idx_followers_length`).
SELECT id, ARRAY_LENGTH(followers) AS followers_num FROM users WHERE ARRAY_LENGTH(followers) > 100 ORDER BY ARRAY_LENGTH(followers) DESC LIMIT 20
查询已转发推文的文本和时间戳(月/日):
-- Returns the month, the day and the text of every tweet whose `is_retweet`
-- equals the string "true". `timestamp` is converted with TONUMBER and then
-- passed to MILLIS_TO_STR (presumably epoch milliseconds stored as a string — TODO confirm).
SELECT DATE_PART_STR(MILLIS_TO_STR(TONUMBER(timestamp)), 'month') AS month, DATE_PART_STR(MILLIS_TO_STR(TONUMBER(timestamp)), 'day') AS day, text FROM tweets WHERE is_retweet = "true"
问题是,当我尝试把它们连接(JOIN)起来时,查询会一直运行(超过 30 分钟)。下面是该查询(可能写错了)及其 EXPLAIN 执行计划(查询使用了上面提到的所有索引):
-- Joins retweets (left side) to the users that have more than 100 followers
-- (right side, a subquery) and keeps the 20 joined rows with the highest
-- follower count.
-- NOTE(review): `u` is the result of a subquery that projects only `id` and
-- `followers_num`; META(u).id may not yield a document key for such rows, in
-- which case the ON condition never matches — confirm; `u.id` may be intended.
SELECT u.id,
u.followers_num,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'month') AS month,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'day') AS day,
t.text
FROM tweets AS t
JOIN (SELECT id, ARRAY_LENGTH(followers) AS followers_num
FROM users
WHERE ARRAY_LENGTH(followers) > 100)
AS u ON t.user_id = META(u).id
WHERE t.is_retweet = "true"
-- The sort key comes from the right side of the join, so the ordering is
-- applied to the joined rows before LIMIT cuts them down to 20.
ORDER BY u.followers_num DESC
LIMIT 20
{
"#operator": "Sequence",
"~children": [
{
"#operator": "Sequence",
"~children": [
{
"#operator": "IntersectScan",
"scans": [
{
"#operator": "IndexScan3",
"as": "t",
"index": "idx_retweet",
"index_id": "437d590a2e220ed4",
"index_projection": {
"primary_key": true
},
"keyspace": "tweets",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"high": "\"true\"",
"inclusion": 3,
"low": "\"true\""
}
]
}
],
"using": "gsi"
},
{
"#operator": "IndexScan3",
"as": "t",
"index": "idx_users_on_tweets",
"index_id": "c93f6f0be887553",
"index_projection": {
"primary_key": true
},
"keyspace": "tweets",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"inclusion": 0,
"low": "null"
}
]
}
],
"using": "gsi"
}
]
},
{
"#operator": "Fetch",
"as": "t",
"keyspace": "tweets",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "NestedLoopJoin",
"alias": "u",
"on_clause": "((`t`.`user_id`) = (meta(`u`).`id`))",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Sequence",
"~children": [
{
"#operator": "IndexScan3",
"index": "idx_followers_length_2",
"index_id": "b5cc45b51847b40f",
"index_projection": {
"primary_key": true
},
"keyspace": "users",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"inclusion": 0,
"low": "100"
}
]
}
],
"using": "gsi"
},
{
"#operator": "Fetch",
"keyspace": "users",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Filter",
"condition": "(100 < array_length((`users`.`followers`)))"
},
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "(`users`.`id`)"
},
{
"as": "followers_num",
"expr": "array_length((`users`.`followers`))"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
{
"#operator": "Alias",
"as": "u"
}
]
}
},
{
"#operator": "Filter",
"condition": "((`t`.`is_retweet`) = \"true\")"
},
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "(`u`.`id`)"
},
{
"expr": "(`u`.`followers_num`)"
},
{
"as": "month",
"expr": "date_part_str(millis_to_str(to_number((`t`.`timestamp`))), \"month\")"
},
{
"as": "day",
"expr": "date_part_str(millis_to_str(to_number((`t`.`timestamp`))), \"day\")"
},
{
"expr": "(`t`.`text`)"
}
]
}
]
}
}
]
},
{
"#operator": "Order",
"limit": "20",
"sort_terms": [
{
"desc": true,
"expr": "(`u`.`followers_num`)"
}
]
},
{
"#operator": "Limit",
"expr": "20"
},
{
"#operator": "FinalProject"
}
]
}
我在 Couchbase Server 7 的本地实例中尝试了索引顾问,其中包含您已经创建的索引。它推荐了另一个索引:
-- Composite index on `tweets`: `is_retweet` is the leading key, followed by `user_id`.
CREATE INDEX adv_is_retweet_user_id ON `tweets`(`is_retweet`,`user_id`)
所以,您可能想尝试一下。 (我没有你的数据集,所以我不知道它是否有帮助,或者有多大帮助)。
但是,如果您计划在同一个数据集上创建更复杂的查询,尤其是临时查询,您可能需要考虑使用 Analytics service。它也支持 N1QL 和索引,但它通常会为您提供相当不错的性能,而无需预先创建索引。
N1QL 的联接按从左到右的顺序执行。而且您的查询同时带有 ORDER BY 和 LIMIT,因此即使只需要 20 条结果,它也必须先生成所有可能的连接结果再排序。
-- Partial index on the follower count of `users`, keyed in descending order
-- and limited to users with more than 100 followers.
CREATE INDEX `ix1` ON `users`(array_length(`followers`) DESC) WHERE (100 < array_length(`followers`));
-- Partial index over retweets only, keyed by `user_id` and also carrying
-- `timestamp` and `text`, the other tweet fields the query below reads.
CREATE INDEX `ix2` ON `tweets`(user_id, timestamp, text) WHERE (`is_retweet` = "true");
-- Drives the join from the users subquery (ordered by follower count,
-- descending) into `tweets`, and returns the first 20 joined rows.
SELECT u.id,
u.followers_num,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'month') AS month,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'day') AS day,
t.text
-- The subquery projects the document key (META().id) as `id`, so the ON
-- clause below can compare it with `t.user_id` directly.
FROM (SELECT META().id, ARRAY_LENGTH(followers) AS followers_num
FROM users
WHERE ARRAY_LENGTH(followers) > 100
ORDER BY ARRAY_LENGTH(followers) DESC) AS u
JOIN tweets AS t ON u.id = t.user_id
WHERE t.is_retweet = "true"
-- No outer ORDER BY: the row order comes from the ordered subquery.
LIMIT 20;
子查询使用覆盖索引并利用索引本身的顺序(避免了排序),按降序依次产生关注者最多的用户;然后与 tweets 连接,得到 20 条推文。
注意:这会产生 20 条转推。JOIN 可能产生一对多的结果,因此这些结果不一定对应关注者最多的前 20 位用户。如果需要那样的结果,您可以根据需求使用 GROUP BY 或 NEST(示例 17 https://blog.couchbase.com/ansi-join-support-n1ql/)。在 CB 7.0 中,您还可以使用通用的相关子查询。