Redshift 中的分组查询需要大量时间
Grouping query in Redshift takes huge amount of time
我有以下要求:我有一个 table 格式如下。
我希望它变成这样:
基本上我想要具有各种活动组合的用户数量
我想要这种格式,因为我想用它创建 TreeMap 可视化。
这就是我到目前为止所做的。
首先找出每个 activity 组合的用户数:
-- Count the users for every distinct combination of activities per
-- processed_date.
-- LISTAGG without WITHIN GROUP (ORDER BY ...) concatenates in an unspecified
-- order, so one and the same set of activities could come out as different
-- strings ('cooking,cleaning' vs 'cleaning,cooking') and be counted as
-- separate groups. Sorting by name makes every combination canonical.
WITH lookup AS
(
  SELECT LISTAGG(name,',') WITHIN GROUP (ORDER BY name) AS groupings,
         processed_date,
         guid
  FROM warehouse.test
  GROUP BY processed_date,
           guid
)
SELECT groupings AS activity_groupings,
       -- number of list entries = number of commas + 1
       -- (assumes activity names never contain a comma — TODO confirm)
       LENGTH(groupings) - LENGTH(REPLACE(groupings,',','')) + 1 AS count,
       processed_date,
       COUNT(guid) AS users
FROM lookup
GROUP BY processed_date,
         groupings
我把结果放在一个单独的table
然后,我像这样进行拆分和合并:
-- Split each comma-separated activity list into up to three levels and add up
-- the user counts per level combination.
-- NOTE(review): the snippet as posted started at the inner SELECT and carried
-- an unmatched ")" — the outer "SELECT ... FROM (" wrapper is reconstructed
-- from the GROUP BY columns and the expr_qry alias. SUM(num_users) is the
-- presumed aggregate; confirm against the original query.
SELECT grouping_1,
       grouping_2,
       grouping_3,
       SUM(num_users) AS num_users
FROM (
  -- SPLIT_PART yields '' for a missing part; NULLIF turns that into NULL so
  -- COALESCE can fall back to the previous level.
  SELECT NULLIF(SPLIT_PART(groupings,',', 1),'') AS grouping_1,
         COALESCE(NULLIF(SPLIT_PART(groupings,',', 2),''), grouping_1) AS grouping_2,
         COALESCE(NULLIF(SPLIT_PART(groupings,',', 3),''), grouping_2, grouping_1) AS grouping_3,
         num_users
  FROM warehouse.groupings) AS expr_qry
GROUP BY grouping_1,
         grouping_2,
         grouping_3
问题是第一个查询需要超过 90 分钟才能执行,因为我有超过 250M 行。
一定有更好更有效的方法来解决这个问题。
如有任何提示,我将不胜感激。
谢谢
您不需要使用复杂的字符串操作函数(LISTAGG()、SPLIT_PART())。您可以使用 ROW_NUMBER() 函数和简单的聚合来实现您的目标。
-- Temporary fixture: one row per (id, guid, activity name)
CREATE TEMP TABLE test_data (id, guid, name) AS
SELECT CAST(1 AS INT), CAST(1 AS INT), 'cooking'
UNION ALL
SELECT CAST(2 AS INT), CAST(1 AS INT), 'cleaning'
UNION ALL
SELECT CAST(3 AS INT), CAST(2 AS INT), 'washing'
UNION ALL
SELECT CAST(4 AS INT), CAST(4 AS INT), 'cooking'
UNION ALL
SELECT CAST(6 AS INT), CAST(5 AS INT), 'cooking'
UNION ALL
SELECT CAST(7 AS INT), CAST(3 AS INT), 'cooking'
UNION ALL
SELECT CAST(8 AS INT), CAST(3 AS INT), 'cleaning'
;
-- Number each guid's activities in id order
WITH ranked_activity AS (
    SELECT
        t.guid,
        t.name,
        ROW_NUMBER() OVER (PARTITION BY t.guid ORDER BY t.id) AS activity_seq
    FROM test_data AS t
),
-- Pivot the first two activities of every guid onto a single row;
-- MAX() merely picks the one non-NULL name produced by each CASE
per_user AS (
    SELECT
        r.guid,
        MAX(CASE r.activity_seq WHEN 1 THEN r.name END) AS grouping_1,
        MAX(CASE r.activity_seq WHEN 2 THEN r.name END) AS grouping_2
    FROM ranked_activity AS r
    GROUP BY r.guid
)
-- Count guids per (grouping_1, grouping_2); a guid with a single activity
-- repeats that activity in grouping_2. Grouping on the explicit expression
-- keeps the grouping on the COALESCEd value rather than the raw column.
SELECT
    u.grouping_1,
    COALESCE(u.grouping_2, u.grouping_1) AS grouping_2,
    COUNT(u.guid) AS num_users
FROM per_user AS u
GROUP BY
    u.grouping_1,
    COALESCE(u.grouping_2, u.grouping_1)
;
-- Output
grouping_1 | grouping_2 | num_users
------------+------------+-----------
washing | washing | 1
cooking | cleaning | 2
cooking | cooking | 2
我有以下要求:我有一个 table 格式如下。
我希望它变成这样:
基本上我想要具有各种活动组合的用户数量
我想要这种格式,因为我想用它创建 TreeMap 可视化。
这就是我到目前为止所做的。 首先找出 activity 组
的用户数
-- Count the users for every distinct combination of activities per
-- processed_date.
-- LISTAGG without WITHIN GROUP (ORDER BY ...) concatenates in an unspecified
-- order, so one and the same set of activities could come out as different
-- strings ('cooking,cleaning' vs 'cleaning,cooking') and be counted as
-- separate groups. Sorting by name makes every combination canonical.
WITH lookup AS
(
  SELECT LISTAGG(name,',') WITHIN GROUP (ORDER BY name) AS groupings,
         processed_date,
         guid
  FROM warehouse.test
  GROUP BY processed_date,
           guid
)
SELECT groupings AS activity_groupings,
       -- number of list entries = number of commas + 1
       -- (assumes activity names never contain a comma — TODO confirm)
       LENGTH(groupings) - LENGTH(REPLACE(groupings,',','')) + 1 AS count,
       processed_date,
       COUNT(guid) AS users
FROM lookup
GROUP BY processed_date,
         groupings
我把结果放在一个单独的table
然后,我像这样进行拆分和合并:
-- Split each comma-separated activity list into up to three levels and add up
-- the user counts per level combination.
-- NOTE(review): the snippet as posted started at the inner SELECT and carried
-- an unmatched ")" — the outer "SELECT ... FROM (" wrapper is reconstructed
-- from the GROUP BY columns and the expr_qry alias. SUM(num_users) is the
-- presumed aggregate; confirm against the original query.
SELECT grouping_1,
       grouping_2,
       grouping_3,
       SUM(num_users) AS num_users
FROM (
  -- SPLIT_PART yields '' for a missing part; NULLIF turns that into NULL so
  -- COALESCE can fall back to the previous level.
  SELECT NULLIF(SPLIT_PART(groupings,',', 1),'') AS grouping_1,
         COALESCE(NULLIF(SPLIT_PART(groupings,',', 2),''), grouping_1) AS grouping_2,
         COALESCE(NULLIF(SPLIT_PART(groupings,',', 3),''), grouping_2, grouping_1) AS grouping_3,
         num_users
  FROM warehouse.groupings) AS expr_qry
GROUP BY grouping_1,
         grouping_2,
         grouping_3
问题是第一个查询需要超过 90 分钟才能执行,因为我有超过 250M 行。
一定有更好更有效的方法来解决这个问题。如有任何提示,我将不胜感激。
谢谢
您不需要使用复杂的字符串操作函数(LISTAGG()、SPLIT_PART())。您可以使用 ROW_NUMBER() 函数和简单的聚合来实现您的目标。
-- Temporary fixture: one row per (id, guid, activity name)
CREATE TEMP TABLE test_data (id, guid, name) AS
SELECT CAST(1 AS INT), CAST(1 AS INT), 'cooking'
UNION ALL
SELECT CAST(2 AS INT), CAST(1 AS INT), 'cleaning'
UNION ALL
SELECT CAST(3 AS INT), CAST(2 AS INT), 'washing'
UNION ALL
SELECT CAST(4 AS INT), CAST(4 AS INT), 'cooking'
UNION ALL
SELECT CAST(6 AS INT), CAST(5 AS INT), 'cooking'
UNION ALL
SELECT CAST(7 AS INT), CAST(3 AS INT), 'cooking'
UNION ALL
SELECT CAST(8 AS INT), CAST(3 AS INT), 'cleaning'
;
-- Number each guid's activities in id order
WITH ranked_activity AS (
    SELECT
        t.guid,
        t.name,
        ROW_NUMBER() OVER (PARTITION BY t.guid ORDER BY t.id) AS activity_seq
    FROM test_data AS t
),
-- Pivot the first two activities of every guid onto a single row;
-- MAX() merely picks the one non-NULL name produced by each CASE
per_user AS (
    SELECT
        r.guid,
        MAX(CASE r.activity_seq WHEN 1 THEN r.name END) AS grouping_1,
        MAX(CASE r.activity_seq WHEN 2 THEN r.name END) AS grouping_2
    FROM ranked_activity AS r
    GROUP BY r.guid
)
-- Count guids per (grouping_1, grouping_2); a guid with a single activity
-- repeats that activity in grouping_2. Grouping on the explicit expression
-- keeps the grouping on the COALESCEd value rather than the raw column.
SELECT
    u.grouping_1,
    COALESCE(u.grouping_2, u.grouping_1) AS grouping_2,
    COUNT(u.guid) AS num_users
FROM per_user AS u
GROUP BY
    u.grouping_1,
    COALESCE(u.grouping_2, u.grouping_1)
;
-- Output
grouping_1 | grouping_2 | num_users
------------+------------+-----------
washing | washing | 1
cooking | cleaning | 2
cooking | cooking | 2