SQL(大数据)中每行 d 天内获取过去 x 事件平均值的有效方法
Efficient way to get the average of past x events within d days per each row in SQL (big data)
我想找到最好和最有效的方法来计算 7 天内过去 2 场比赛的平均得分,并且每一行我都需要它。
我已经有一个适用于 6000 万行的查询,但在 100%(约 5 亿行)的数据上它崩溃了(可能效率不高或可能缺乏资源)。
你能帮我吗?如果您认为我的解决方案不是最好的方法,请解释。
谢谢
我有这个table:
user_id event_id start end score
---------------------------------------------------
1 7 30/01/2021 30/01/2021 45
1 6 24/01/2021 29/01/2021 25
1 5 22/01/2021 23/01/2021 13
1 4 18/01/2021 21/01/2021 15
1 3 17/01/2021 17/01/2021 52
1 2 08/01/2021 10/01/2021 8
1 1 01/01/2021 02/01/2021 36
我想要每行(用户id+事件id):得到最近7天过去2个事件的平均得分。
示例:对于这一行:
user_id event_id start end score
---------------------------------------------------
1 6 24/01/2021 29/01/2021 25
user_id event_id start end score past_7_days_from_start event_num
--------------------------------------------------------------------------------------
1 6 24/01/2021 29/01/2021 25 null null
1 5 22/01/2021 23/01/2021 13 yes 1
1 4 18/01/2021 21/01/2021 15 yes 2
1 3 17/01/2021 17/01/2021 52 yes 3
1 2 08/01/2021 10/01/2021 8 no 4
1 1 01/01/2021 02/01/2021 36 no 5
所以我会 select 只有这行作为分组依据,然后是 avg(score):
user_id event_id start end score past_7_days_from_start event_num
--------------------------------------------------------------------------------------
1 5 22/01/2021 23/01/2021 13 yes 1
1 4 18/01/2021 21/01/2021 15 yes 2
结果:
user_id event_id start end score avg_score_of_past_2_events_within_7_days
--------------------------------------------------------------------------------------
1 6 24/01/2021 29/01/2021 25 14
我的查询:
-- For every (user_id, event_id) row B, average the scores of the two most
-- recently finished OTHER events A of the same user whose end date lies in
-- the 7 days up to (and including) B's start date.
-- NOTE(review): "df" is kept exactly as written in the question; quote the
-- table name the way your engine requires (e.g. backticks) -- confirm.
SELECT
    user_id,
    event_id,
    AVG(score) AS avg_score_of_past_2_events_within_7_days
FROM (
    SELECT
        B.user_id,
        B.event_id,
        A.score,
        -- event_num = 1 is the candidate that ended most recently;
        -- event_id is only a tie-breaker so the numbering is deterministic.
        ROW_NUMBER() OVER (
            PARTITION BY B.user_id, B.event_id
            ORDER BY A.end DESC, A.event_id DESC
        ) AS event_num
    FROM
        "df" A
    INNER JOIN
        (SELECT user_id, event_id, start FROM "df") B
        -- Fixed: the original joined on FTP.user_id, an alias that is never defined.
        ON B.user_id = A.user_id
        -- Fixed: an event whose end equals its own start must not be counted
        -- as one of its own past events.
        AND A.event_id <> B.event_id
        AND A.end BETWEEN DATE_SUB(B.start, INTERVAL 7 DAY) AND B.start
)
-- Fixed: keep the two most recent candidates. The original `>= 2` discarded
-- the most recent one and kept every older one instead.
WHERE event_num <= 2
GROUP BY user_id, event_id
有什么更好的建议吗?
我不相信你的情况,还有更高效的查询。
我可以建议您执行以下操作:
确保你的基础 table 是 partition 和 cluster user_id
将查询拆分为创建分区和集群的 3 个部分 tables:
- 第一个table:只有内连接O(n^2)
- 秒table:添加ROW_NUMBER O(n)
- 第三个table:分组依据
- 如果这仍然是个问题,我建议按日期进行 批量预处理 和 运行 查询。
我尝试使用 LEAD 函数创建一个用例,但我无法测试是否适用于该大型数据集。
我使用 LEAD 创建前两行 prev
和 ante
。
然后我有 7 天的 IF window,如果匹配,我创建 scorePP
和 scoreAA
,否则它们为空。
-- Sample data: one row per (user_id, event_id) with start/stop dates and a score.
with t as (
select 1 as user_id,7 as event_id,parse_date('%d/%m/%Y','30/01/2021') as start,parse_date('%d/%m/%Y','30/01/2021') as stop, 45 as score union all
select 1 as user_id,6 as event_id,parse_date('%d/%m/%Y','24/01/2021') as start,parse_date('%d/%m/%Y','29/01/2021') as stop, 25 as score union all
select 1 as user_id,5 as event_id,parse_date('%d/%m/%Y','22/01/2021') as start,parse_date('%d/%m/%Y','23/01/2021') as stop, 13 as score union all
select 1 as user_id,4 as event_id,parse_date('%d/%m/%Y','18/01/2021') as start,parse_date('%d/%m/%Y','21/01/2021') as stop, 15 as score union all
select 1 as user_id,3 as event_id,parse_date('%d/%m/%Y','17/01/2021') as start,parse_date('%d/%m/%Y','17/01/2021') as stop, 52 as score union all
select 1 as user_id,2 as event_id,parse_date('%d/%m/%Y','08/01/2021') as start,parse_date('%d/%m/%Y','10/01/2021') as stop, 8 as score union all
select 1 as user_id,1 as event_id,parse_date('%d/%m/%Y','01/01/2021') as start,parse_date('%d/%m/%Y','02/01/2021') as stop, 36 as score union all
select 2 as user_id,3 as event_id,parse_date('%d/%m/%Y','12/01/2021') as start,parse_date('%d/%m/%Y','17/01/2021') as stop, 52 as score union all
select 2 as user_id,2 as event_id,parse_date('%d/%m/%Y','08/01/2021') as start,parse_date('%d/%m/%Y','10/01/2021') as stop, 8 as score union all
select 2 as user_id,1 as event_id,parse_date('%d/%m/%Y','01/01/2021') as start,parse_date('%d/%m/%Y','02/01/2021') as stop, 36 as score
)
-- For each event, look at the two rows that follow it in the per-user
-- "event_id desc" ordering (prev = 1 row further, ante = 2 rows further) and
-- average the scores of those that stopped within the 7 days up to this
-- event's start. Rows where neither neighbour qualifies are dropped.
select
  *,
  -- AVG ignores NULLs, so only the qualifying neighbour scores are averaged.
  (select avg(x) from unnest([scorePP, scoreAA]) as x) as avg_score_7_day
from (
  select
    t.*,
    lead(start, 1) over w as prev_start,
    lead(stop, 1) over w as prev_stop,
    lead(score, 1) over w as prev_score,
    -- Fixed: the original tested `prev_start between start - 7 days and prev_stop`.
    -- Its upper bound is the neighbour's own stop (always satisfied), and it
    -- looked at the neighbour's start, whereas the question defines the window
    -- on the neighbour's end date relative to the current start.
    if(
      lead(stop, 1) over w between date_sub(start, interval 7 day) and start,
      lead(score, 1) over w,
      null
    ) as scorePP,
    lead(start, 2) over w as ante_start,
    lead(stop, 2) over w as ante_stop,
    lead(score, 2) over w as ante_score,
    -- Same 7-day test for the second neighbour.
    if(
      lead(stop, 2) over w between date_sub(start, interval 7 day) and start,
      lead(score, 2) over w,
      null
    ) as scoreAA
  from
    t
  -- One named window instead of repeating the same specification ten times.
  -- NOTE(review): ordering by event_id assumes ids increase chronologically
  -- within a user -- confirm against the real data.
  window w as (partition by user_id order by event_id desc, t.stop desc)
)
where coalesce(scorePP, scoreAA) is not null
order by user_id, event_id desc
考虑以下方法
-- For each row, collect candidate past events of the SAME user with two
-- range windows, then average the scores of the (at most) two candidates
-- with the highest event_id.
--   candidates1: events whose start date is 1..7 days before this row's start
--   candidates2: events whose end date is 1..7 days before this row's end
select * except(candidates1, candidates2),
  ( select avg(score)
    from (
      -- union distinct removes events that appear in both candidate lists
      select * from unnest(candidates1) union distinct
      select * from unnest(candidates2)
      order by event_id desc
      limit 2
    )
  ) as avg_score_of_past_2_events_within_7_days
from (
  select *,
    -- Fixed: added `partition by user_id`. Without it the windows span the
    -- whole table and mix events that belong to different users.
    array_agg(struct(event_id, score)) over(
      partition by user_id
      order by unix_date(t.start)
      range between 7 preceding and 1 preceding
    ) as candidates1,
    array_agg(struct(event_id, score)) over(
      partition by user_id
      order by unix_date(t.end)
      range between 7 preceding and 1 preceding
    ) as candidates2
  from your_table t
)
如果应用于您问题中的示例数据 - 输出为
我想找到最好和最有效的方法来计算 7 天内过去 2 场比赛的平均得分,并且每一行我都需要它。 我已经有一个适用于 6000 万行的查询,但在 100%(约 5 亿行)的数据上它崩溃了(可能效率不高或可能缺乏资源)。 你能帮我吗?如果您认为我的解决方案不是最好的方法,请解释。 谢谢
我有这个table:
user_id event_id start end score
---------------------------------------------------
1 7 30/01/2021 30/01/2021 45
1 6 24/01/2021 29/01/2021 25
1 5 22/01/2021 23/01/2021 13
1 4 18/01/2021 21/01/2021 15
1 3 17/01/2021 17/01/2021 52
1 2 08/01/2021 10/01/2021 8
1 1 01/01/2021 02/01/2021 36
我想要每行(用户id+事件id):得到最近7天过去2个事件的平均得分。
示例:对于这一行:
user_id event_id start end score
---------------------------------------------------
1 6 24/01/2021 29/01/2021 25
user_id event_id start end score past_7_days_from_start event_num
--------------------------------------------------------------------------------------
1 6 24/01/2021 29/01/2021 25 null null
1 5 22/01/2021 23/01/2021 13 yes 1
1 4 18/01/2021 21/01/2021 15 yes 2
1 3 17/01/2021 17/01/2021 52 yes 3
1 2 08/01/2021 10/01/2021 8 no 4
1 1 01/01/2021 02/01/2021 36 no 5
所以我会 select 只有这行作为分组依据,然后是 avg(score):
user_id event_id start end score past_7_days_from_start event_num
--------------------------------------------------------------------------------------
1 5 22/01/2021 23/01/2021 13 yes 1
1 4 18/01/2021 21/01/2021 15 yes 2
结果:
user_id event_id start end score avg_score_of_past_2_events_within_7_days
--------------------------------------------------------------------------------------
1 6 24/01/2021 29/01/2021 25 14
我的查询:
-- For every (user_id, event_id) row B, average the scores of the two most
-- recently finished OTHER events A of the same user whose end date lies in
-- the 7 days up to (and including) B's start date.
-- NOTE(review): "df" is kept exactly as written in the question; quote the
-- table name the way your engine requires (e.g. backticks) -- confirm.
SELECT
    user_id,
    event_id,
    AVG(score) AS avg_score_of_past_2_events_within_7_days
FROM (
    SELECT
        B.user_id,
        B.event_id,
        A.score,
        -- event_num = 1 is the candidate that ended most recently;
        -- event_id is only a tie-breaker so the numbering is deterministic.
        ROW_NUMBER() OVER (
            PARTITION BY B.user_id, B.event_id
            ORDER BY A.end DESC, A.event_id DESC
        ) AS event_num
    FROM
        "df" A
    INNER JOIN
        (SELECT user_id, event_id, start FROM "df") B
        -- Fixed: the original joined on FTP.user_id, an alias that is never defined.
        ON B.user_id = A.user_id
        -- Fixed: an event whose end equals its own start must not be counted
        -- as one of its own past events.
        AND A.event_id <> B.event_id
        AND A.end BETWEEN DATE_SUB(B.start, INTERVAL 7 DAY) AND B.start
)
-- Fixed: keep the two most recent candidates. The original `>= 2` discarded
-- the most recent one and kept every older one instead.
WHERE event_num <= 2
GROUP BY user_id, event_id
有什么更好的建议吗?
我不相信你的情况,还有更高效的查询。
我可以建议您执行以下操作:
确保你的基础 table 是 partition 和 cluster user_id
将查询拆分为创建分区和集群的 3 个部分 tables:
- 第一个table:只有内连接O(n^2)
- 秒table:添加ROW_NUMBER O(n)
- 第三个table:分组依据
- 如果这仍然是个问题,我建议按日期进行 批量预处理 和 运行 查询。
我尝试使用 LEAD 函数创建一个用例,但我无法测试是否适用于该大型数据集。
我使用 LEAD 创建前两行 prev
和 ante
。
然后我有 7 天的 IF window,如果匹配,我创建 scorePP
和 scoreAA
,否则它们为空。
-- Sample data: one row per (user_id, event_id) with start/stop dates and a score.
with t as (
select 1 as user_id,7 as event_id,parse_date('%d/%m/%Y','30/01/2021') as start,parse_date('%d/%m/%Y','30/01/2021') as stop, 45 as score union all
select 1 as user_id,6 as event_id,parse_date('%d/%m/%Y','24/01/2021') as start,parse_date('%d/%m/%Y','29/01/2021') as stop, 25 as score union all
select 1 as user_id,5 as event_id,parse_date('%d/%m/%Y','22/01/2021') as start,parse_date('%d/%m/%Y','23/01/2021') as stop, 13 as score union all
select 1 as user_id,4 as event_id,parse_date('%d/%m/%Y','18/01/2021') as start,parse_date('%d/%m/%Y','21/01/2021') as stop, 15 as score union all
select 1 as user_id,3 as event_id,parse_date('%d/%m/%Y','17/01/2021') as start,parse_date('%d/%m/%Y','17/01/2021') as stop, 52 as score union all
select 1 as user_id,2 as event_id,parse_date('%d/%m/%Y','08/01/2021') as start,parse_date('%d/%m/%Y','10/01/2021') as stop, 8 as score union all
select 1 as user_id,1 as event_id,parse_date('%d/%m/%Y','01/01/2021') as start,parse_date('%d/%m/%Y','02/01/2021') as stop, 36 as score union all
select 2 as user_id,3 as event_id,parse_date('%d/%m/%Y','12/01/2021') as start,parse_date('%d/%m/%Y','17/01/2021') as stop, 52 as score union all
select 2 as user_id,2 as event_id,parse_date('%d/%m/%Y','08/01/2021') as start,parse_date('%d/%m/%Y','10/01/2021') as stop, 8 as score union all
select 2 as user_id,1 as event_id,parse_date('%d/%m/%Y','01/01/2021') as start,parse_date('%d/%m/%Y','02/01/2021') as stop, 36 as score
)
-- For each event, look at the two rows that follow it in the per-user
-- "event_id desc" ordering (prev = 1 row further, ante = 2 rows further) and
-- average the scores of those that stopped within the 7 days up to this
-- event's start. Rows where neither neighbour qualifies are dropped.
select
  *,
  -- AVG ignores NULLs, so only the qualifying neighbour scores are averaged.
  (select avg(x) from unnest([scorePP, scoreAA]) as x) as avg_score_7_day
from (
  select
    t.*,
    lead(start, 1) over w as prev_start,
    lead(stop, 1) over w as prev_stop,
    lead(score, 1) over w as prev_score,
    -- Fixed: the original tested `prev_start between start - 7 days and prev_stop`.
    -- Its upper bound is the neighbour's own stop (always satisfied), and it
    -- looked at the neighbour's start, whereas the question defines the window
    -- on the neighbour's end date relative to the current start.
    if(
      lead(stop, 1) over w between date_sub(start, interval 7 day) and start,
      lead(score, 1) over w,
      null
    ) as scorePP,
    lead(start, 2) over w as ante_start,
    lead(stop, 2) over w as ante_stop,
    lead(score, 2) over w as ante_score,
    -- Same 7-day test for the second neighbour.
    if(
      lead(stop, 2) over w between date_sub(start, interval 7 day) and start,
      lead(score, 2) over w,
      null
    ) as scoreAA
  from
    t
  -- One named window instead of repeating the same specification ten times.
  -- NOTE(review): ordering by event_id assumes ids increase chronologically
  -- within a user -- confirm against the real data.
  window w as (partition by user_id order by event_id desc, t.stop desc)
)
where coalesce(scorePP, scoreAA) is not null
order by user_id, event_id desc
考虑以下方法
-- For each row, collect candidate past events of the SAME user with two
-- range windows, then average the scores of the (at most) two candidates
-- with the highest event_id.
--   candidates1: events whose start date is 1..7 days before this row's start
--   candidates2: events whose end date is 1..7 days before this row's end
select * except(candidates1, candidates2),
  ( select avg(score)
    from (
      -- union distinct removes events that appear in both candidate lists
      select * from unnest(candidates1) union distinct
      select * from unnest(candidates2)
      order by event_id desc
      limit 2
    )
  ) as avg_score_of_past_2_events_within_7_days
from (
  select *,
    -- Fixed: added `partition by user_id`. Without it the windows span the
    -- whole table and mix events that belong to different users.
    array_agg(struct(event_id, score)) over(
      partition by user_id
      order by unix_date(t.start)
      range between 7 preceding and 1 preceding
    ) as candidates1,
    array_agg(struct(event_id, score)) over(
      partition by user_id
      order by unix_date(t.end)
      range between 7 preceding and 1 preceding
    ) as candidates2
  from your_table t
)
如果应用于您问题中的示例数据 - 输出为