折叠 window 间隔
Collapsing window intervals
我有如下数据表:
-- Sample data: one interval per row; in these rows every "end" equals the
-- start of the row that follows it.
select
    state,
    start,
    "end"
from (
    values
        ('A', 1, 2), ('A', 2, 3), ('A', 3, 4),
        ('B', 4, 5), ('B', 5, 6),
        ('A', 6, 7),
        ('C', 7, 8), ('C', 8, 9)
) as example_table("state", "start", "end")
-- The same rows written symbolically
-- (the "stop" column below is the one named "end" in the query):
-- state start stop
-- A t1 t2
-- A t2 t3
-- A t3 t4
-- B t4 t5
-- B t5 t6
-- A t6 t7
-- C t7 t8
-- C t8 t9
我想按 state 合并(折叠)这些区间,同时保留中间发生的状态切换:
state start stop
A t1 t4
B t4 t6
A t6 t7
C t7 t9
仅使用 group by state 是行不通的,因为同一个 state 在输出中会对应多行。
这似乎是 window functions 的用例,但我不确定按什么对行进行分区。
我想创建一个 group_id
作为中间步骤:
group state start stop
1 A t1 t2
1 A t2 t3
1 A t3 t4
2 B t4 t5
2 B t5 t6
3 A t6 t7
4 C t7 t8
4 C t8 t9
然后我就可以按 group 分组,并 select min(start) 和 max(stop),但我不知道如何(高效地)创建这个变量。在 R 中,我会使用 rle 函数来完成这一步,但我不知道 Presto 中有什么等价的函数。
此答案已更新以反映以下成功答案
-- Collapse each run of consecutive rows sharing a state into one interval.
-- Relies on the intervals being back-to-back (each row's "end" is the next
-- row's start), which holds for the sample rows below.
with example_table("state", "start", "end") as (
    values
        ('A', 1, 2), ('A', 2, 3), ('A', 3, 4),
        ('B', 4, 5), ('B', 5, 6),
        ('A', 6, 7),
        ('C', 7, 8), ('C', 8, 9)
),
table_with_lags as (
    select
        state,
        start,
        "end",
        -- state of the previous row (by start); null on the very first row
        lag(state) over (order by start) as lag_state,
        -- "end" of the last row overall, carried on every row so it is
        -- still available after the filter below drops that last row
        last_value("end") over (
            order by start
            rows between 0 preceding and unbounded following
        ) as end_period
    from example_table
)
select
    state,
    start,
    -- The window is computed over the rows that survive the where clause,
    -- i.e. the first row of each run, so the next surviving start is where
    -- this run ends. The last run has no successor and falls back to
    -- end_period instead of null.
    lead(start, 1, end_period) over (order by start) as "end"
from table_with_lags
-- keep only rows that open a new run; the first row has a null lag_state
where lag_state is null
    or state <> lag_state
order by start
输出:
state start stop
A 1 4
B 4 6
A 6 7
C 7 9
rows between 0 preceding and unbounded following 这一部分写起来有点冗长,因此也可以把逻辑反过来,改为如下写法:
-- Mirror image of the lag-based approach: keep the LAST row of each run of
-- equal states and rebuild its start from the previous kept row's "end".
-- This is a fragment: it continues a `with example_table(...) as (...),`
-- CTE list that defines example_table("state", "start", "end").
table_with_leads as (
    select
        state,
        start,
        "end",
        -- state of the next row (by start); null on the very last row
        lead(state) over (order by start) as lead_state,
        -- start of the first row overall, used as the default below
        first_value(start) over (order by start) as start_period
    from example_table
)
select
    state,
    -- The window is computed over the rows that survive the where clause,
    -- i.e. the last row of each run, so the previous surviving "end" is
    -- where this run starts. The first run has no predecessor and falls
    -- back to start_period instead of null.
    lag("end", 1, start_period) over (order by start) as start,
    "end"
-- must read from table_with_leads: lead_state and start_period exist only there
from table_with_leads
-- keep only rows that close a run; the last row has a null lead_state
where state <> lead_state or lead_state is null
order by start
原回答
以下方法可行,但在大规模数据上表现不佳(即使只用 10% 的数据子样本,我也会收到 "exceeded local memory limit" 错误):
-- Label every run of consecutive equal states with a group id, then
-- aggregate each group down to a single (state, start, stop) row.
with switches as (
    select
        *,
        -- true where the state differs from the previous row's (by start);
        -- the first row has no predecessor, so the null comparison is
        -- coalesced to false
        coalesce(
            state <> lag(state) over (order by start),
            false
        ) as switched
    from (
        values
            ('A', 1, 2), ('A', 2, 3), ('A', 3, 4),
            ('B', 4, 5), ('B', 5, 6),
            ('A', 6, 7),
            ('C', 7, 8), ('C', 8, 9)
    ) as example_table("state", "start", "stop")
),
groups as (
    select
        *,
        -- running count of switches seen so far: constant within a run and
        -- one higher after each state change, so it works as a group id
        sum(cast(switched as bigint)) over (order by start) as group_id
    from switches
)
select
    -- state is the same on every row of a group; min() just picks it
    min(state) as state,
    min(start) as start,
    max(stop) as stop
from groups
group by group_id
order by start;
-- state start stop
-- A 1 4
-- B 4 6
-- A 6 7
-- C 7 9
目前我的做法是先把 groups 暂存为一张 table,然后再单独从 groups 中 select,这样做成功了,似乎解决了 RAM 问题(这让我有些惊讶)。这看起来不太理想,但既然它能完成任务,我现在就先这样用着。
我有如下数据表:
-- Sample data: one interval per row; in these rows every "end" equals the
-- start of the row that follows it.
select
    state,
    start,
    "end"
from (
    values
        ('A', 1, 2), ('A', 2, 3), ('A', 3, 4),
        ('B', 4, 5), ('B', 5, 6),
        ('A', 6, 7),
        ('C', 7, 8), ('C', 8, 9)
) as example_table("state", "start", "end")
-- The same rows written symbolically
-- (the "stop" column below is the one named "end" in the query):
-- state start stop
-- A t1 t2
-- A t2 t3
-- A t3 t4
-- B t4 t5
-- B t5 t6
-- A t6 t7
-- C t7 t8
-- C t8 t9
我想按 state 合并(折叠)这些区间,同时保留中间发生的状态切换:
state start stop
A t1 t4
B t4 t6
A t6 t7
C t7 t9
仅使用 group by state 是行不通的,因为同一个 state 在输出中会对应多行。
这似乎是 window functions 的用例,但我不确定按什么对行进行分区。
我想创建一个 group_id
作为中间步骤:
group state start stop
1 A t1 t2
1 A t2 t3
1 A t3 t4
2 B t4 t5
2 B t5 t6
3 A t6 t7
4 C t7 t8
4 C t8 t9
然后我就可以按 group 分组,并 select min(start) 和 max(stop),但我不知道如何(高效地)创建这个变量。在 R 中,我会使用 rle 函数来完成这一步,但我不知道 Presto 中有什么等价的函数。
此答案已更新以反映以下成功答案
-- Collapse each run of consecutive rows sharing a state into one interval.
-- Relies on the intervals being back-to-back (each row's "end" is the next
-- row's start), which holds for the sample rows below.
with example_table("state", "start", "end") as (
    values
        ('A', 1, 2), ('A', 2, 3), ('A', 3, 4),
        ('B', 4, 5), ('B', 5, 6),
        ('A', 6, 7),
        ('C', 7, 8), ('C', 8, 9)
),
table_with_lags as (
    select
        state,
        start,
        "end",
        -- state of the previous row (by start); null on the very first row
        lag(state) over (order by start) as lag_state,
        -- "end" of the last row overall, carried on every row so it is
        -- still available after the filter below drops that last row
        last_value("end") over (
            order by start
            rows between 0 preceding and unbounded following
        ) as end_period
    from example_table
)
select
    state,
    start,
    -- The window is computed over the rows that survive the where clause,
    -- i.e. the first row of each run, so the next surviving start is where
    -- this run ends. The last run has no successor and falls back to
    -- end_period instead of null.
    lead(start, 1, end_period) over (order by start) as "end"
from table_with_lags
-- keep only rows that open a new run; the first row has a null lag_state
where lag_state is null
    or state <> lag_state
order by start
输出:
state start stop
A 1 4
B 4 6
A 6 7
C 7 9
rows between 0 preceding and unbounded following 这一部分写起来有点冗长,因此也可以把逻辑反过来,改为如下写法:
-- Mirror image of the lag-based approach: keep the LAST row of each run of
-- equal states and rebuild its start from the previous kept row's "end".
-- This is a fragment: it continues a `with example_table(...) as (...),`
-- CTE list that defines example_table("state", "start", "end").
table_with_leads as (
    select
        state,
        start,
        "end",
        -- state of the next row (by start); null on the very last row
        lead(state) over (order by start) as lead_state,
        -- start of the first row overall, used as the default below
        first_value(start) over (order by start) as start_period
    from example_table
)
select
    state,
    -- The window is computed over the rows that survive the where clause,
    -- i.e. the last row of each run, so the previous surviving "end" is
    -- where this run starts. The first run has no predecessor and falls
    -- back to start_period instead of null.
    lag("end", 1, start_period) over (order by start) as start,
    "end"
-- must read from table_with_leads: lead_state and start_period exist only there
from table_with_leads
-- keep only rows that close a run; the last row has a null lead_state
where state <> lead_state or lead_state is null
order by start
原回答
以下方法可行,但在大规模数据上表现不佳(即使只用 10% 的数据子样本,我也会收到 "exceeded local memory limit" 错误):
-- Label every run of consecutive equal states with a group id, then
-- aggregate each group down to a single (state, start, stop) row.
with switches as (
    select
        *,
        -- true where the state differs from the previous row's (by start);
        -- the first row has no predecessor, so the null comparison is
        -- coalesced to false
        coalesce(
            state <> lag(state) over (order by start),
            false
        ) as switched
    from (
        values
            ('A', 1, 2), ('A', 2, 3), ('A', 3, 4),
            ('B', 4, 5), ('B', 5, 6),
            ('A', 6, 7),
            ('C', 7, 8), ('C', 8, 9)
    ) as example_table("state", "start", "stop")
),
groups as (
    select
        *,
        -- running count of switches seen so far: constant within a run and
        -- one higher after each state change, so it works as a group id
        sum(cast(switched as bigint)) over (order by start) as group_id
    from switches
)
select
    -- state is the same on every row of a group; min() just picks it
    min(state) as state,
    min(start) as start,
    max(stop) as stop
from groups
group by group_id
order by start;
-- state start stop
-- A 1 4
-- B 4 6
-- A 6 7
-- C 7 9
目前我的做法是先把 groups 暂存为一张 table,然后再单独从 groups 中 select,这样做成功了,似乎解决了 RAM 问题(这让我有些惊讶)。这看起来不太理想,但既然它能完成任务,我现在就先这样用着。