BigQuery - 组合碎片事件
BigQuery - combine fragmented events
这是一个示例数据:
-- Sample data: one row per activity fragment.
-- datetime_start / datetime_end are ISO-8601 strings; flag is 0 or 1.
create table activity as
select "2020-02-25T09:06:12" as datetime_start, "2020-02-25T09:07:31" as datetime_end, 0 as flag union all
select "2020-02-25T09:16:08" as datetime_start, "2020-02-25T09:17:31" as datetime_end, 0 as flag union all
select "2020-02-25T09:17:31" as datetime_start, "2020-02-25T09:27:31" as datetime_end, 1 as flag union all
select "2020-02-25T09:27:31" as datetime_start, "2020-02-25T09:32:41" as datetime_end, 1 as flag union all
select "2020-02-25T09:35:57" as datetime_start, "2020-02-25T09:37:31" as datetime_end, 0 as flag union all
select "2020-02-25T09:49:23" as datetime_start, "2020-02-25T09:51:16" as datetime_end, 0 as flag union all
select "2020-02-25T09:51:16" as datetime_start, "2020-02-25T10:03:46" as datetime_end, 1 as flag union all
select "2020-02-25T10:03:46" as datetime_start, "2020-02-25T10:05:57" as datetime_end, 1 as flag union all
select "2020-02-25T10:05:57" as datetime_start, "2020-02-25T10:07:31" as datetime_end, 1 as flag union all
select "2020-02-25T10:07:31" as datetime_start, "2020-02-25T10:10:22" as datetime_end, 1 as flag union all
select "2020-02-25T10:10:22" as datetime_start, "2020-02-25T10:12:55" as datetime_end, 1 as flag union all
select "2020-02-25T10:12:55" as datetime_start, "2020-02-25T10:20:17" as datetime_end, 1 as flag union all
select "2020-02-25T10:20:17" as datetime_start, "2020-02-25T10:27:40" as datetime_end, 1 as flag union all
select "2020-02-25T10:27:40" as datetime_start, "2020-02-25T10:39:51" as datetime_end, 1 as flag;
我正在寻找一个能根据标志列计算 activity 块的查询。
如果某一行的标志为 1,那么从该行开始、直到标志变回 0 之前的所有行都需要合并成一个 activity 块。
以上示例应产生 6 个 activity 块。
- 2020-02-25T09:06:12 - 2020-02-25T09:07:31
- 2020-02-25T09:16:08 - 2020-02-25T09:17:31
- 2020-02-25T09:17:31 - 2020-02-25T09:32:41
- 2020-02-25T09:35:57 - 2020-02-25T09:37:31
- 2020-02-25T09:49:23 - 2020-02-25T09:51:16
- 2020-02-25T09:51:16 - 2020-02-25T10:39:51
这是“缺口与孤岛”(gaps and islands)问题的一个变体。下面的方法使用 lag()
和窗口求和(window sum)来划分由连续的 1 组成的分组:
-- Gaps-and-islands: collapse each run of consecutive flag = 1 rows into one
-- activity block; every other row stays a block of its own.
with flagged as (
    -- Attach the previous row's flag (null on the first row) in start-time order.
    select
        a.*,
        lag(a.flag) over (order by a.datetime_start) as lag_flag
    from activity as a
),
grouped as (
    -- Running count of block starts: a row continues the current block only
    -- when it and the previous row both have flag = 1.
    select
        f.*,
        sum(case when f.flag = 1 and f.lag_flag = 1 then 0 else 1 end)
            over (order by f.datetime_start) as grp
    from flagged as f
)
select
    min(datetime_start) as datetime_stat,  -- alias spelling kept from the original query
    max(datetime_end) as datetime_end,
    flag
from grouped
group by flag, grp
-- Without an explicit order by, the order of the returned blocks is not guaranteed.
order by datetime_stat
这回答了问题的原始版本。
GMB 的答案可能有效,但它似乎是定制的,因为它硬编码了标志的值。我更喜欢更通用的方法:
-- Self-contained version: inline sample data, then merge rows into blocks
-- without hard-coding any flag value.
with activity as (
    select "2020-02-25T09:06:12" as datetime_start, "2020-02-25T09:07:31" as datetime_end, 0 as flag union all
    select "2020-02-25T09:16:08" as datetime_start, "2020-02-25T09:17:31" as datetime_end, 0 as flag union all
    select "2020-02-25T09:17:31" as datetime_start, "2020-02-25T09:27:31" as datetime_end, 1 as flag union all
    select "2020-02-25T09:27:31" as datetime_start, "2020-02-25T09:32:41" as datetime_end, 1 as flag union all
    select "2020-02-25T09:35:57" as datetime_start, "2020-02-25T09:37:31" as datetime_end, 0 as flag union all
    select "2020-02-25T09:49:23" as datetime_start, "2020-02-25T09:51:16" as datetime_end, 0 as flag union all
    select "2020-02-25T09:51:16" as datetime_start, "2020-02-25T10:03:46" as datetime_end, 1 as flag union all
    select "2020-02-25T10:03:46" as datetime_start, "2020-02-25T10:05:57" as datetime_end, 1 as flag union all
    select "2020-02-25T10:05:57" as datetime_start, "2020-02-25T10:07:31" as datetime_end, 1 as flag union all
    select "2020-02-25T10:07:31" as datetime_start, "2020-02-25T10:10:22" as datetime_end, 1 as flag union all
    select "2020-02-25T10:10:22" as datetime_start, "2020-02-25T10:12:55" as datetime_end, 1 as flag union all
    select "2020-02-25T10:12:55" as datetime_start, "2020-02-25T10:20:17" as datetime_end, 1 as flag union all
    select "2020-02-25T10:20:17" as datetime_start, "2020-02-25T10:27:40" as datetime_end, 1 as flag union all
    select "2020-02-25T10:27:40" as datetime_start, "2020-02-25T10:39:51" as datetime_end, 1 as flag
),
with_prev as (
    -- Previous row's flag and end time in start-time order (null on the first row).
    select
        a.*,
        lag(a.flag) over (order by a.datetime_start) as prev_flag,
        lag(a.datetime_end) over (order by a.datetime_start) as prev_datetime_end
    from activity as a
),
grouped as (
    -- A new block starts when there is a time gap from the previous row or the
    -- flag changes; the running count of such starts is the block id. On the
    -- first row both comparisons are null, which countif does not count.
    select
        p.*,
        countif(
            p.datetime_start <> p.prev_datetime_end
            or p.prev_flag <> p.flag
        ) over (order by p.datetime_start) as grp
    from with_prev as p
)
select
    min(datetime_start) as datetime_stat,  -- alias spelling kept from the original query
    max(datetime_end) as datetime_end,
    flag
from grouped
group by flag, grp
-- Without an explicit order by, the order of the returned blocks is not guaranteed.
order by datetime_stat
这是一个示例数据:
-- Sample data: one row per activity fragment.
-- datetime_start / datetime_end are ISO-8601 strings; flag is 0 or 1.
create table activity as
select "2020-02-25T09:06:12" as datetime_start, "2020-02-25T09:07:31" as datetime_end, 0 as flag union all
select "2020-02-25T09:16:08" as datetime_start, "2020-02-25T09:17:31" as datetime_end, 0 as flag union all
select "2020-02-25T09:17:31" as datetime_start, "2020-02-25T09:27:31" as datetime_end, 1 as flag union all
select "2020-02-25T09:27:31" as datetime_start, "2020-02-25T09:32:41" as datetime_end, 1 as flag union all
select "2020-02-25T09:35:57" as datetime_start, "2020-02-25T09:37:31" as datetime_end, 0 as flag union all
select "2020-02-25T09:49:23" as datetime_start, "2020-02-25T09:51:16" as datetime_end, 0 as flag union all
select "2020-02-25T09:51:16" as datetime_start, "2020-02-25T10:03:46" as datetime_end, 1 as flag union all
select "2020-02-25T10:03:46" as datetime_start, "2020-02-25T10:05:57" as datetime_end, 1 as flag union all
select "2020-02-25T10:05:57" as datetime_start, "2020-02-25T10:07:31" as datetime_end, 1 as flag union all
select "2020-02-25T10:07:31" as datetime_start, "2020-02-25T10:10:22" as datetime_end, 1 as flag union all
select "2020-02-25T10:10:22" as datetime_start, "2020-02-25T10:12:55" as datetime_end, 1 as flag union all
select "2020-02-25T10:12:55" as datetime_start, "2020-02-25T10:20:17" as datetime_end, 1 as flag union all
select "2020-02-25T10:20:17" as datetime_start, "2020-02-25T10:27:40" as datetime_end, 1 as flag union all
select "2020-02-25T10:27:40" as datetime_start, "2020-02-25T10:39:51" as datetime_end, 1 as flag;
我正在寻找一个能根据标志列计算 activity 块的查询。
如果某一行的标志为 1,那么从该行开始、直到标志变回 0 之前的所有行都需要合并成一个 activity 块。
以上示例应产生 6 个 activity 块。
- 2020-02-25T09:06:12 - 2020-02-25T09:07:31
- 2020-02-25T09:16:08 - 2020-02-25T09:17:31
- 2020-02-25T09:17:31 - 2020-02-25T09:32:41
- 2020-02-25T09:35:57 - 2020-02-25T09:37:31
- 2020-02-25T09:49:23 - 2020-02-25T09:51:16
- 2020-02-25T09:51:16 - 2020-02-25T10:39:51
这是“缺口与孤岛”(gaps and islands)问题的一个变体。下面的方法使用 lag()
和窗口求和(window sum)来划分由连续的 1 组成的分组:
-- Gaps-and-islands: collapse each run of consecutive flag = 1 rows into one
-- activity block; every other row stays a block of its own.
with flagged as (
    -- Attach the previous row's flag (null on the first row) in start-time order.
    select
        a.*,
        lag(a.flag) over (order by a.datetime_start) as lag_flag
    from activity as a
),
grouped as (
    -- Running count of block starts: a row continues the current block only
    -- when it and the previous row both have flag = 1.
    select
        f.*,
        sum(case when f.flag = 1 and f.lag_flag = 1 then 0 else 1 end)
            over (order by f.datetime_start) as grp
    from flagged as f
)
select
    min(datetime_start) as datetime_stat,  -- alias spelling kept from the original query
    max(datetime_end) as datetime_end,
    flag
from grouped
group by flag, grp
-- Without an explicit order by, the order of the returned blocks is not guaranteed.
order by datetime_stat
这回答了问题的原始版本。
GMB 的答案可能有效,但它似乎是定制的,因为它硬编码了标志的值。我更喜欢更通用的方法:
-- Self-contained version: inline sample data, then merge rows into blocks
-- without hard-coding any flag value.
with activity as (
    select "2020-02-25T09:06:12" as datetime_start, "2020-02-25T09:07:31" as datetime_end, 0 as flag union all
    select "2020-02-25T09:16:08" as datetime_start, "2020-02-25T09:17:31" as datetime_end, 0 as flag union all
    select "2020-02-25T09:17:31" as datetime_start, "2020-02-25T09:27:31" as datetime_end, 1 as flag union all
    select "2020-02-25T09:27:31" as datetime_start, "2020-02-25T09:32:41" as datetime_end, 1 as flag union all
    select "2020-02-25T09:35:57" as datetime_start, "2020-02-25T09:37:31" as datetime_end, 0 as flag union all
    select "2020-02-25T09:49:23" as datetime_start, "2020-02-25T09:51:16" as datetime_end, 0 as flag union all
    select "2020-02-25T09:51:16" as datetime_start, "2020-02-25T10:03:46" as datetime_end, 1 as flag union all
    select "2020-02-25T10:03:46" as datetime_start, "2020-02-25T10:05:57" as datetime_end, 1 as flag union all
    select "2020-02-25T10:05:57" as datetime_start, "2020-02-25T10:07:31" as datetime_end, 1 as flag union all
    select "2020-02-25T10:07:31" as datetime_start, "2020-02-25T10:10:22" as datetime_end, 1 as flag union all
    select "2020-02-25T10:10:22" as datetime_start, "2020-02-25T10:12:55" as datetime_end, 1 as flag union all
    select "2020-02-25T10:12:55" as datetime_start, "2020-02-25T10:20:17" as datetime_end, 1 as flag union all
    select "2020-02-25T10:20:17" as datetime_start, "2020-02-25T10:27:40" as datetime_end, 1 as flag union all
    select "2020-02-25T10:27:40" as datetime_start, "2020-02-25T10:39:51" as datetime_end, 1 as flag
),
with_prev as (
    -- Previous row's flag and end time in start-time order (null on the first row).
    select
        a.*,
        lag(a.flag) over (order by a.datetime_start) as prev_flag,
        lag(a.datetime_end) over (order by a.datetime_start) as prev_datetime_end
    from activity as a
),
grouped as (
    -- A new block starts when there is a time gap from the previous row or the
    -- flag changes; the running count of such starts is the block id. On the
    -- first row both comparisons are null, which countif does not count.
    select
        p.*,
        countif(
            p.datetime_start <> p.prev_datetime_end
            or p.prev_flag <> p.flag
        ) over (order by p.datetime_start) as grp
    from with_prev as p
)
select
    min(datetime_start) as datetime_stat,  -- alias spelling kept from the original query
    max(datetime_end) as datetime_end,
    flag
from grouped
group by flag, grp
-- Without an explicit order by, the order of the returned blocks is not guaranteed.
order by datetime_stat