BigQuery - 组合碎片事件

BigQuery - combine fragmented events

这是一个示例数据:

-- Sample data: one row per activity fragment.
-- datetime_start / datetime_end are ISO-8601 formatted strings; flag is 0 or 1.
-- Rows are combined with UNION ALL (the literals are distinct, so no de-duplication is needed).
create table activity as
select "2020-02-25T09:06:12" as datetime_start,  "2020-02-25T09:07:31" as datetime_end, 0 as flag union all
select "2020-02-25T09:16:08" as datetime_start,  "2020-02-25T09:17:31" as datetime_end, 0 as flag union all
select "2020-02-25T09:17:31" as datetime_start,  "2020-02-25T09:27:31" as datetime_end, 1 as flag union all
select "2020-02-25T09:27:31" as datetime_start,  "2020-02-25T09:32:41" as datetime_end, 1 as flag union all
select "2020-02-25T09:35:57" as datetime_start,  "2020-02-25T09:37:31" as datetime_end, 0 as flag union all
select "2020-02-25T09:49:23" as datetime_start,  "2020-02-25T09:51:16" as datetime_end, 0 as flag union all
select "2020-02-25T09:51:16" as datetime_start,  "2020-02-25T10:03:46" as datetime_end, 1 as flag union all
select "2020-02-25T10:03:46" as datetime_start,  "2020-02-25T10:05:57" as datetime_end, 1 as flag union all
select "2020-02-25T10:05:57" as datetime_start,  "2020-02-25T10:07:31" as datetime_end, 1 as flag union all
select "2020-02-25T10:07:31" as datetime_start,  "2020-02-25T10:10:22" as datetime_end, 1 as flag union all
select "2020-02-25T10:10:22" as datetime_start,  "2020-02-25T10:12:55" as datetime_end, 1 as flag union all
select "2020-02-25T10:12:55" as datetime_start,  "2020-02-25T10:20:17" as datetime_end, 1 as flag union all
select "2020-02-25T10:20:17" as datetime_start,  "2020-02-25T10:27:40" as datetime_end, 1 as flag union all
select "2020-02-25T10:27:40" as datetime_start,  "2020-02-25T10:39:51" as datetime_end, 1 as flag;

我想找一个根据 flag 列来划分 activity 块的查询。
如果某一行的 flag 为 1,那么从该行开始、直到 flag 变回 0 之前的所有连续行都需要合并成一个 activity 块。

以上示例应产生 6 个 activity 块。

  1. 2020-02-25T09:06:12 - 2020-02-25T09:07:31
  2. 2020-02-25T09:16:08 - 2020-02-25T09:17:31
  3. 2020-02-25T09:17:31 - 2020-02-25T09:32:41
  4. 2020-02-25T09:35:57 - 2020-02-25T09:37:31
  5. 2020-02-25T09:49:23 - 2020-02-25T09:51:16
  6. 2020-02-25T09:51:16 - 2020-02-25T10:39:51

这是"间隙与孤岛"(gaps-and-islands)问题的一个变体。下面是一种使用 lag() 和窗口累加和(window sum)来划分连续 flag = 1 分组的方法:

-- Gaps-and-islands over the `activity` sample table:
-- a row with flag = 1 whose previous row (by datetime_start) also has flag = 1
-- joins the previous row's block; every other row opens a new block.
with flagged as (
    -- Attach the previous row's flag; lag() yields NULL for the first row.
    select
        a.datetime_start,
        a.datetime_end,
        a.flag,
        lag(a.flag) over (order by a.datetime_start) as lag_flag
    from activity as a
),

numbered as (
    -- Running count of "block start" rows. When lag_flag is NULL the CASE
    -- condition is not true, so the first row always falls to ELSE 1.
    select
        f.datetime_start,
        f.datetime_end,
        f.flag,
        sum(case when f.flag = 1 and f.lag_flag = 1 then 0 else 1 end)
            over (order by f.datetime_start) as grp
    from flagged as f
)

-- Collapse each block to its earliest start and latest end.
select
    min(datetime_start) as datetime_stat,
    max(datetime_end) as datetime_end,
    flag
from numbered
group by flag, grp

这回答了问题的原始版本。

GMB 的答案可能有效,但它似乎是定制的,因为它硬编码了标志的值。我更喜欢更通用的方法:

-- Generic gaps-and-islands: a row opens a new block when it does not start
-- exactly where the previous row ended, or when its flag differs from the
-- previous row's flag. No specific flag value is hard-coded.
with activity as (
    -- Inline sample data (same rows as the question).
    select "2020-02-25T09:06:12" as datetime_start, "2020-02-25T09:07:31" as datetime_end, 0 as flag
    union all
    select "2020-02-25T09:16:08" as datetime_start, "2020-02-25T09:17:31" as datetime_end, 0 as flag
    union all
    select "2020-02-25T09:17:31" as datetime_start, "2020-02-25T09:27:31" as datetime_end, 1 as flag
    union all
    select "2020-02-25T09:27:31" as datetime_start, "2020-02-25T09:32:41" as datetime_end, 1 as flag
    union all
    select "2020-02-25T09:35:57" as datetime_start, "2020-02-25T09:37:31" as datetime_end, 0 as flag
    union all
    select "2020-02-25T09:49:23" as datetime_start, "2020-02-25T09:51:16" as datetime_end, 0 as flag
    union all
    select "2020-02-25T09:51:16" as datetime_start, "2020-02-25T10:03:46" as datetime_end, 1 as flag
    union all
    select "2020-02-25T10:03:46" as datetime_start, "2020-02-25T10:05:57" as datetime_end, 1 as flag
    union all
    select "2020-02-25T10:05:57" as datetime_start, "2020-02-25T10:07:31" as datetime_end, 1 as flag
    union all
    select "2020-02-25T10:07:31" as datetime_start, "2020-02-25T10:10:22" as datetime_end, 1 as flag
    union all
    select "2020-02-25T10:10:22" as datetime_start, "2020-02-25T10:12:55" as datetime_end, 1 as flag
    union all
    select "2020-02-25T10:12:55" as datetime_start, "2020-02-25T10:20:17" as datetime_end, 1 as flag
    union all
    select "2020-02-25T10:20:17" as datetime_start, "2020-02-25T10:27:40" as datetime_end, 1 as flag
    union all
    select "2020-02-25T10:27:40" as datetime_start, "2020-02-25T10:39:51" as datetime_end, 1 as flag
),

with_previous as (
    -- Carry the previous row's flag and end time onto each row
    -- (both are NULL for the first row).
    select
        act.*,
        lag(act.flag) over by_start as flag_before,
        lag(act.datetime_end) over by_start as end_before
    from activity as act
    window by_start as (order by act.datetime_start)
),

islands as (
    -- Running count of block boundaries. countif only counts TRUE, so the
    -- first row (NULL comparisons) does not increment the counter.
    select
        p.*,
        countif(
            p.flag_before <> p.flag
            or p.datetime_start <> p.end_before
        ) over (order by p.datetime_start) as island_id
    from with_previous as p
)

-- One output row per block: earliest start, latest end, and the block's flag.
select
    min(datetime_start) as datetime_stat,
    max(datetime_end) as datetime_end,
    flag
from islands
group by flag, island_id