使用 LAG() 函数替代 BigQuery 中的 CONDITIONAL_TRUE_EVENT?
Alternative to CONDITIONAL_TRUE_EVENT in BigQuery, with a LAG() function?
Vertica 有一种非常好的操作类型:基于事件的 Window 操作,基本上可以让您识别事件何时发生。例如,每次给定的布尔表达式解析为真时,CONDITIONAL_TRUE_EVENT 都会增加一个计数器。
有什么方法可以使用 BigQuery 模拟此功能?请注意 CONDITIONAL_TRUE_EVENT 中有一个 LAG() 函数。
示例:
-- Vertica: the counter goes up by one each time the boolean expression is true,
-- i.e. whenever a row lies more than 7 days after the previous row of its
-- (zuid, sub_type) partition, taken in timestamp order.
CONDITIONAL_TRUE_EVENT(timestamp - LAG(timestamp) > '7 days')
OVER(PARTITION BY zuid, sub_type ORDER BY timestamp)
谢谢!
这个问题我已经研究过好几次了。
思路其实就是嵌套两个查询:
第一个查询(写成公用表表达式,即 CTE)引入一个标志列:当您关心的条件为真时取 1,否则取 0。
第二个查询在第一个查询的输出之上,对该标志列计算累计和(running sum)。
这种写法比 Vertica 版本(列在 BigQuery 版本之后)笨拙得多...
下面用我试验过的例子来说明:带有时间戳和油压测量值的传感器数据。我们想把各次“行程”区分开,而唯一的识别依据是:相邻两次“行程”之间存在超过 30 分钟的间隔。
BigQuery 版本 - 它适用于所有支持 LAG() OLAP 函数的 DBMS ...
-- BigQuery: emulate Vertica's CONDITIONAL_TRUE_EVENT in two steps:
--   1. flag every row that opens a new trip (chg = 1, otherwise 0);
--   2. take the running sum of that flag to obtain the trip number.
-- The two-step pattern is portable to any DBMS with LAG(); only
-- TIMESTAMP_ADD is BigQuery-specific.
WITH
-- input ...
oilpressure AS (
  -- BigQuery CTEs take no column list, so the first SELECT names the columns.
  SELECT 42 AS vid, TIMESTAMP '2020-10-01 17:00:00' AS ts, 25.356 AS psi
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:00:10', 35.124
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:00:20', 47.056
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:00:30', 45.225
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:00', 25.356
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:10', 35.124
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:20', 47.056
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:30', 45.225
)
,
with_chg_counter AS (
  SELECT
    CASE
      -- The first reading of a vid has no predecessor: it opens trip 1.
      -- (A year-0 sentinel default is not a valid BigQuery TIMESTAMP.)
      WHEN LAG(ts) OVER w IS NULL THEN 1
      -- A gap of more than 30 minutes to the previous reading opens the next trip.
      WHEN ts > TIMESTAMP_ADD(LAG(ts) OVER w, INTERVAL 30 MINUTE) THEN 1
      ELSE 0
    END AS chg
  , vid
  , ts
  , psi
  FROM oilpressure
  WINDOW w AS (PARTITION BY vid ORDER BY ts)
)
SELECT
  vid
  -- running sum of the change flags = trip number within each vid
, SUM(chg) OVER w AS tripid
, ts
, psi
FROM with_chg_counter
WINDOW w AS (PARTITION BY vid ORDER BY ts)
-- make the row order of the result deterministic
ORDER BY vid, ts
;
-- out vid|tripid|ts |psi
-- out 42| 1|2020-10-01 17:00:00|25.356
-- out 42| 1|2020-10-01 17:00:10|35.124
-- out 42| 1|2020-10-01 17:00:20|47.056
-- out 42| 1|2020-10-01 17:00:30|45.225
-- out 42| 2|2020-10-01 17:45:00|25.356
-- out 42| 2|2020-10-01 17:45:10|35.124
-- out 42| 2|2020-10-01 17:45:20|47.056
-- out 42| 2|2020-10-01 17:45:30|45.225
以及 Vertica 版本...
-- Vertica: number the trips in a single pass with the event-based window
-- function CONDITIONAL_TRUE_EVENT; no helper flag column is needed.
WITH
    -- inline sample readings
    oilpressure(vid,ts,psi) AS (
                  SELECT 42, TIMESTAMP '2020-10-01 17:00:00', 25.356
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:00:10', 35.124
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:00:20', 47.056
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:00:30', 45.225
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:00', 25.356
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:10', 35.124
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:20', 47.056
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:30', 45.225
    )
SELECT
    vid,
    -- The counter advances whenever the gap to the previous reading exceeds
    -- 30 minutes; the sentinel LAG default makes the first reading count too,
    -- so numbering starts at 1.
    CONDITIONAL_TRUE_EVENT(
        ts - LAG(ts,1,'0000-01-01') > '30 MINUTES'
    ) OVER (PARTITION BY vid ORDER BY ts) AS tripid,
    ts,
    psi
FROM oilpressure
;
-- out vid | tripid | ts | psi
-- out -----+--------+---------------------+--------
-- out 42 | 1 | 2020-10-01 17:00:00 | 25.356
-- out 42 | 1 | 2020-10-01 17:00:10 | 35.124
-- out 42 | 1 | 2020-10-01 17:00:20 | 47.056
-- out 42 | 1 | 2020-10-01 17:00:30 | 45.225
-- out 42 | 2 | 2020-10-01 17:45:00 | 25.356
-- out 42 | 2 | 2020-10-01 17:45:10 | 35.124
-- out 42 | 2 | 2020-10-01 17:45:20 | 47.056
-- out 42 | 2 | 2020-10-01 17:45:30 | 45.225
以下适用于 BigQuery
-- BigQuery emulation of
--   CONDITIONAL_TRUE_EVENT(timestamp - LAG(timestamp) > '7 days')
-- The inner query flags rows lying more than 7 days after the previous row of
-- the same (zuid, sub_type); the outer query counts the flags seen so far.
select
  zuid,
  sub_type,
  timestamp,
  countif(flag) over (partition by zuid, sub_type order by timestamp) as conditional_true_event
from (
  select
    zuid,
    sub_type,
    timestamp,
    -- Compare full timestamps, not calendar dates: a date-based check misses
    -- gaps such as 7 days 12 hours. The flag is NULL on the first row of a
    -- partition (no previous row), and countif ignores NULL.
    -- NOTE(review): assumes the column is of type TIMESTAMP; use datetime_add
    -- instead if it is a DATETIME.
    timestamp > timestamp_add(
      lag(timestamp) over (partition by zuid, sub_type order by timestamp),
      interval 7 day
    ) as flag
  from `project.dataset.table`
)
-- append "order by zuid, sub_type, timestamp" for a deterministic row order
Vertica 有一种非常好的操作类型:基于事件的 Window 操作,基本上可以让您识别事件何时发生。例如,每次给定的布尔表达式解析为真时,CONDITIONAL_TRUE_EVENT 都会增加一个计数器。
有什么方法可以使用 BigQuery 模拟此功能?请注意 CONDITIONAL_TRUE_EVENT 中有一个 LAG() 函数。
示例:
-- Vertica: the counter goes up by one each time the boolean expression is true,
-- i.e. whenever a row lies more than 7 days after the previous row of its
-- (zuid, sub_type) partition, taken in timestamp order.
CONDITIONAL_TRUE_EVENT(timestamp - LAG(timestamp) > '7 days')
OVER(PARTITION BY zuid, sub_type ORDER BY timestamp)
谢谢!
这个问题我已经研究过好几次了。
思路其实就是嵌套两个查询:
第一个查询(写成公用表表达式,即 CTE)引入一个标志列:当您关心的条件为真时取 1,否则取 0。第二个查询在第一个查询的输出之上,对该标志列计算累计和(running sum)。
这种写法比 Vertica 版本(列在 BigQuery 版本之后)笨拙得多...
下面用我试验过的例子来说明:带有时间戳和油压测量值的传感器数据。我们想把各次“行程”区分开,而唯一的识别依据是:相邻两次“行程”之间存在超过 30 分钟的间隔。
BigQuery 版本 - 它适用于所有支持 LAG() OLAP 函数的 DBMS ...
-- BigQuery: emulate Vertica's CONDITIONAL_TRUE_EVENT in two steps:
--   1. flag every row that opens a new trip (chg = 1, otherwise 0);
--   2. take the running sum of that flag to obtain the trip number.
-- The two-step pattern is portable to any DBMS with LAG(); only
-- TIMESTAMP_ADD is BigQuery-specific.
WITH
-- input ...
oilpressure AS (
  -- BigQuery CTEs take no column list, so the first SELECT names the columns.
  SELECT 42 AS vid, TIMESTAMP '2020-10-01 17:00:00' AS ts, 25.356 AS psi
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:00:10', 35.124
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:00:20', 47.056
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:00:30', 45.225
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:00', 25.356
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:10', 35.124
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:20', 47.056
  UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:30', 45.225
)
,
with_chg_counter AS (
  SELECT
    CASE
      -- The first reading of a vid has no predecessor: it opens trip 1.
      -- (A year-0 sentinel default is not a valid BigQuery TIMESTAMP.)
      WHEN LAG(ts) OVER w IS NULL THEN 1
      -- A gap of more than 30 minutes to the previous reading opens the next trip.
      WHEN ts > TIMESTAMP_ADD(LAG(ts) OVER w, INTERVAL 30 MINUTE) THEN 1
      ELSE 0
    END AS chg
  , vid
  , ts
  , psi
  FROM oilpressure
  WINDOW w AS (PARTITION BY vid ORDER BY ts)
)
SELECT
  vid
  -- running sum of the change flags = trip number within each vid
, SUM(chg) OVER w AS tripid
, ts
, psi
FROM with_chg_counter
WINDOW w AS (PARTITION BY vid ORDER BY ts)
-- make the row order of the result deterministic
ORDER BY vid, ts
;
-- out vid|tripid|ts |psi
-- out 42| 1|2020-10-01 17:00:00|25.356
-- out 42| 1|2020-10-01 17:00:10|35.124
-- out 42| 1|2020-10-01 17:00:20|47.056
-- out 42| 1|2020-10-01 17:00:30|45.225
-- out 42| 2|2020-10-01 17:45:00|25.356
-- out 42| 2|2020-10-01 17:45:10|35.124
-- out 42| 2|2020-10-01 17:45:20|47.056
-- out 42| 2|2020-10-01 17:45:30|45.225
以及 Vertica 版本...
-- Vertica: number the trips in a single pass with the event-based window
-- function CONDITIONAL_TRUE_EVENT; no helper flag column is needed.
WITH
    -- inline sample readings
    oilpressure(vid,ts,psi) AS (
                  SELECT 42, TIMESTAMP '2020-10-01 17:00:00', 25.356
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:00:10', 35.124
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:00:20', 47.056
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:00:30', 45.225
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:00', 25.356
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:10', 35.124
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:20', 47.056
        UNION ALL SELECT 42, TIMESTAMP '2020-10-01 17:45:30', 45.225
    )
SELECT
    vid,
    -- The counter advances whenever the gap to the previous reading exceeds
    -- 30 minutes; the sentinel LAG default makes the first reading count too,
    -- so numbering starts at 1.
    CONDITIONAL_TRUE_EVENT(
        ts - LAG(ts,1,'0000-01-01') > '30 MINUTES'
    ) OVER (PARTITION BY vid ORDER BY ts) AS tripid,
    ts,
    psi
FROM oilpressure
;
-- out vid | tripid | ts | psi
-- out -----+--------+---------------------+--------
-- out 42 | 1 | 2020-10-01 17:00:00 | 25.356
-- out 42 | 1 | 2020-10-01 17:00:10 | 35.124
-- out 42 | 1 | 2020-10-01 17:00:20 | 47.056
-- out 42 | 1 | 2020-10-01 17:00:30 | 45.225
-- out 42 | 2 | 2020-10-01 17:45:00 | 25.356
-- out 42 | 2 | 2020-10-01 17:45:10 | 35.124
-- out 42 | 2 | 2020-10-01 17:45:20 | 47.056
-- out 42 | 2 | 2020-10-01 17:45:30 | 45.225
以下适用于 BigQuery
-- BigQuery emulation of
--   CONDITIONAL_TRUE_EVENT(timestamp - LAG(timestamp) > '7 days')
-- The inner query flags rows lying more than 7 days after the previous row of
-- the same (zuid, sub_type); the outer query counts the flags seen so far.
select
  zuid,
  sub_type,
  timestamp,
  countif(flag) over (partition by zuid, sub_type order by timestamp) as conditional_true_event
from (
  select
    zuid,
    sub_type,
    timestamp,
    -- Compare full timestamps, not calendar dates: a date-based check misses
    -- gaps such as 7 days 12 hours. The flag is NULL on the first row of a
    -- partition (no previous row), and countif ignores NULL.
    -- NOTE(review): assumes the column is of type TIMESTAMP; use datetime_add
    -- instead if it is a DATETIME.
    timestamp > timestamp_add(
      lag(timestamp) over (partition by zuid, sub_type order by timestamp),
      interval 7 day
    ) as flag
  from `project.dataset.table`
)
-- append "order by zuid, sub_type, timestamp" for a deterministic row order