在 BigQuery 中使用 LEAD
Using LEAD in BigQuery
假设我的 table 结构是这样的
我计划按(USER 和 SEQUENCE)对其进行分组,并获取下一个序列的 LEAD 时间戳。这是我正在寻找的输出
如果可能的话,我可以不用 JOIN 使用 LEAD 函数来解决这个问题吗?
不确定bigquery,但一般SQL会写成:
-- For each user, pair every sequence with the latest timestamp of the
-- following sequence (NULL for the user's last sequence).
-- table, user and timestamp are placeholder names; they are reserved words
-- in some dialects and may need quoting or renaming.
select
    user,
    sequence,
    lead(max_timestamp) over (partition by user order by sequence) as timestamp
from (
    -- latest timestamp per (user, sequence)
    select
        user,
        sequence,
        max(timestamp) as max_timestamp
    from table
    group by user, sequence
) per_sequence;
请注意保留字,例如 table、user、timestamp 等
编辑:是的,忘了这个答案吧,对所需的输出不够关注。米哈伊尔答对了!
以下适用于 BigQuery 标准 SQL
下面给出两种方案:先是使用 JOIN 的版本(仅用于验证我对预期逻辑的理解/逆向推导是否正确),然后是不使用 JOIN 的版本(注意:我使用 ts 而不是 timestamp 作为字段名)
Using JOIN
#standardSQL
-- For every (user, sequence) group, return the earliest ts in the group with
-- sequence + 1 that is not earlier than this group's latest ts.
-- Groups with no such row are kept and get a NULL ts.
SELECT
  a.user,
  a.sequence,
  MIN(b.ts) AS ts
FROM (
  -- Latest ts per (user, sequence).
  SELECT user, sequence, MAX(ts) AS max_ts
  FROM `project.dataset.table`
  GROUP BY user, sequence
) AS a
LEFT JOIN `project.dataset.table` AS b
  ON a.user = b.user
  AND b.sequence = a.sequence + 1
  -- This predicate must live in ON, not WHERE: as a WHERE filter it removes
  -- the whole group when the next sequence exists but all of its rows are
  -- earlier than max_ts. In ON, such a group survives the LEFT JOIN with a
  -- NULL ts.
  AND b.ts >= a.max_ts
-- Qualified so grouping does not depend on implicit select-list aliases.
GROUP BY a.user, a.sequence
-- Optional, for deterministic output order: ORDER BY user, sequence
JOIN-less version
#standardSQL
-- JOIN-free variant: each (user, sequence) group is given the ts array of the
-- same user's next sequence via LEAD, then the smallest element later than
-- the group's own latest ts is picked from that array.
SELECT
  user,
  sequence,
  (
    -- Smallest ts of the next sequence that is strictly later than max_ts;
    -- NULL when the array is NULL or holds no such element.
    SELECT MIN(next_ts)
    FROM UNNEST(arr_ts) AS next_ts
    WHERE next_ts > max_ts
  ) AS ts
FROM (
  SELECT
    user,
    sequence,
    max_ts,
    -- Swap the group's own array for the one of the following sequence.
    LEAD(arr_ts) OVER (PARTITION BY user ORDER BY sequence) AS arr_ts
  FROM (
    -- Latest ts and the sorted list of all ts per (user, sequence).
    SELECT
      user,
      sequence,
      MAX(ts) AS max_ts,
      ARRAY_AGG(ts ORDER BY ts) AS arr_ts
    FROM `project.dataset.table`
    GROUP BY user, sequence
  )
)
-- ORDER BY user, sequence
以上两个版本都可以使用以下虚拟数据进行测试/试验
-- Sample rows standing in for `project.dataset.table`; this CTE is meant to
-- be placed in front of a query that reads from that table.
WITH `project.dataset.table` AS (
  SELECT user, sequence, ts
  FROM UNNEST([
    STRUCT('user1' AS user, 2 AS sequence, 'T1' AS ts),
    ('user1', 2, 'T2'),
    ('user1', 1, 'T3'),
    ('user1', 1, 'T4'),
    ('user1', 3, 'T5'),
    ('user1', 2, 'T6'),
    ('user1', 3, 'T7'),
    ('user1', 3, 'T8')
  ])
)
两个版本都返回以下结果:
user sequence ts
user1 1 T6
user1 2 T7
user1 3 null
假设我的 table 结构是这样的
我计划按(USER 和 SEQUENCE)对其进行分组,并获取下一个序列的 LEAD 时间戳。这是我正在寻找的输出
如果可能的话,我可以不用 JOIN 使用 LEAD 函数来解决这个问题吗?
不确定bigquery,但一般SQL会写成:
-- For each user, pair every sequence with the latest timestamp of the
-- following sequence (NULL for the user's last sequence).
-- table, user and timestamp are placeholder names; they are reserved words
-- in some dialects and may need quoting or renaming.
select
    user,
    sequence,
    lead(max_timestamp) over (partition by user order by sequence) as timestamp
from (
    -- latest timestamp per (user, sequence)
    select
        user,
        sequence,
        max(timestamp) as max_timestamp
    from table
    group by user, sequence
) per_sequence;
请注意保留字,例如 table、user、timestamp 等
编辑:是的,忘了这个答案吧,对所需的输出不够关注。米哈伊尔答对了!
以下适用于 BigQuery 标准 SQL
下面给出两种方案:先是使用 JOIN 的版本(仅用于验证我对预期逻辑的理解/逆向推导是否正确),然后是不使用 JOIN 的版本(注意:我使用 ts 而不是 timestamp 作为字段名)
Using JOIN
#standardSQL
-- For every (user, sequence) group, return the earliest ts in the group with
-- sequence + 1 that is not earlier than this group's latest ts.
-- Groups with no such row are kept and get a NULL ts.
SELECT
  a.user,
  a.sequence,
  MIN(b.ts) AS ts
FROM (
  -- Latest ts per (user, sequence).
  SELECT user, sequence, MAX(ts) AS max_ts
  FROM `project.dataset.table`
  GROUP BY user, sequence
) AS a
LEFT JOIN `project.dataset.table` AS b
  ON a.user = b.user
  AND b.sequence = a.sequence + 1
  -- This predicate must live in ON, not WHERE: as a WHERE filter it removes
  -- the whole group when the next sequence exists but all of its rows are
  -- earlier than max_ts. In ON, such a group survives the LEFT JOIN with a
  -- NULL ts.
  AND b.ts >= a.max_ts
-- Qualified so grouping does not depend on implicit select-list aliases.
GROUP BY a.user, a.sequence
-- Optional, for deterministic output order: ORDER BY user, sequence
JOIN-less version
#standardSQL
-- JOIN-free variant: each (user, sequence) group is given the ts array of the
-- same user's next sequence via LEAD, then the smallest element later than
-- the group's own latest ts is picked from that array.
SELECT
  user,
  sequence,
  (
    -- Smallest ts of the next sequence that is strictly later than max_ts;
    -- NULL when the array is NULL or holds no such element.
    SELECT MIN(next_ts)
    FROM UNNEST(arr_ts) AS next_ts
    WHERE next_ts > max_ts
  ) AS ts
FROM (
  SELECT
    user,
    sequence,
    max_ts,
    -- Swap the group's own array for the one of the following sequence.
    LEAD(arr_ts) OVER (PARTITION BY user ORDER BY sequence) AS arr_ts
  FROM (
    -- Latest ts and the sorted list of all ts per (user, sequence).
    SELECT
      user,
      sequence,
      MAX(ts) AS max_ts,
      ARRAY_AGG(ts ORDER BY ts) AS arr_ts
    FROM `project.dataset.table`
    GROUP BY user, sequence
  )
)
-- ORDER BY user, sequence
以上两个版本都可以使用以下虚拟数据进行测试/试验
-- Sample rows standing in for `project.dataset.table`; this CTE is meant to
-- be placed in front of a query that reads from that table.
WITH `project.dataset.table` AS (
  SELECT user, sequence, ts
  FROM UNNEST([
    STRUCT('user1' AS user, 2 AS sequence, 'T1' AS ts),
    ('user1', 2, 'T2'),
    ('user1', 1, 'T3'),
    ('user1', 1, 'T4'),
    ('user1', 3, 'T5'),
    ('user1', 2, 'T6'),
    ('user1', 3, 'T7'),
    ('user1', 3, 'T8')
  ])
)
两个版本都返回以下结果:
user sequence ts
user1 1 T6
user1 2 T7
user1 3 null