如何编写 SQL 查询生成不同时间戳的列总和的历史记录?
How to write an SQL query generating a history of the sums of a column at different timestamps?
在一张表中,我有一些事件记录,它们给出了具有不同 ID 的事物在不同事件时间戳上的值(x)。
-- Sample data for the question: each row is one observation of value x for a
-- thing (thing_id) at event_timestamp. Built from a typed inline array instead
-- of a chain of single-row SELECTs.
WITH
  events AS (
    SELECT
      thing_id,
      x,
      event_timestamp
    FROM
      UNNEST(ARRAY<STRUCT<thing_id INT64, x INT64, event_timestamp TIMESTAMP>>[
        (1, 1, TIMESTAMP('2021-12-01 00:00:00')),
        (1, 3, TIMESTAMP('2021-12-01 00:01:00')),
        (2, 5, TIMESTAMP('2021-12-01 00:02:00')),
        (1, 2, TIMESTAMP('2021-12-01 00:03:00')),
        (1, 0, TIMESTAMP('2021-12-01 00:04:00'))
      ])
  )
SELECT
  thing_id,
  x,
  event_timestamp
FROM
  events
+----------+---+---------------------+
| thing_id | x | event_timestamp |
+----------+---+---------------------+
| 1 | 1 | 2021-12-01 00:00:00 |
| 1 | 3 | 2021-12-01 00:01:00 |
| 2 | 5 | 2021-12-01 00:02:00 |
| 1 | 2 | 2021-12-01 00:03:00 |
| 1 | 0 | 2021-12-01 00:04:00 |
+----------+---+---------------------+
我想得到一份历史记录:对于每个已知的 event_timestamp,显示该时刻所有事物当前(即各自最新的)x 值的总和。
+---------------------+-------+
| state_timestamp | x_sum |
+---------------------+-------+
| 2021-12-01 00:00:00 | 1 |
| 2021-12-01 00:01:00 | 3 |
| 2021-12-01 00:02:00 | 8 |
| 2021-12-01 00:03:00 | 7 |
| 2021-12-01 00:04:00 | 5 |
+---------------------+-------+
所需的 SQL 语句是什么样的?我尝试过按 thing_id 分组、使用窗口函数,以及可能与所有 event_timestamp 做交叉连接等思路,但一直没能想出来。
找到 greatest-n-per-group 问题的解释后,我想我设法写了一个解决方案。它在给定的最小示例中运行良好,但我不能 100% 确定它在一般情况下是否完美运行。 (我想它可以简化。)
-- History of the summed "current" x over all things, one row per known
-- event timestamp (BigQuery dialect).
WITH
  -- Sample data: one observation of x per row.
  events AS (
    SELECT 1 AS thing_id, 1 AS x, TIMESTAMP('2021-12-01 00:00:00') AS event_timestamp
    UNION ALL
    SELECT 1, 3, TIMESTAMP('2021-12-01 00:01:00')
    UNION ALL
    SELECT 2, 5, TIMESTAMP('2021-12-01 00:02:00')
    UNION ALL
    SELECT 1, 2, TIMESTAMP('2021-12-01 00:03:00')
    UNION ALL
    SELECT 1, 0, TIMESTAMP('2021-12-01 00:04:00')
  ),
  -- Every instant at which the state is reported. DISTINCT matters: without it,
  -- events sharing a timestamp would duplicate the cross join below and their
  -- x values would be summed more than once.
  state_timestamps AS (
    SELECT DISTINCT
      event_timestamp AS state_timestamp
    FROM
      events
  ),
  -- For each (thing, state_timestamp), keep the most recent event at or before
  -- that instant. RANK (not ROW_NUMBER) keeps all rows tied on the latest
  -- event_timestamp, matching a MAX(event_timestamp) self-join.
  latest_states AS (
    SELECT
      thing_id,
      state_timestamp,
      event_timestamp,
      x
    FROM
      events
    CROSS JOIN
      state_timestamps
    WHERE
      event_timestamp <= state_timestamp
    QUALIFY
      RANK() OVER (
        PARTITION BY thing_id, state_timestamp
        ORDER BY event_timestamp DESC
      ) = 1
  ),
  -- Total of the current values at each reported instant.
  sum_states AS (
    SELECT
      state_timestamp,
      SUM(x) AS x_sum
    FROM
      latest_states
    GROUP BY
      state_timestamp
  )
SELECT
  state_timestamp,
  x_sum
FROM
  sum_states
-- Ordering belongs on the outermost query; an ORDER BY inside a CTE does not
-- guarantee the order of the final result.
ORDER BY
  state_timestamp
输出:
+---------------------+-------+
| state_timestamp | x_sum |
+---------------------+-------+
| 2021-12-01 00:00:00 | 1 |
| 2021-12-01 00:01:00 | 3 |
| 2021-12-01 00:02:00 | 8 |
| 2021-12-01 00:03:00 | 7 |
| 2021-12-01 00:04:00 | 5 |
+---------------------+-------+
考虑以下方法
-- Sum of the current x over all things at every known timestamp (BigQuery dialect).
-- Expects a relation named `events` (thing_id, x, event_timestamp) to be in scope,
-- e.g. the `events` CTE from the question placed directly above this query.
select
  event_timestamp,
  -- A thing with no non-null x so far contributes 0.
  sum(coalesce(current_x, 0)) as x_sum
from (
  select
    event_timestamp,
    -- Forward-fill: the most recent non-null x of this thing up to and
    -- including the current timestamp.
    last_value(x ignore nulls) over (
      partition by thing_id
      order by event_timestamp
      rows between unbounded preceding and current row
    ) as current_x
  from (
    -- Dense grid of every thing at every known timestamp; x is null where the
    -- thing has no event at that timestamp.
    select
      thing_id,
      event_timestamp,
      x
    from (select distinct thing_id from events)
    cross join (select distinct event_timestamp from events)
    left join events using (event_timestamp, thing_id)
  )
)
group by event_timestamp
-- Without an explicit ORDER BY the history comes back in undefined order.
order by event_timestamp
如果应用于您问题中的示例数据,输出与问题中期望的结果一致:各 event_timestamp 对应的 x_sum 依次为 1、3、8、7、5。
在一张表中,我有一些事件记录,它们给出了具有不同 ID 的事物在不同事件时间戳上的值(x)。
-- Sample data for the question: each row is one observation of value x for a
-- thing (thing_id) at event_timestamp. Built from a typed inline array instead
-- of a chain of single-row SELECTs.
WITH
  events AS (
    SELECT
      thing_id,
      x,
      event_timestamp
    FROM
      UNNEST(ARRAY<STRUCT<thing_id INT64, x INT64, event_timestamp TIMESTAMP>>[
        (1, 1, TIMESTAMP('2021-12-01 00:00:00')),
        (1, 3, TIMESTAMP('2021-12-01 00:01:00')),
        (2, 5, TIMESTAMP('2021-12-01 00:02:00')),
        (1, 2, TIMESTAMP('2021-12-01 00:03:00')),
        (1, 0, TIMESTAMP('2021-12-01 00:04:00'))
      ])
  )
SELECT
  thing_id,
  x,
  event_timestamp
FROM
  events
+----------+---+---------------------+
| thing_id | x | event_timestamp |
+----------+---+---------------------+
| 1 | 1 | 2021-12-01 00:00:00 |
| 1 | 3 | 2021-12-01 00:01:00 |
| 2 | 5 | 2021-12-01 00:02:00 |
| 1 | 2 | 2021-12-01 00:03:00 |
| 1 | 0 | 2021-12-01 00:04:00 |
+----------+---+---------------------+
我想得到一份历史记录:对于每个已知的 event_timestamp,显示该时刻所有事物当前(即各自最新的)x 值的总和。
+---------------------+-------+
| state_timestamp | x_sum |
+---------------------+-------+
| 2021-12-01 00:00:00 | 1 |
| 2021-12-01 00:01:00 | 3 |
| 2021-12-01 00:02:00 | 8 |
| 2021-12-01 00:03:00 | 7 |
| 2021-12-01 00:04:00 | 5 |
+---------------------+-------+
所需的 SQL 语句是什么样的?我尝试过按 thing_id 分组、使用窗口函数,以及可能与所有 event_timestamp 做交叉连接等思路,但一直没能想出来。
找到 greatest-n-per-group 问题的解释后,我想我设法写了一个解决方案。它在给定的最小示例中运行良好,但我不能 100% 确定它在一般情况下是否完美运行。 (我想它可以简化。)
-- History of the summed "current" x over all things, one row per known
-- event timestamp (BigQuery dialect).
WITH
  -- Sample data: one observation of x per row.
  events AS (
    SELECT 1 AS thing_id, 1 AS x, TIMESTAMP('2021-12-01 00:00:00') AS event_timestamp
    UNION ALL
    SELECT 1, 3, TIMESTAMP('2021-12-01 00:01:00')
    UNION ALL
    SELECT 2, 5, TIMESTAMP('2021-12-01 00:02:00')
    UNION ALL
    SELECT 1, 2, TIMESTAMP('2021-12-01 00:03:00')
    UNION ALL
    SELECT 1, 0, TIMESTAMP('2021-12-01 00:04:00')
  ),
  -- Every instant at which the state is reported. DISTINCT matters: without it,
  -- events sharing a timestamp would duplicate the cross join below and their
  -- x values would be summed more than once.
  state_timestamps AS (
    SELECT DISTINCT
      event_timestamp AS state_timestamp
    FROM
      events
  ),
  -- For each (thing, state_timestamp), keep the most recent event at or before
  -- that instant. RANK (not ROW_NUMBER) keeps all rows tied on the latest
  -- event_timestamp, matching a MAX(event_timestamp) self-join.
  latest_states AS (
    SELECT
      thing_id,
      state_timestamp,
      event_timestamp,
      x
    FROM
      events
    CROSS JOIN
      state_timestamps
    WHERE
      event_timestamp <= state_timestamp
    QUALIFY
      RANK() OVER (
        PARTITION BY thing_id, state_timestamp
        ORDER BY event_timestamp DESC
      ) = 1
  ),
  -- Total of the current values at each reported instant.
  sum_states AS (
    SELECT
      state_timestamp,
      SUM(x) AS x_sum
    FROM
      latest_states
    GROUP BY
      state_timestamp
  )
SELECT
  state_timestamp,
  x_sum
FROM
  sum_states
-- Ordering belongs on the outermost query; an ORDER BY inside a CTE does not
-- guarantee the order of the final result.
ORDER BY
  state_timestamp
输出:
+---------------------+-------+
| state_timestamp | x_sum |
+---------------------+-------+
| 2021-12-01 00:00:00 | 1 |
| 2021-12-01 00:01:00 | 3 |
| 2021-12-01 00:02:00 | 8 |
| 2021-12-01 00:03:00 | 7 |
| 2021-12-01 00:04:00 | 5 |
+---------------------+-------+
考虑以下方法
-- Sum of the current x over all things at every known timestamp (BigQuery dialect).
-- Expects a relation named `events` (thing_id, x, event_timestamp) to be in scope,
-- e.g. the `events` CTE from the question placed directly above this query.
select
  event_timestamp,
  -- A thing with no non-null x so far contributes 0.
  sum(coalesce(current_x, 0)) as x_sum
from (
  select
    event_timestamp,
    -- Forward-fill: the most recent non-null x of this thing up to and
    -- including the current timestamp.
    last_value(x ignore nulls) over (
      partition by thing_id
      order by event_timestamp
      rows between unbounded preceding and current row
    ) as current_x
  from (
    -- Dense grid of every thing at every known timestamp; x is null where the
    -- thing has no event at that timestamp.
    select
      thing_id,
      event_timestamp,
      x
    from (select distinct thing_id from events)
    cross join (select distinct event_timestamp from events)
    left join events using (event_timestamp, thing_id)
  )
)
group by event_timestamp
-- Without an explicit ORDER BY the history comes back in undefined order.
order by event_timestamp
如果应用于您问题中的示例数据,输出与问题中期望的结果一致:各 event_timestamp 对应的 x_sum 依次为 1、3、8、7、5。