SQL 基于时间序列数据的查询
SQL query based on time series data
我有 2 个 table 如下所示:
-- Mapping table: each row links one child_id to the parent_id it belongs to.
CREATE TABLE parent_child (
    parent_id INT NOT NULL,
    child_id  INT NOT NULL
);
-- Seed the parent -> child mapping: parents 117722 and 104151 have four
-- children each, parent 5316 has five.
INSERT INTO parent_child (parent_id, child_id)
          SELECT 117722, 273215
UNION ALL SELECT 117722, 117936
UNION ALL SELECT 117722, 117873
UNION ALL SELECT 117722, 123305
UNION ALL SELECT 104151, 240006
UNION ALL SELECT 104151, 240005
UNION ALL SELECT 104151, 239415
UNION ALL SELECT 104151, 239414
UNION ALL SELECT 5316, 118310
UNION ALL SELECT 5316, 130627
UNION ALL SELECT 5316, 298564
UNION ALL SELECT 5316, 118311
UNION ALL SELECT 5316, 118312;
-- Time-series readings: one reading (value) of a child at a point in time (tstamp).
-- value has no NOT NULL constraint, so a reading may be NULL.
-- The trailing comma after the last column was removed: it is a syntax error
-- in most dialects.
CREATE TABLE child (
    child_id INT      NOT NULL,
    tstamp   DATETIME NOT NULL,
    value    FLOAT
);
-- Sample readings: two consecutive 5-minute data points (00:00 and 00:05 on
-- 2021-07-14) for every child, grouped below by the parent the child belongs to.
-- Data point 2021-07-14 00:00 -- children of parent 117722
INSERT INTO child (child_id, tstamp, value) VALUES (273215, '2021-07-14 00:00:00.000000', 29);
INSERT INTO child (child_id, tstamp, value) VALUES (117936, '2021-07-14 00:00:00.000000', 52);
INSERT INTO child (child_id, tstamp, value) VALUES (117873, '2021-07-14 00:00:00.000000', 51);
INSERT INTO child (child_id, tstamp, value) VALUES (123305, '2021-07-14 00:00:00.000000', 31);
-- Data point 2021-07-14 00:00 -- children of parent 104151
INSERT INTO child (child_id, tstamp, value) VALUES (240006, '2021-07-14 00:00:00.000000', 37);
INSERT INTO child (child_id, tstamp, value) VALUES (240005, '2021-07-14 00:00:00.000000', 88);
INSERT INTO child (child_id, tstamp, value) VALUES (239415, '2021-07-14 00:00:00.000000', 29);
INSERT INTO child (child_id, tstamp, value) VALUES (239414, '2021-07-14 00:00:00.000000', 19);
-- Data point 2021-07-14 00:00 -- children of parent 5316
INSERT INTO child (child_id, tstamp, value) VALUES (118310, '2021-07-14 00:00:00.000000', 42);
INSERT INTO child (child_id, tstamp, value) VALUES (130627, '2021-07-14 00:00:00.000000', 11);
INSERT INTO child (child_id, tstamp, value) VALUES (298564, '2021-07-14 00:00:00.000000', 36);
INSERT INTO child (child_id, tstamp, value) VALUES (118311, '2021-07-14 00:00:00.000000', 22);
INSERT INTO child (child_id, tstamp, value) VALUES (118312, '2021-07-14 00:00:00.000000', 9);
-- Data point 2021-07-14 00:05 -- children of parent 117722
INSERT INTO child (child_id, tstamp, value) VALUES (273215, '2021-07-14 00:05:00.000000', 72);
INSERT INTO child (child_id, tstamp, value) VALUES (117936, '2021-07-14 00:05:00.000000', 99);
INSERT INTO child (child_id, tstamp, value) VALUES (117873, '2021-07-14 00:05:00.000000', 13);
INSERT INTO child (child_id, tstamp, value) VALUES (123305, '2021-07-14 00:05:00.000000', 24);
-- Data point 2021-07-14 00:05 -- children of parent 104151
INSERT INTO child (child_id, tstamp, value) VALUES (240006, '2021-07-14 00:05:00.000000', 65);
INSERT INTO child (child_id, tstamp, value) VALUES (240005, '2021-07-14 00:05:00.000000', 63);
INSERT INTO child (child_id, tstamp, value) VALUES (239415, '2021-07-14 00:05:00.000000', 23);
INSERT INTO child (child_id, tstamp, value) VALUES (239414, '2021-07-14 00:05:00.000000', 15);
-- Data point 2021-07-14 00:05 -- children of parent 5316
INSERT INTO child (child_id, tstamp, value) VALUES (118310, '2021-07-14 00:05:00.000000', 19);
INSERT INTO child (child_id, tstamp, value) VALUES (130627, '2021-07-14 00:05:00.000000', 22);
INSERT INTO child (child_id, tstamp, value) VALUES (298564, '2021-07-14 00:05:00.000000', 47);
INSERT INTO child (child_id, tstamp, value) VALUES (118311, '2021-07-14 00:05:00.000000', 54);
INSERT INTO child (child_id, tstamp, value) VALUES (118312, '2021-07-14 00:05:00.000000', 12);
child table 中的数据对每个 child 每 5 分钟重复一次。也就是说,对于 parent 的每个 child,将有 288 个数据点。这将在数据点处以不同(或相同)的值进一步重复每一天。
问题:
(1) 找出 parent_id
、date_when_count_of_value_above_30_more_than_12_times_in_a_day
、count_of_values_above_30_each_day
其中 parent 的所有 children 的值在一天内超过 30 的次数多于 12 次,并且每周至少有 3 天满足该条件。数据点不必是连续的。换句话说,如果在某个数据点(例如 2021-07-14 00:00:00.000000)上,parent 的所有 children 的 MAX(value)
大于 30,那么这将被计为该日期 parent 的一次出现。
(2) 找出 parent_id
、latest_datetime_in_the_week
、max_value_across_all_children_in_a_week
其中 parent 的所有 children 中的 value
是最大值和最大值的日期时间。如果多个日期时间具有相同的最大值,则选择最新的日期时间。
如果这两个查询可以合并为一个查询,那正是我想要的。否则,也可以写成 2 个不同的查询。如果是单个查询,那么问题 (2) 的输出会在问题 (1) 的每一行上重复出现,这是可以接受的。
输入:
parent_id
的列表,例如,WHERE parent_id IN (117722, 5316)
示例输出(如果将 2 个查询合并为一个;列名可以是任何名称,为了简洁起见,我使用长名称):
parent_id | date_when_count_of_value_above_30_more_than_12_times_in_a_day | count_of_values_above_30_each_day | max_value_across_all_children_in_a_week | latest_datetime_in_the_week
117722 | 2021-07-14 | 13 | 99 | 2021-07-09 16:15:00.000000
117722 | 2021-07-11 | 28 | 99 | 2021-07-09 16:15:00.000000
104151 | 2021-07-14 | 19 | 65 | 2021-07-11 18:30:00.000000
104151 | 2021-07-13 | 27 | 65 | 2021-07-11 18:30:00.000000
104151 | 2021-07-11 | 36 | 65 | 2021-07-11 18:30:00.000000
以上只是一个示例输出。当然,如果某个 parent_id 在一周内满足条件的天数少于 3 天,它将不会出现在输出中。
对于 (1):
注意:为了便于阅读,我假设 parent_id 已被移到 child table 中。
-- Lists every (parent, day) on which the parent had more than 12 data points
-- above 30, keeping only weeks in which the parent had at least 3 such days.
-- Assumes child carries a parent_id column (the sample schema keeps it in
-- parent_child); join parent_child to child first if it does not.
WITH BYDAY AS
(
    -- One row per parent and calendar day.
    -- COUNT(DISTINCT tstamp) counts each data point once, even when several
    -- children of the same parent exceed 30 at that timestamp. That equals
    -- the number of timestamps whose MAX(value) across children is above 30.
    SELECT
        COUNT(DISTINCT tstamp) AS CountHiValsOnDay,
        parent_id,
        CAST(tstamp AS date) AS tsDate,
        DATEPART(week, CAST(tstamp AS date)) AS TheWeek,
        YEAR(CAST(tstamp AS date)) AS TheYear
    FROM child
    WHERE value > 30
    GROUP BY parent_id, CAST(tstamp AS date)
    HAVING COUNT(DISTINCT tstamp) > 12
)
SELECT
    bd1.parent_id,
    bd1.tsDate,
    bd1.CountHiValsOnDay,
    bd2.CountThisWeek
FROM BYDAY bd1
INNER JOIN (
    -- Number of qualifying days per parent and week.
    SELECT
        COUNT(*) AS CountThisWeek,
        jj.parent_id,
        jj.TheWeek,
        jj.TheYear
    FROM BYDAY jj
    GROUP BY jj.parent_id, jj.TheWeek, jj.TheYear
) bd2
    ON  bd2.parent_id = bd1.parent_id
    AND bd2.TheWeek = bd1.TheWeek
    AND bd2.TheYear = bd1.TheYear
WHERE bd2.CountThisWeek >= 3
我在 Sybase 中对此进行了测试。您可以沿用其中的逻辑,并将其转换为 Vertica 的语法。此外,对于“周”,我是从时间戳中取其在一年中的周序号。您可以根据自己的需求进行修改,例如改为“今天往前 7 天”,或者改用基于 window 的方式。
(1)
-- Sybase ASE. Returns, per parent, the days with more than 12 data points
-- above 30, for weeks that contain at least 3 such days.
-- NOTE(review): the outer "Select *" with GROUP BY relies on the Transact-SQL
-- extended GROUP BY behaviour (non-grouped columns in the select list return
-- every row of each qualifying group) -- confirm on the target server.
Select * from
    (Select
        p.parent_id as 'parent_id',
        convert(varchar, c.tstamp, 101) as 'date_when_count_of_value_above_30_more_than_12_times_in_a_day',
        -- Count each data point once, even if several children exceed 30 at it.
        count(distinct c.tstamp) as 'count_of_values_above_30_each_day'
    from #parent_child p INNER JOIN #child c
        on p.child_id = c.child_id
    where
        c.value > 30 and
        p.parent_id in (5316, 117722, 104151)
    group by
        convert(varchar, c.tstamp, 101), p.parent_id
    having count(distinct c.tstamp) > 12) tb1
-- Group by calendar year as well as calendar week so that equal week numbers
-- from different years are not merged.
group by
    datepart(cyr, tb1.date_when_count_of_value_above_30_more_than_12_times_in_a_day),
    datepart(cwk, tb1.date_when_count_of_value_above_30_more_than_12_times_in_a_day),
    parent_id
having count(*) >= 3
(2)
-- Sybase ASE. For calendar week 28, returns per parent the maximum value
-- across all of its children and the latest timestamp at which it occurred.
Select
    tbl2.parent_id as 'parent_id',
    -- Several rows may share the maximum; MAX(tstamp) keeps the latest one.
    MAX(tbl2.tstamp) as 'latest_datetime_in_the_week',
    tbl2.value as 'max_value_across_all_children_in_a_week'
from
    (Select
        p.parent_id, c.tstamp, c.value
    from #parent_child p INNER JOIN #child c
        on p.child_id = c.child_id
    INNER JOIN
        -- Maximum value per parent for the selected week.
        (Select
            p2.parent_id as 'parent_id',
            datepart(cwk, c2.tstamp) as 'week_no',
            MAX(c2.value) as 'max_value'
        from #parent_child p2 INNER JOIN #child c2
            on p2.child_id = c2.child_id
        where
            datepart(cwk, c2.tstamp) = 28
        group by
            datepart(cwk, c2.tstamp), p2.parent_id) tbl1
        -- Match on parent and week as well as value: a row only qualifies if it
        -- carries its own parent's maximum within that same week.
        ON p.parent_id = tbl1.parent_id
        AND datepart(cwk, c.tstamp) = tbl1.week_no
        AND c.value = tbl1.max_value) tbl2
group by tbl2.parent_id, tbl2.value
三个 SELECT,全部通过 parent_id 连接,并使用 Vertica 的分析型 LIMIT 子句:LIMIT 1 OVER()
-- Vertica. Combined answer to (1) and (2): for every parent, the days with more
-- than 12 data points above 30 (only for weeks having at least 3 such days),
-- together with that week's maximum value and the latest timestamp at which
-- the maximum occurred.
-- Output: parent_id, ts_date, over_30_count, max_per_week, last_week_ts
-- (ts_date is added as the second column, in the order the question asks for).
WITH
-- Child readings tagged with their parent and the start of their week.
-- NOTE(review): DATE_TRUNC('week', ...) is assumed to start weeks on Monday;
-- confirm this matches the intended definition of "week".
w_p AS (
  SELECT
    parent_child.parent_id
  , child.child_id
  , child.tstamp
  , child.value
  , DATE_TRUNC('week', child.tstamp) AS week_start
  FROM parent_child
  JOIN child
    ON child.child_id = parent_child.child_id
)
,
-- Per parent and day: number of data points where at least one child is above
-- 30. COUNT(DISTINCT tstamp) counts a data point once even if several children
-- exceed 30 at it.
o_30_count AS (
  SELECT
    parent_id
  , week_start
  , tstamp::DATE AS ts_date
  , COUNT(DISTINCT tstamp) AS over_30_count
  FROM w_p
  WHERE value > 30
  GROUP BY parent_id, week_start, tstamp::DATE
  HAVING COUNT(DISTINCT tstamp) > 12
)
,
-- Attach to every qualifying day the number of qualifying days in its week.
qualifying_days AS (
  SELECT
    parent_id
  , week_start
  , ts_date
  , over_30_count
  , COUNT(*) OVER (PARTITION BY parent_id, week_start) AS days_in_week
  FROM o_30_count
)
,
-- Per parent and week: the row holding the maximum value; ties on the value are
-- resolved in favour of the latest timestamp.
max_per_week AS (
  SELECT
    parent_id
  , week_start
  , value AS max_per_week
  , tstamp AS last_week_ts
  FROM w_p
  WHERE value IS NOT NULL
  LIMIT 1 OVER(PARTITION BY parent_id, week_start ORDER BY value DESC, tstamp DESC)
)
SELECT
  q.parent_id
, q.ts_date
, q.over_30_count
, m.max_per_week
, m.last_week_ts
FROM qualifying_days q
JOIN max_per_week m
  ON  m.parent_id = q.parent_id
  AND m.week_start = q.week_start
WHERE q.days_in_week >= 3
ORDER BY q.parent_id, q.ts_date DESC;
我有 2 个 table 如下所示:
-- Mapping table: each row links one child_id to the parent_id it belongs to.
CREATE TABLE parent_child (
    parent_id INT NOT NULL,
    child_id  INT NOT NULL
);
-- Seed the parent -> child mapping: parents 117722 and 104151 have four
-- children each, parent 5316 has five.
INSERT INTO parent_child (parent_id, child_id)
          SELECT 117722, 273215
UNION ALL SELECT 117722, 117936
UNION ALL SELECT 117722, 117873
UNION ALL SELECT 117722, 123305
UNION ALL SELECT 104151, 240006
UNION ALL SELECT 104151, 240005
UNION ALL SELECT 104151, 239415
UNION ALL SELECT 104151, 239414
UNION ALL SELECT 5316, 118310
UNION ALL SELECT 5316, 130627
UNION ALL SELECT 5316, 298564
UNION ALL SELECT 5316, 118311
UNION ALL SELECT 5316, 118312;
-- Time-series readings: one reading (value) of a child at a point in time (tstamp).
-- value has no NOT NULL constraint, so a reading may be NULL.
-- The trailing comma after the last column was removed: it is a syntax error
-- in most dialects.
CREATE TABLE child (
    child_id INT      NOT NULL,
    tstamp   DATETIME NOT NULL,
    value    FLOAT
);
-- Sample readings: two consecutive 5-minute data points (00:00 and 00:05 on
-- 2021-07-14) for every child, grouped below by the parent the child belongs to.
-- Data point 2021-07-14 00:00 -- children of parent 117722
INSERT INTO child (child_id, tstamp, value) VALUES (273215, '2021-07-14 00:00:00.000000', 29);
INSERT INTO child (child_id, tstamp, value) VALUES (117936, '2021-07-14 00:00:00.000000', 52);
INSERT INTO child (child_id, tstamp, value) VALUES (117873, '2021-07-14 00:00:00.000000', 51);
INSERT INTO child (child_id, tstamp, value) VALUES (123305, '2021-07-14 00:00:00.000000', 31);
-- Data point 2021-07-14 00:00 -- children of parent 104151
INSERT INTO child (child_id, tstamp, value) VALUES (240006, '2021-07-14 00:00:00.000000', 37);
INSERT INTO child (child_id, tstamp, value) VALUES (240005, '2021-07-14 00:00:00.000000', 88);
INSERT INTO child (child_id, tstamp, value) VALUES (239415, '2021-07-14 00:00:00.000000', 29);
INSERT INTO child (child_id, tstamp, value) VALUES (239414, '2021-07-14 00:00:00.000000', 19);
-- Data point 2021-07-14 00:00 -- children of parent 5316
INSERT INTO child (child_id, tstamp, value) VALUES (118310, '2021-07-14 00:00:00.000000', 42);
INSERT INTO child (child_id, tstamp, value) VALUES (130627, '2021-07-14 00:00:00.000000', 11);
INSERT INTO child (child_id, tstamp, value) VALUES (298564, '2021-07-14 00:00:00.000000', 36);
INSERT INTO child (child_id, tstamp, value) VALUES (118311, '2021-07-14 00:00:00.000000', 22);
INSERT INTO child (child_id, tstamp, value) VALUES (118312, '2021-07-14 00:00:00.000000', 9);
-- Data point 2021-07-14 00:05 -- children of parent 117722
INSERT INTO child (child_id, tstamp, value) VALUES (273215, '2021-07-14 00:05:00.000000', 72);
INSERT INTO child (child_id, tstamp, value) VALUES (117936, '2021-07-14 00:05:00.000000', 99);
INSERT INTO child (child_id, tstamp, value) VALUES (117873, '2021-07-14 00:05:00.000000', 13);
INSERT INTO child (child_id, tstamp, value) VALUES (123305, '2021-07-14 00:05:00.000000', 24);
-- Data point 2021-07-14 00:05 -- children of parent 104151
INSERT INTO child (child_id, tstamp, value) VALUES (240006, '2021-07-14 00:05:00.000000', 65);
INSERT INTO child (child_id, tstamp, value) VALUES (240005, '2021-07-14 00:05:00.000000', 63);
INSERT INTO child (child_id, tstamp, value) VALUES (239415, '2021-07-14 00:05:00.000000', 23);
INSERT INTO child (child_id, tstamp, value) VALUES (239414, '2021-07-14 00:05:00.000000', 15);
-- Data point 2021-07-14 00:05 -- children of parent 5316
INSERT INTO child (child_id, tstamp, value) VALUES (118310, '2021-07-14 00:05:00.000000', 19);
INSERT INTO child (child_id, tstamp, value) VALUES (130627, '2021-07-14 00:05:00.000000', 22);
INSERT INTO child (child_id, tstamp, value) VALUES (298564, '2021-07-14 00:05:00.000000', 47);
INSERT INTO child (child_id, tstamp, value) VALUES (118311, '2021-07-14 00:05:00.000000', 54);
INSERT INTO child (child_id, tstamp, value) VALUES (118312, '2021-07-14 00:05:00.000000', 12);
child table 中的数据对每个 child 每 5 分钟重复一次。也就是说,对于 parent 的每个 child,将有 288 个数据点。这将在数据点处以不同(或相同)的值进一步重复每一天。
问题:
(1) 找出 parent_id
、date_when_count_of_value_above_30_more_than_12_times_in_a_day
、count_of_values_above_30_each_day
其中 parent 的所有 children 的值在一天内超过 30 的次数多于 12 次,并且每周至少有 3 天满足该条件。数据点不必是连续的。换句话说,如果在某个数据点(例如 2021-07-14 00:00:00.000000)上,parent 的所有 children 的 MAX(value)
大于 30,那么这将被计为该日期 parent 的一次出现。
(2) 找出 parent_id
、latest_datetime_in_the_week
、max_value_across_all_children_in_a_week
其中 parent 的所有 children 中的 value
是最大值和最大值的日期时间。如果多个日期时间具有相同的最大值,则选择最新的日期时间。
如果这两个查询可以合并为一个查询,那正是我想要的。否则,也可以写成 2 个不同的查询。如果是单个查询,那么问题 (2) 的输出会在问题 (1) 的每一行上重复出现,这是可以接受的。
输入:
parent_id
的列表,例如,WHERE parent_id IN (117722, 5316)
示例输出(如果将 2 个查询合并为一个;列名可以是任何名称,为了简洁起见,我使用长名称):
parent_id | date_when_count_of_value_above_30_more_than_12_times_in_a_day | count_of_values_above_30_each_day | max_value_across_all_children_in_a_week | latest_datetime_in_the_week
117722 | 2021-07-14 | 13 | 99 | 2021-07-09 16:15:00.000000
117722 | 2021-07-11 | 28 | 99 | 2021-07-09 16:15:00.000000
104151 | 2021-07-14 | 19 | 65 | 2021-07-11 18:30:00.000000
104151 | 2021-07-13 | 27 | 65 | 2021-07-11 18:30:00.000000
104151 | 2021-07-11 | 36 | 65 | 2021-07-11 18:30:00.000000
以上只是一个示例输出。当然,如果某个 parent_id 在一周内满足条件的天数少于 3 天,它将不会出现在输出中。
对于 (1):
注意:为了便于阅读,我假设 parent_id 已被移到 child table 中。
-- Lists every (parent, day) on which the parent had more than 12 data points
-- above 30, keeping only weeks in which the parent had at least 3 such days.
-- Assumes child carries a parent_id column (the sample schema keeps it in
-- parent_child); join parent_child to child first if it does not.
WITH BYDAY AS
(
    -- One row per parent and calendar day.
    -- COUNT(DISTINCT tstamp) counts each data point once, even when several
    -- children of the same parent exceed 30 at that timestamp. That equals
    -- the number of timestamps whose MAX(value) across children is above 30.
    SELECT
        COUNT(DISTINCT tstamp) AS CountHiValsOnDay,
        parent_id,
        CAST(tstamp AS date) AS tsDate,
        DATEPART(week, CAST(tstamp AS date)) AS TheWeek,
        YEAR(CAST(tstamp AS date)) AS TheYear
    FROM child
    WHERE value > 30
    GROUP BY parent_id, CAST(tstamp AS date)
    HAVING COUNT(DISTINCT tstamp) > 12
)
SELECT
    bd1.parent_id,
    bd1.tsDate,
    bd1.CountHiValsOnDay,
    bd2.CountThisWeek
FROM BYDAY bd1
INNER JOIN (
    -- Number of qualifying days per parent and week.
    SELECT
        COUNT(*) AS CountThisWeek,
        jj.parent_id,
        jj.TheWeek,
        jj.TheYear
    FROM BYDAY jj
    GROUP BY jj.parent_id, jj.TheWeek, jj.TheYear
) bd2
    ON  bd2.parent_id = bd1.parent_id
    AND bd2.TheWeek = bd1.TheWeek
    AND bd2.TheYear = bd1.TheYear
WHERE bd2.CountThisWeek >= 3
我在 Sybase 中对此进行了测试。您可以沿用其中的逻辑,并将其转换为 Vertica 的语法。此外,对于“周”,我是从时间戳中取其在一年中的周序号。您可以根据自己的需求进行修改,例如改为“今天往前 7 天”,或者改用基于 window 的方式。
(1)
-- Sybase ASE. Returns, per parent, the days with more than 12 data points
-- above 30, for weeks that contain at least 3 such days.
-- NOTE(review): the outer "Select *" with GROUP BY relies on the Transact-SQL
-- extended GROUP BY behaviour (non-grouped columns in the select list return
-- every row of each qualifying group) -- confirm on the target server.
Select * from
    (Select
        p.parent_id as 'parent_id',
        convert(varchar, c.tstamp, 101) as 'date_when_count_of_value_above_30_more_than_12_times_in_a_day',
        -- Count each data point once, even if several children exceed 30 at it.
        count(distinct c.tstamp) as 'count_of_values_above_30_each_day'
    from #parent_child p INNER JOIN #child c
        on p.child_id = c.child_id
    where
        c.value > 30 and
        p.parent_id in (5316, 117722, 104151)
    group by
        convert(varchar, c.tstamp, 101), p.parent_id
    having count(distinct c.tstamp) > 12) tb1
-- Group by calendar year as well as calendar week so that equal week numbers
-- from different years are not merged.
group by
    datepart(cyr, tb1.date_when_count_of_value_above_30_more_than_12_times_in_a_day),
    datepart(cwk, tb1.date_when_count_of_value_above_30_more_than_12_times_in_a_day),
    parent_id
having count(*) >= 3
(2)
-- Sybase ASE. For calendar week 28, returns per parent the maximum value
-- across all of its children and the latest timestamp at which it occurred.
Select
    tbl2.parent_id as 'parent_id',
    -- Several rows may share the maximum; MAX(tstamp) keeps the latest one.
    MAX(tbl2.tstamp) as 'latest_datetime_in_the_week',
    tbl2.value as 'max_value_across_all_children_in_a_week'
from
    (Select
        p.parent_id, c.tstamp, c.value
    from #parent_child p INNER JOIN #child c
        on p.child_id = c.child_id
    INNER JOIN
        -- Maximum value per parent for the selected week.
        (Select
            p2.parent_id as 'parent_id',
            datepart(cwk, c2.tstamp) as 'week_no',
            MAX(c2.value) as 'max_value'
        from #parent_child p2 INNER JOIN #child c2
            on p2.child_id = c2.child_id
        where
            datepart(cwk, c2.tstamp) = 28
        group by
            datepart(cwk, c2.tstamp), p2.parent_id) tbl1
        -- Match on parent and week as well as value: a row only qualifies if it
        -- carries its own parent's maximum within that same week.
        ON p.parent_id = tbl1.parent_id
        AND datepart(cwk, c.tstamp) = tbl1.week_no
        AND c.value = tbl1.max_value) tbl2
group by tbl2.parent_id, tbl2.value
三个 SELECT,全部通过 parent_id 连接,并使用 Vertica 的分析型 LIMIT 子句:LIMIT 1 OVER()
-- Vertica. Combined answer to (1) and (2): for every parent, the days with more
-- than 12 data points above 30 (only for weeks having at least 3 such days),
-- together with that week's maximum value and the latest timestamp at which
-- the maximum occurred.
-- Output: parent_id, ts_date, over_30_count, max_per_week, last_week_ts
-- (ts_date is added as the second column, in the order the question asks for).
WITH
-- Child readings tagged with their parent and the start of their week.
-- NOTE(review): DATE_TRUNC('week', ...) is assumed to start weeks on Monday;
-- confirm this matches the intended definition of "week".
w_p AS (
  SELECT
    parent_child.parent_id
  , child.child_id
  , child.tstamp
  , child.value
  , DATE_TRUNC('week', child.tstamp) AS week_start
  FROM parent_child
  JOIN child
    ON child.child_id = parent_child.child_id
)
,
-- Per parent and day: number of data points where at least one child is above
-- 30. COUNT(DISTINCT tstamp) counts a data point once even if several children
-- exceed 30 at it.
o_30_count AS (
  SELECT
    parent_id
  , week_start
  , tstamp::DATE AS ts_date
  , COUNT(DISTINCT tstamp) AS over_30_count
  FROM w_p
  WHERE value > 30
  GROUP BY parent_id, week_start, tstamp::DATE
  HAVING COUNT(DISTINCT tstamp) > 12
)
,
-- Attach to every qualifying day the number of qualifying days in its week.
qualifying_days AS (
  SELECT
    parent_id
  , week_start
  , ts_date
  , over_30_count
  , COUNT(*) OVER (PARTITION BY parent_id, week_start) AS days_in_week
  FROM o_30_count
)
,
-- Per parent and week: the row holding the maximum value; ties on the value are
-- resolved in favour of the latest timestamp.
max_per_week AS (
  SELECT
    parent_id
  , week_start
  , value AS max_per_week
  , tstamp AS last_week_ts
  FROM w_p
  WHERE value IS NOT NULL
  LIMIT 1 OVER(PARTITION BY parent_id, week_start ORDER BY value DESC, tstamp DESC)
)
SELECT
  q.parent_id
, q.ts_date
, q.over_30_count
, m.max_per_week
, m.last_week_ts
FROM qualifying_days q
JOIN max_per_week m
  ON  m.parent_id = q.parent_id
  AND m.week_start = q.week_start
WHERE q.days_in_week >= 3
ORDER BY q.parent_id, q.ts_date DESC;