SQL 基于时间序列数据的查询

SQL query based on time series data

我有 2 个 table 如下所示:

-- Link table: each row ties one child_id to the parent_id it belongs to.
CREATE TABLE parent_child
(
    parent_id INT NOT NULL,
    child_id  INT NOT NULL
);

-- Sample parent -> child links. Parent 117722 has four children.
INSERT INTO parent_child (parent_id, child_id) VALUES (117722, 273215);
INSERT INTO parent_child (parent_id, child_id) VALUES (117722, 117936);
INSERT INTO parent_child (parent_id, child_id) VALUES (117722, 117873);
INSERT INTO parent_child (parent_id, child_id) VALUES (117722, 123305);

-- Parent 104151 has four children.
INSERT INTO parent_child (parent_id, child_id) VALUES (104151, 240006);
INSERT INTO parent_child (parent_id, child_id) VALUES (104151, 240005);
INSERT INTO parent_child (parent_id, child_id) VALUES (104151, 239415);
INSERT INTO parent_child (parent_id, child_id) VALUES (104151, 239414);

-- Parent 5316 has five children.
INSERT INTO parent_child (parent_id, child_id) VALUES (5316, 118310);
INSERT INTO parent_child (parent_id, child_id) VALUES (5316, 130627);
INSERT INTO parent_child (parent_id, child_id) VALUES (5316, 298564);
INSERT INTO parent_child (parent_id, child_id) VALUES (5316, 118311);
INSERT INTO parent_child (parent_id, child_id) VALUES (5316, 118312);
-- Time-series readings: one row per child_id and timestamp.
-- value has no NOT NULL constraint, so a reading may be NULL.
CREATE TABLE child
(
    child_id INT NOT NULL,
    tstamp   DATETIME NOT NULL,
    value    FLOAT
);

-- Sample readings for two consecutive 5-minute data points.
-- Data point 2021-07-14 00:00:00, children of parent 117722
INSERT INTO child (child_id, tstamp, value) VALUES (273215, '2021-07-14 00:00:00.000000',  29);
INSERT INTO child (child_id, tstamp, value) VALUES (117936, '2021-07-14 00:00:00.000000',  52);
INSERT INTO child (child_id, tstamp, value) VALUES (117873, '2021-07-14 00:00:00.000000',  51);
INSERT INTO child (child_id, tstamp, value) VALUES (123305, '2021-07-14 00:00:00.000000',  31);

-- Data point 2021-07-14 00:00:00, children of parent 104151
INSERT INTO child (child_id, tstamp, value) VALUES (240006, '2021-07-14 00:00:00.000000',  37);
INSERT INTO child (child_id, tstamp, value) VALUES (240005, '2021-07-14 00:00:00.000000',  88);
INSERT INTO child (child_id, tstamp, value) VALUES (239415, '2021-07-14 00:00:00.000000',  29);
INSERT INTO child (child_id, tstamp, value) VALUES (239414, '2021-07-14 00:00:00.000000',  19);

-- Data point 2021-07-14 00:00:00, children of parent 5316
INSERT INTO child (child_id, tstamp, value) VALUES (118310, '2021-07-14 00:00:00.000000',  42);
INSERT INTO child (child_id, tstamp, value) VALUES (130627, '2021-07-14 00:00:00.000000',  11);
INSERT INTO child (child_id, tstamp, value) VALUES (298564, '2021-07-14 00:00:00.000000',  36);
INSERT INTO child (child_id, tstamp, value) VALUES (118311, '2021-07-14 00:00:00.000000',  22);
INSERT INTO child (child_id, tstamp, value) VALUES (118312, '2021-07-14 00:00:00.000000',   9);

-- Data point 2021-07-14 00:05:00, children of parent 117722
INSERT INTO child (child_id, tstamp, value) VALUES (273215, '2021-07-14 00:05:00.000000',  72);
INSERT INTO child (child_id, tstamp, value) VALUES (117936, '2021-07-14 00:05:00.000000',  99);
INSERT INTO child (child_id, tstamp, value) VALUES (117873, '2021-07-14 00:05:00.000000',  13);
INSERT INTO child (child_id, tstamp, value) VALUES (123305, '2021-07-14 00:05:00.000000',  24);

-- Data point 2021-07-14 00:05:00, children of parent 104151
INSERT INTO child (child_id, tstamp, value) VALUES (240006, '2021-07-14 00:05:00.000000',  65);
INSERT INTO child (child_id, tstamp, value) VALUES (240005, '2021-07-14 00:05:00.000000',  63);
INSERT INTO child (child_id, tstamp, value) VALUES (239415, '2021-07-14 00:05:00.000000',  23);
INSERT INTO child (child_id, tstamp, value) VALUES (239414, '2021-07-14 00:05:00.000000',  15);

-- Data point 2021-07-14 00:05:00, children of parent 5316
INSERT INTO child (child_id, tstamp, value) VALUES (118310, '2021-07-14 00:05:00.000000',  19);
INSERT INTO child (child_id, tstamp, value) VALUES (130627, '2021-07-14 00:05:00.000000',  22);
INSERT INTO child (child_id, tstamp, value) VALUES (298564, '2021-07-14 00:05:00.000000',  47);
INSERT INTO child (child_id, tstamp, value) VALUES (118311, '2021-07-14 00:05:00.000000',  54);
INSERT INTO child (child_id, tstamp, value) VALUES (118312, '2021-07-14 00:05:00.000000',  12);

child table 中的数据对每个 child 每 5 分钟记录一次。也就是说,对于 parent 的每个 child,每天有 288 个数据点。此后每一天都会重复这些时间点,各数据点上的值可能不同,也可能相同。

问题:

(1) 找出 parent_id、date_when_count_of_value_above_30_more_than_12_times_in_a_day 和 count_of_values_above_30_each_day:即 parent 的所有 children 中的值每天超过 30 的次数多于 12 次,并且每周至少有 3 天如此。数据点不必是连续的。换句话说,如果在某个数据点(例如 2021-07-14 00:00:00.000000)上,parent 的所有 children 的 MAX(value) 大于 30,那么这将被计为该 parent 在该日期的一次出现。

(2) 找出 parent_id、latest_datetime_in_the_week 和 max_value_across_all_children_in_a_week:即一周内 parent 的所有 children 中的最大 value,以及出现该最大值的日期时间。如果多个日期时间具有相同的最大值,则选择最新的日期时间。

如果这两个查询可以合并为一个查询,那就是我想要的。否则,也可以是 2 个不同的查询。如果是单个查询,那么问题 (2) 的输出会在问题 (1) 的每一行上重复,这是可以接受的。

输入:

parent_id 的列表,例如,WHERE parent_id IN (117722, 5316)

示例输出(如果将 2 个查询合并为一个;列名可以是任何名称,为了简洁起见,我使用长名称):

parent_id | date_when_count_of_value_above_30_more_than_12_times_in_a_day | count_of_values_above_30_each_day | max_value_across_all_children_in_a_week | latest_datetime_in_the_week
117722    | 2021-07-14                                                    | 13                                | 99                                      | 2021-07-09 16:15:00.000000
117722    | 2021-07-11                                                    | 28                                | 99                                      | 2021-07-09 16:15:00.000000
104151    | 2021-07-14                                                    | 19                                | 65                                      | 2021-07-11 18:30:00.000000
104151    | 2021-07-13                                                    | 27                                | 65                                      | 2021-07-11 18:30:00.000000
104151    | 2021-07-11                                                    | 36                                | 65                                      | 2021-07-11 18:30:00.000000

以上只是一个示例输出。当然,如果一周内满足条件的天数不足 3 天,该 parent_id 将不会出现在输出中。

对于 (1):

注意:为了便于阅读,我将 parent_id 移到了 child table 中。

-- T-SQL style. Assumes child carries a parent_id column (the parent id has
-- been moved into the child table for readability).
--
-- BYDAY: one row per parent and calendar day on which more than 12 distinct
-- timestamps had at least one child reading above 30. COUNT(DISTINCT tstamp)
-- is used so that several children exceeding 30 at the same timestamp count
-- as a single occurrence for the parent.
WITH BYDAY AS
(
    SELECT
        COUNT(DISTINCT tstamp)               AS CountHiValsOnDay,
        parent_id,
        CAST(tstamp AS date)                 AS tsDate,
        DATEPART(week, CAST(tstamp AS date)) AS TheWeek,
        YEAR(CAST(tstamp AS date))           AS TheYear
    FROM child
    WHERE value > 30
    GROUP BY parent_id, CAST(tstamp AS date)
    HAVING COUNT(DISTINCT tstamp) > 12
)
-- Keep only the qualifying days that fall in a (parent, year, week) having
-- at least 3 qualifying days.
SELECT
    bd1.parent_id,
    bd1.tsDate,
    bd1.CountHiValsOnDay,
    bd2.CountThisWeek
FROM BYDAY bd1
    JOIN (
        -- Number of qualifying days per parent and week.
        SELECT COUNT(*) AS CountThisWeek, jj.parent_id, jj.TheWeek, jj.TheYear
        FROM BYDAY jj
        GROUP BY jj.parent_id, jj.TheWeek, jj.TheYear
    ) bd2
      ON bd2.parent_id = bd1.parent_id AND
          bd2.TheWeek = bd1.TheWeek AND
          bd2.TheYear = bd1.TheYear
WHERE bd2.CountThisWeek >= 3;

我在 Sybase 中对此进行了测试。您可以借用其中的逻辑,并将其转换为 Vertica 的语法。此外,对于"周",我是从时间戳中取一年中的第几周;您可以根据自己的需求修改,例如改为从今天起往前 7 天,或者改用基于窗口的方式。

(1)

-- Sybase: for the listed parents, find the days on which more than 12
-- distinct timestamps had at least one child reading above 30, then keep
-- only parent/week groups containing at least 3 such days.
-- count(distinct c.tstamp) makes several children above 30 at the same
-- timestamp count once for the parent.
-- NOTE(review): the outer "Select *" combined with GROUP BY relies on the
-- Sybase extension that permits non-grouped columns in the select list;
-- other engines reject this form. The week is also derived from the
-- mm/dd/yyyy string produced by convert(..., 101) via implicit conversion
-- and carries no year component - confirm both on the target server.
Select * from
(Select
    p.parent_id                     as 'parent_id',
    convert(varchar, c.tstamp, 101) as 'date_when_count_of_value_above_30_more_than_12_times_in_a_day',
    count(distinct c.tstamp)        as 'count_of_values_above_30_each_day'
from #parent_child p JOIN #child c
    on p.child_id = c.child_id
where
    c.value > 30 and
    p.parent_id in (5316, 117722, 104151)
group by
    convert(varchar, c.tstamp, 101), p.parent_id
having count(distinct c.tstamp) > 12) tb1
group by datepart(cwk, tb1.date_when_count_of_value_above_30_more_than_12_times_in_a_day), parent_id
having count(*) >=3

(2)

-- Sybase: per parent, the maximum value across all of its children in
-- calendar week 28 and the latest timestamp at which that maximum occurred.
-- The week number 28 is hard-coded; change it in BOTH places below to query
-- another week.
Select
    tbl2.parent_id   as 'parent_id',
    MAX(tbl2.tstamp) as 'latest_datetime_in_the_week',
    tbl2.value       as 'max_value_across_all_children_in_a_week'
from
    (Select
        p.parent_id, c.tstamp, c.value
    from #parent_child p
    JOIN #child c
        on p.child_id = c.child_id
    JOIN
        -- Maximum value per parent within the week.
        (Select
            p2.parent_id,
            MAX(c2.value) as 'max_value'
        from #parent_child p2
        JOIN #child c2
            on p2.child_id = c2.child_id
        where
            datepart(cwk, c2.tstamp) = 28
        group by
            datepart(cwk, c2.tstamp), p2.parent_id) tbl1
        -- Match on parent as well as value, so a reading is never kept just
        -- because it happens to equal some other parent's maximum.
        ON  p.parent_id = tbl1.parent_id
        AND c.value = tbl1.max_value
    where
        -- Only readings from the same week may supply the timestamp.
        datepart(cwk, c.tstamp) = 28) tbl2
group by tbl2.parent_id, tbl2.value

用几个公用表表达式 (CTE) 分步查询,全部按 parent_id 关联。并使用 Vertica 的分析型 LIMIT 子句:LIMIT 1 OVER()

-- Vertica: combined answer to (1) and (2).
-- Output: one row per parent and qualifying day, carrying that week's
-- maximum value across all children and the latest timestamp at which the
-- maximum occurred.
WITH
-- Every child reading tagged with its parent.
w_p AS (
  SELECT
    parent_child.parent_id
  , child.child_id
  , child.tstamp
  , child.value
  FROM parent_child
  JOIN child USING(child_id)
)
,
-- Days on which a parent had more than 12 distinct timestamps with at least
-- one child reading above 30 (several children above 30 at the same
-- timestamp count once).
o_30_count AS (
  SELECT
    parent_id
  , tstamp::DATE           AS ts_date
  , COUNT(DISTINCT tstamp) AS over_30_count
  FROM w_p
  WHERE value > 30
  GROUP BY parent_id, tstamp::DATE
  HAVING COUNT(DISTINCT tstamp) > 12
)
,
-- Qualifying days labelled with their year/week and with the number of
-- qualifying days that parent has in the same week.
o_30_days AS (
  SELECT
    parent_id
  , ts_date
  , over_30_count
  , YEAR(ts_date) AS ts_year
  , WEEK(ts_date) AS ts_week
  , COUNT(*) OVER(PARTITION BY parent_id, YEAR(ts_date), WEEK(ts_date)) AS days_in_week
  FROM o_30_count
)
,
-- Top reading per parent and week: highest value first, and the latest
-- timestamp wins when several readings share that value.
max_per_week AS (
  SELECT
    parent_id
  , YEAR(tstamp) AS ts_year
  , WEEK(tstamp) AS ts_week
  , value        AS max_per_week
  , tstamp       AS last_week_ts
  FROM w_p
  WHERE value IS NOT NULL
  LIMIT 1 OVER(PARTITION BY parent_id, YEAR(tstamp), WEEK(tstamp) ORDER BY value DESC, tstamp DESC)
)
SELECT
  o_30_days.parent_id
, o_30_days.ts_date
, over_30_count
, max_per_week
, last_week_ts
FROM o_30_days
-- Join on the week as well as the parent so each day is paired only with
-- the maximum of its own week.
JOIN max_per_week USING(parent_id, ts_year, ts_week)
-- A parent is reported for a week only if at least 3 days qualified.
WHERE days_in_week >= 3;