SQL 用于删除重复(连续)的记录,但在开始日期中存储最小日期,在结束日期中存储最大日期
SQL for deleting consecutive duplicate records while storing their minimum date as the start date and their maximum date as the end date
我在示例中有以下输入数据 table:
S_ID C_ID E_ID ST_DT ED_DT
100 A 11AS 01/01/2020 05/01/2020
100 A 11AS 06/01/2020 10/01/2020
100 A 11AS 11/01/2020 15/01/2020
100 A 11BT 16/01/2020 20/01/2020
100 A 11AS 21/01/2020 27/01/2020
100 A 11AS 28/01/2020 30/01/2020
下面的预期输出 table:
S_ID C_ID E_ID ST_DT ED_DT
100 A 11AS 01/01/2020 15/01/2020
100 A 11BT 16/01/2020 20/01/2020
100 A 11AS 21/01/2020 30/01/2020
数据库:Netezza
注意:这些是来自数据的示例记录。 table 中还有其他 E_ID。
谢谢
这是一个 gaps-and-islands 问题。假设你没有间隙,一个简单的方法是行号的差异:
-- Collapse runs of adjacent rows that share the same e_id (per s_id, c_id)
-- into one row spanning the earliest st_dt to the latest ed_dt of the run.
-- Technique: difference of two row numbers.
--   seqnum   numbers the rows of each (s_id, c_id) by st_dt;
--   seqnum_2 numbers the rows of each (s_id, c_id, e_id) by st_dt.
-- While e_id stays the same on adjacent rows both counters advance together,
-- so (seqnum - seqnum_2) is constant within a run and changes once another
-- e_id has appeared in between.
-- Only adjacency in st_dt order is examined; date gaps between rows are not
-- detected (the answer assumes there are none).
select s_id,
       c_id,
       e_id,
       min(st_dt) as st_dt,  -- start of the run
       max(ed_dt) as ed_dt   -- end of the run
from (select t.*,
             row_number() over (partition by s_id, c_id order by st_dt) as seqnum,
             row_number() over (partition by s_id, c_id, e_id order by st_dt) as seqnum_2
      from t
     ) numbered
group by s_id, c_id, e_id, (seqnum - seqnum_2);
这实际上是一个 gaps-and-islands(间隙与孤岛)问题。正如 @Gordon Linoff 所说的“岛”,在点击流分析、物联网数据等场景中也被称为“会话”(session)。
我会得到一个会话标识符,并在最后按它分组。
依次嵌套多个 SELECT(下面写成公用表表达式),每一层使用不同的 OLAP(窗口)函数,最后按得到的会话 ID 进行 GROUP BY,应该可以解决问题:
-- Collapse runs of adjacent rows with the same e_id (per s_id, c_id) into one
-- row per run, keeping the earliest st_dt and the latest ed_dt.
-- Steps: flag every row where e_id changes, turn the flags into a running
-- sum (the session identifier), then group by that identifier.
WITH
-- Sample input rows from the question; replace with the real table.
input(s_id,c_id,e_id,st_dt,ed_dt) AS (
          SELECT 100 ,'A' , '11AS',DATE '2020-01-01', DATE '2020-01-05'
UNION ALL SELECT 100 ,'A' , '11AS',DATE '2020-01-06', DATE '2020-01-10'
UNION ALL SELECT 100 ,'A' , '11AS',DATE '2020-01-11', DATE '2020-01-15'
UNION ALL SELECT 100 ,'A' , '11BT',DATE '2020-01-16', DATE '2020-01-20'
UNION ALL SELECT 100 ,'A' , '11AS',DATE '2020-01-21', DATE '2020-01-27'
UNION ALL SELECT 100 ,'A' , '11AS',DATE '2020-01-28', DATE '2020-01-30'
)
-- Add an integer change flag: 0 when e_id is the same as on the previous row,
-- 1 when it differs (the first row of a partition has no previous row; its
-- missing predecessor is mapped to '' so it is flagged 1 as well).
-- The window is partitioned by (s_id, c_id) so that rows belonging to other
-- keys never act as the "previous row".
,
with_chg AS (
  SELECT
    s_id
  , c_id
  , e_id
  , st_dt
  , ed_dt
  , CASE
      WHEN COALESCE(
             LAG(e_id) OVER (PARTITION BY s_id, c_id ORDER BY st_dt)
           , ''
           ) <> e_id
      THEN 1
      ELSE 0
    END AS chg
  FROM input
)
-- SELECT * FROM with_chg; -- check query ....
-- out  s_id | c_id | e_id |   st_dt    |   ed_dt    | chg
-- out ------+------+------+------------+------------+-----
-- out   100 | A    | 11AS | 2020-01-01 | 2020-01-05 |   1
-- out   100 | A    | 11AS | 2020-01-06 | 2020-01-10 |   0
-- out   100 | A    | 11AS | 2020-01-11 | 2020-01-15 |   0
-- out   100 | A    | 11BT | 2020-01-16 | 2020-01-20 |   1
-- out   100 | A    | 11AS | 2020-01-21 | 2020-01-27 |   1
-- out   100 | A    | 11AS | 2020-01-28 | 2020-01-30 |   0
-- The running sum of chg is the session identifier: it increases by one at
-- every change of e_id. Same partitioning as above; the explicit ROWS frame
-- makes the sum advance row by row instead of once per group of equal st_dt.
,
with_session AS (
  SELECT
    s_id
  , c_id
  , e_id
  , st_dt
  , ed_dt
  , SUM(chg) OVER (
      PARTITION BY s_id, c_id
      ORDER BY st_dt
      ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS session_id
  FROM with_chg
)
-- SELECT * FROM with_session; -- test query ...
-- out  s_id | c_id | e_id |   st_dt    |   ed_dt    | session_id
-- out ------+------+------+------------+------------+------------
-- out   100 | A    | 11AS | 2020-01-01 | 2020-01-05 |          1
-- out   100 | A    | 11AS | 2020-01-06 | 2020-01-10 |          1
-- out   100 | A    | 11AS | 2020-01-11 | 2020-01-15 |          1
-- out   100 | A    | 11BT | 2020-01-16 | 2020-01-20 |          2
-- out   100 | A    | 11AS | 2020-01-21 | 2020-01-27 |          3
-- out   100 | A    | 11AS | 2020-01-28 | 2020-01-30 |          3
-- Finally, GROUP BY s_id, c_id, e_id and the session identifier, and take
-- MIN(st_dt) and MAX(ed_dt) of each group.
SELECT
  s_id
, c_id
, e_id
, MIN(st_dt) AS st_dt
, MAX(ed_dt) AS ed_dt
FROM with_session
GROUP BY
  s_id
, c_id
, e_id
, session_id
-- Sort by the output column st_dt (start of each run); s_id and c_id only
-- break ties between runs that start on the same date.
ORDER BY st_dt, s_id, c_id
;
-- out s_id | c_id | e_id | st_dt | ed_dt
-- out ------+------+------+------------+------------
-- out 100 | A | 11AS | 2020-01-01 | 2020-01-15
-- out 100 | A | 11BT | 2020-01-16 | 2020-01-20
-- out 100 | A | 11AS | 2020-01-21 | 2020-01-30
我在示例中有以下输入数据 table:
S_ID C_ID E_ID ST_DT ED_DT
100 A 11AS 01/01/2020 05/01/2020
100 A 11AS 06/01/2020 10/01/2020
100 A 11AS 11/01/2020 15/01/2020
100 A 11BT 16/01/2020 20/01/2020
100 A 11AS 21/01/2020 27/01/2020
100 A 11AS 28/01/2020 30/01/2020
下面的预期输出 table:
S_ID C_ID E_ID ST_DT ED_DT
100 A 11AS 01/01/2020 15/01/2020
100 A 11BT 16/01/2020 20/01/2020
100 A 11AS 21/01/2020 30/01/2020
数据库:Netezza 注意:这些是来自数据的示例记录。 table 中还有其他 E_ID。
谢谢
这是一个 gaps-and-islands 问题。假设你没有间隙,一个简单的方法是行号的差异:
-- Collapse runs of adjacent rows that share the same e_id (per s_id, c_id)
-- into one row spanning the earliest st_dt to the latest ed_dt of the run.
-- Technique: difference of two row numbers.
--   seqnum   numbers the rows of each (s_id, c_id) by st_dt;
--   seqnum_2 numbers the rows of each (s_id, c_id, e_id) by st_dt.
-- While e_id stays the same on adjacent rows both counters advance together,
-- so (seqnum - seqnum_2) is constant within a run and changes once another
-- e_id has appeared in between.
-- Only adjacency in st_dt order is examined; date gaps between rows are not
-- detected (the answer assumes there are none).
select s_id,
       c_id,
       e_id,
       min(st_dt) as st_dt,  -- start of the run
       max(ed_dt) as ed_dt   -- end of the run
from (select t.*,
             row_number() over (partition by s_id, c_id order by st_dt) as seqnum,
             row_number() over (partition by s_id, c_id, e_id order by st_dt) as seqnum_2
      from t
     ) numbered
group by s_id, c_id, e_id, (seqnum - seqnum_2);
这实际上是一个 gaps-and-islands(间隙与孤岛)问题。正如 @Gordon Linoff 所说的“岛”,在点击流分析、物联网数据等场景中也被称为“会话”(session)。
我会得到一个会话标识符,并在最后按它分组。
依次嵌套多个 SELECT(下面写成公用表表达式),每一层使用不同的 OLAP(窗口)函数,最后按得到的会话 ID 进行 GROUP BY,应该可以解决问题:
-- Collapse runs of adjacent rows with the same e_id (per s_id, c_id) into one
-- row per run, keeping the earliest st_dt and the latest ed_dt.
-- Steps: flag every row where e_id changes, turn the flags into a running
-- sum (the session identifier), then group by that identifier.
WITH
-- Sample input rows from the question; replace with the real table.
input(s_id,c_id,e_id,st_dt,ed_dt) AS (
          SELECT 100 ,'A' , '11AS',DATE '2020-01-01', DATE '2020-01-05'
UNION ALL SELECT 100 ,'A' , '11AS',DATE '2020-01-06', DATE '2020-01-10'
UNION ALL SELECT 100 ,'A' , '11AS',DATE '2020-01-11', DATE '2020-01-15'
UNION ALL SELECT 100 ,'A' , '11BT',DATE '2020-01-16', DATE '2020-01-20'
UNION ALL SELECT 100 ,'A' , '11AS',DATE '2020-01-21', DATE '2020-01-27'
UNION ALL SELECT 100 ,'A' , '11AS',DATE '2020-01-28', DATE '2020-01-30'
)
-- Add an integer change flag: 0 when e_id is the same as on the previous row,
-- 1 when it differs (the first row of a partition has no previous row; its
-- missing predecessor is mapped to '' so it is flagged 1 as well).
-- The window is partitioned by (s_id, c_id) so that rows belonging to other
-- keys never act as the "previous row".
,
with_chg AS (
  SELECT
    s_id
  , c_id
  , e_id
  , st_dt
  , ed_dt
  , CASE
      WHEN COALESCE(
             LAG(e_id) OVER (PARTITION BY s_id, c_id ORDER BY st_dt)
           , ''
           ) <> e_id
      THEN 1
      ELSE 0
    END AS chg
  FROM input
)
-- SELECT * FROM with_chg; -- check query ....
-- out  s_id | c_id | e_id |   st_dt    |   ed_dt    | chg
-- out ------+------+------+------------+------------+-----
-- out   100 | A    | 11AS | 2020-01-01 | 2020-01-05 |   1
-- out   100 | A    | 11AS | 2020-01-06 | 2020-01-10 |   0
-- out   100 | A    | 11AS | 2020-01-11 | 2020-01-15 |   0
-- out   100 | A    | 11BT | 2020-01-16 | 2020-01-20 |   1
-- out   100 | A    | 11AS | 2020-01-21 | 2020-01-27 |   1
-- out   100 | A    | 11AS | 2020-01-28 | 2020-01-30 |   0
-- The running sum of chg is the session identifier: it increases by one at
-- every change of e_id. Same partitioning as above; the explicit ROWS frame
-- makes the sum advance row by row instead of once per group of equal st_dt.
,
with_session AS (
  SELECT
    s_id
  , c_id
  , e_id
  , st_dt
  , ed_dt
  , SUM(chg) OVER (
      PARTITION BY s_id, c_id
      ORDER BY st_dt
      ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS session_id
  FROM with_chg
)
-- SELECT * FROM with_session; -- test query ...
-- out  s_id | c_id | e_id |   st_dt    |   ed_dt    | session_id
-- out ------+------+------+------------+------------+------------
-- out   100 | A    | 11AS | 2020-01-01 | 2020-01-05 |          1
-- out   100 | A    | 11AS | 2020-01-06 | 2020-01-10 |          1
-- out   100 | A    | 11AS | 2020-01-11 | 2020-01-15 |          1
-- out   100 | A    | 11BT | 2020-01-16 | 2020-01-20 |          2
-- out   100 | A    | 11AS | 2020-01-21 | 2020-01-27 |          3
-- out   100 | A    | 11AS | 2020-01-28 | 2020-01-30 |          3
-- Finally, GROUP BY s_id, c_id, e_id and the session identifier, and take
-- MIN(st_dt) and MAX(ed_dt) of each group.
SELECT
  s_id
, c_id
, e_id
, MIN(st_dt) AS st_dt
, MAX(ed_dt) AS ed_dt
FROM with_session
GROUP BY
  s_id
, c_id
, e_id
, session_id
-- Sort by the output column st_dt (start of each run); s_id and c_id only
-- break ties between runs that start on the same date.
ORDER BY st_dt, s_id, c_id
;
-- out s_id | c_id | e_id | st_dt | ed_dt
-- out ------+------+------+------------+------------
-- out 100 | A | 11AS | 2020-01-01 | 2020-01-15
-- out 100 | A | 11BT | 2020-01-16 | 2020-01-20
-- out 100 | A | 11AS | 2020-01-21 | 2020-01-30