如何根据最小日期间隔合并间隔?
How to merge intervals based on a minimum gap of dates?
我现在的 table 是这样的:每个病人每次到医院就诊都有就诊开始日期和结束日期,并且在 admin_startdate 和 admin_enddate 之间接受给药。例如,前两行表示,患者 PT1 在 01/01 至 01/31 的就诊期间有两次给药,一次在 01/08 至 01/10 之间,另一次在 01/12 至 01/23 之间。
ptid visit_start_date visit_end_date admin_startdate admin_enddate
PT1 2018-01-01 2018-01-31 2018-01-08 2018-01-10
PT1 2018-01-01 2018-01-31 2018-01-12 2018-01-23
PT2 2018-01-02 2018-01-18 2018-01-06 2018-01-11
PT2 2018-01-02 2018-01-18 2018-01-14 2018-01-17
我想实现的是把间隔太近的给药合并在一起——比如说,前一次给药的结束日期与下一次给药的开始日期相差 <= 2 天——
并把它们称为一个完整的 episode(给药期),如下所示:
ptid visit_start_date visit_end_date admin_startdate admin_enddate episode_startdate episode_enddate
PT1 2018-01-01 2018-01-31 2018-01-08 2018-01-10 2018-01-08 2018-01-23
PT1 2018-01-01 2018-01-31 2018-01-12 2018-01-23 2018-01-08 2018-01-23
PT2 2018-01-02 2018-01-18 2018-01-06 2018-01-11 2018-01-06 2018-01-11
PT2 2018-01-02 2018-01-18 2018-01-14 2018-01-17 2018-01-14 2018-01-17
可以看到,PT1 的两次给药被合并,拥有相同的 episode_startdate
和 episode_enddate
,而 PT2 的两次给药被视为两个独立的 episode。
我很难弄清楚如何在 PostgreSQL (Redshift) 中做到这一点。
这在 Postgres 14 中有效。未针对 Redshift 进行测试。
-- Label every drug administration with the start and end date of the episode
-- it belongs to. Within one visit, an administration opens a new episode when
-- it starts more than 2 days after the previous administration (ordered by
-- admin_startdate) ended.
-- NOTE(review): the date arithmetic assumes the admin_* columns are DATE
-- (date - date yields a day count) -- confirm against the real table.
WITH sub1 AS (
    -- One number per distinct (ptid, visit_start_date, visit_end_date)
    -- combination; optional, it only shortens the PARTITION BY clauses below.
    SELECT
        *,
        dense_rank() OVER (ORDER BY ptid, visit_start_date, visit_end_date) AS visit_id
    FROM tbl
),
sub2 AS (
    -- gap is TRUE after a break of more than 2 days and NULL on the first
    -- administration of a visit (lag() has no previous row there).
    SELECT
        *,
        (admin_startdate
            - lag(admin_enddate) OVER (PARTITION BY visit_id ORDER BY admin_startdate)
        ) > 2 AS gap
    FROM sub1
),
sub3 AS (
    -- Running count of the gaps seen so far in the visit; rows sharing the
    -- same count belong to the same episode.
    SELECT
        *,
        count(*) FILTER (WHERE gap)
            OVER (PARTITION BY visit_id ORDER BY admin_startdate) AS admin
    FROM sub2
)
SELECT
    ptid,
    visit_start_date,
    visit_end_date,
    admin_startdate,
    admin_enddate,
    min(admin_startdate) OVER (PARTITION BY visit_id, admin) AS episode_startdate,
    max(admin_enddate) OVER (PARTITION BY visit_id, admin) AS episode_enddate
FROM sub3;
db<>fiddle here
最先求值的子查询 sub1
只是计算一个唯一的 visit_id
——它其实应该直接存储在你的 table 中,而不是在每一行都重复 (ptid, visit_start_date, visit_end_date)
。至少应该考虑对你的设计进行规范化(normalization)。
下一个子查询 sub2
检查同一分区中与上一行之间是否存在大于两天的间隔。
子查询 sub3
随后对这些间隔进行累计计数,以识别不同的给药周期 (admin
)。
在外层 SELECT
中,对每个给药周期取 min(admin_startdate)
和 max(admin_enddate)
,即可得到所需的 episode 起止日期。
查看(包含指向更多内容的各种链接):
-- tb1: every row of tbl plus two helper columns, visit_id and gap.
-- NOTE(review): the date arithmetic assumes the admin_* columns are DATE
-- (date - date yields a day count) -- confirm against the real table.
CREATE TABLE tb1 AS
WITH visits AS (
    -- One number per distinct (ptid, visit_start_date, visit_end_date)
    -- combination; optional, it only shortens the PARTITION BY below.
    SELECT
        *,
        dense_rank() OVER (ORDER BY ptid, visit_start_date, visit_end_date) AS visit_id
    FROM tbl
)
SELECT
    *,
    -- TRUE when this administration starts more than 2 days after the previous
    -- one in the same visit ended; NULL for the first administration of a visit.
    (admin_startdate
        - lag(admin_enddate) OVER (PARTITION BY visit_id ORDER BY admin_startdate)
    ) > 2 AS gap
FROM visits;
-- tb2: tb1 plus admin, the 0-based episode number of each row within its visit.
--
-- The counter has to run over ALL rows of tb1, not only over the rows where
-- gap is TRUE: a row that continues an episode (gap FALSE) must inherit the
-- number of the gap row that opened that episode. Numbering only the gap rows
-- and defaulting every other row to 0 puts such a row back into episode 0.
-- Example within one visit: administrations 01-01..01-02, 01-10..01-11 and
-- 01-12..01-13 must yield episodes {1} and {2, 3}; with the default-to-0
-- approach the third row is merged with the first one instead.
--
-- count(CASE WHEN gap THEN 1 END) is the portable spelling of
-- count(*) FILTER (WHERE gap); rows with gap FALSE or NULL are not counted.
-- The explicit ROWS frame turns the aggregate into a running count.
-- Note: tb2 now holds one row per row of tb1 (previously only the gap rows).
CREATE TABLE tb2 AS (
    SELECT
        *,
        count(CASE WHEN gap THEN 1 END) OVER (
            PARTITION BY visit_id
            ORDER BY admin_startdate
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS admin
    FROM tb1
)
;
-- tb3: same column list as before (the input of tb4). Every row already
-- carries its episode number in tb2, so no join back to tb1 and no NULL
-- default are needed.
CREATE TABLE tb3 AS (
    SELECT
        ptid,
        visit_start_date,
        visit_end_date,
        admin_startdate,
        admin_enddate,
        visit_id,
        gap,
        admin
    FROM tb2
)
;
-- tb4: final result. Each administration row from tb3 is returned together
-- with the earliest start and latest end date among the rows that share its
-- (visit_id, admin) pair, i.e. the date range of its episode.
CREATE TABLE tb4 AS (
    SELECT
        t.ptid,
        t.visit_start_date,
        t.visit_end_date,
        t.admin_startdate,
        t.admin_enddate,
        min(t.admin_startdate) OVER (PARTITION BY t.visit_id, t.admin) AS episode_startdate,
        max(t.admin_enddate) OVER (PARTITION BY t.visit_id, t.admin) AS episode_enddate
    FROM tb3 AS t
)
;
这是根据 Erwin 的回答为 Redshift 改编的版本(比较丑),因为 Redshift 不支持 FILTER 子句。至少在 db fiddle
上测试结果正确。
我现在的 table 是这样的:每个病人每次到医院就诊都有就诊开始日期和结束日期,并且在 admin_startdate 和 admin_enddate 之间接受给药。例如,前两行表示,患者 PT1 在 01/01 至 01/31 的就诊期间有两次给药,一次在 01/08 至 01/10 之间,另一次在 01/12 至 01/23 之间。
ptid visit_start_date visit_end_date admin_startdate admin_enddate
PT1 2018-01-01 2018-01-31 2018-01-08 2018-01-10
PT1 2018-01-01 2018-01-31 2018-01-12 2018-01-23
PT2 2018-01-02 2018-01-18 2018-01-06 2018-01-11
PT2 2018-01-02 2018-01-18 2018-01-14 2018-01-17
我想实现的是把间隔太近的给药合并在一起——比如说,前一次给药的结束日期与下一次给药的开始日期相差 <= 2 天——
并把它们称为一个完整的 episode(给药期),如下所示:
ptid visit_start_date visit_end_date admin_startdate admin_enddate episode_startdate episode_enddate
PT1 2018-01-01 2018-01-31 2018-01-08 2018-01-10 2018-01-08 2018-01-23
PT1 2018-01-01 2018-01-31 2018-01-12 2018-01-23 2018-01-08 2018-01-23
PT2 2018-01-02 2018-01-18 2018-01-06 2018-01-11 2018-01-06 2018-01-11
PT2 2018-01-02 2018-01-18 2018-01-14 2018-01-17 2018-01-14 2018-01-17
可以看到,PT1 的两次给药被合并,拥有相同的 episode_startdate
和 episode_enddate
,而 PT2 的两次给药被视为两个独立的 episode。
我很难弄清楚如何在 PostgreSQL (Redshift) 中做到这一点。
这在 Postgres 14 中有效。未针对 Redshift 进行测试。
-- Label every drug administration with the start and end date of the episode
-- it belongs to. Within one visit, an administration opens a new episode when
-- it starts more than 2 days after the previous administration (ordered by
-- admin_startdate) ended.
-- NOTE(review): the date arithmetic assumes the admin_* columns are DATE
-- (date - date yields a day count) -- confirm against the real table.
WITH sub1 AS (
    -- One number per distinct (ptid, visit_start_date, visit_end_date)
    -- combination; optional, it only shortens the PARTITION BY clauses below.
    SELECT
        *,
        dense_rank() OVER (ORDER BY ptid, visit_start_date, visit_end_date) AS visit_id
    FROM tbl
),
sub2 AS (
    -- gap is TRUE after a break of more than 2 days and NULL on the first
    -- administration of a visit (lag() has no previous row there).
    SELECT
        *,
        (admin_startdate
            - lag(admin_enddate) OVER (PARTITION BY visit_id ORDER BY admin_startdate)
        ) > 2 AS gap
    FROM sub1
),
sub3 AS (
    -- Running count of the gaps seen so far in the visit; rows sharing the
    -- same count belong to the same episode.
    SELECT
        *,
        count(*) FILTER (WHERE gap)
            OVER (PARTITION BY visit_id ORDER BY admin_startdate) AS admin
    FROM sub2
)
SELECT
    ptid,
    visit_start_date,
    visit_end_date,
    admin_startdate,
    admin_enddate,
    min(admin_startdate) OVER (PARTITION BY visit_id, admin) AS episode_startdate,
    max(admin_enddate) OVER (PARTITION BY visit_id, admin) AS episode_enddate
FROM sub3;
db<>fiddle here
最先求值的子查询 sub1
只是计算一个唯一的 visit_id
——它其实应该直接存储在你的 table 中,而不是在每一行都重复 (ptid, visit_start_date, visit_end_date)
。至少应该考虑对你的设计进行规范化(normalization)。
下一个子查询 sub2
检查同一分区中与上一行之间是否存在大于两天的间隔。
子查询 sub3
随后对这些间隔进行累计计数,以识别不同的给药周期 (admin
)。
在外层 SELECT
中,对每个给药周期取 min(admin_startdate)
和 max(admin_enddate)
,即可得到所需的 episode 起止日期。
查看(包含指向更多内容的各种链接):
-- tb1: every row of tbl plus two helper columns, visit_id and gap.
-- NOTE(review): the date arithmetic assumes the admin_* columns are DATE
-- (date - date yields a day count) -- confirm against the real table.
CREATE TABLE tb1 AS
WITH visits AS (
    -- One number per distinct (ptid, visit_start_date, visit_end_date)
    -- combination; optional, it only shortens the PARTITION BY below.
    SELECT
        *,
        dense_rank() OVER (ORDER BY ptid, visit_start_date, visit_end_date) AS visit_id
    FROM tbl
)
SELECT
    *,
    -- TRUE when this administration starts more than 2 days after the previous
    -- one in the same visit ended; NULL for the first administration of a visit.
    (admin_startdate
        - lag(admin_enddate) OVER (PARTITION BY visit_id ORDER BY admin_startdate)
    ) > 2 AS gap
FROM visits;
-- tb2: tb1 plus admin, the 0-based episode number of each row within its visit.
--
-- The counter has to run over ALL rows of tb1, not only over the rows where
-- gap is TRUE: a row that continues an episode (gap FALSE) must inherit the
-- number of the gap row that opened that episode. Numbering only the gap rows
-- and defaulting every other row to 0 puts such a row back into episode 0.
-- Example within one visit: administrations 01-01..01-02, 01-10..01-11 and
-- 01-12..01-13 must yield episodes {1} and {2, 3}; with the default-to-0
-- approach the third row is merged with the first one instead.
--
-- count(CASE WHEN gap THEN 1 END) is the portable spelling of
-- count(*) FILTER (WHERE gap); rows with gap FALSE or NULL are not counted.
-- The explicit ROWS frame turns the aggregate into a running count.
-- Note: tb2 now holds one row per row of tb1 (previously only the gap rows).
CREATE TABLE tb2 AS (
    SELECT
        *,
        count(CASE WHEN gap THEN 1 END) OVER (
            PARTITION BY visit_id
            ORDER BY admin_startdate
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS admin
    FROM tb1
)
;
-- tb3: same column list as before (the input of tb4). Every row already
-- carries its episode number in tb2, so no join back to tb1 and no NULL
-- default are needed.
CREATE TABLE tb3 AS (
    SELECT
        ptid,
        visit_start_date,
        visit_end_date,
        admin_startdate,
        admin_enddate,
        visit_id,
        gap,
        admin
    FROM tb2
)
;
-- tb4: final result. Each administration row from tb3 is returned together
-- with the earliest start and latest end date among the rows that share its
-- (visit_id, admin) pair, i.e. the date range of its episode.
CREATE TABLE tb4 AS (
    SELECT
        t.ptid,
        t.visit_start_date,
        t.visit_end_date,
        t.admin_startdate,
        t.admin_enddate,
        min(t.admin_startdate) OVER (PARTITION BY t.visit_id, t.admin) AS episode_startdate,
        max(t.admin_enddate) OVER (PARTITION BY t.visit_id, t.admin) AS episode_enddate
    FROM tb3 AS t
)
;
这是根据 Erwin 的回答为 Redshift 改编的版本(比较丑),因为 Redshift 不支持 FILTER 子句。至少在 db fiddle
上测试结果正确。