过滤掉重叠日期的 SQL 查询

Sql query to filter out the overlapping dates

Version start_date end_date
1 2005-11-23 2005-11-23
2 2005-11-23 2005-11-23
3 2005-11-23 2008-10-23
4 2008-10-23 2010-05-18
5 2011-05-13 2012-05-19

在上面的表中,版本 1、2、3、4 的日期区间相互重叠,因此不必分别保留这四个版本,而是把它们合并为一条记录:版本 1,从“2005-11-23”到“2010-05-18”。版本 5 不与其他版本重叠,保持不变。

期望的输出:

Version start_date end_date
1 2005-11-23 2010-05-18
5 2011-05-13 2012-05-19

针对这个场景,应该如何编写 SQL 查询?

Hive 或 Postgresql

-- Sample data: one row per version with its [start_date, end_date] range.
-- Versions 1-3 share the same start date and version 4 starts on the day
-- version 3 ends, so versions 1-4 form one overlapping run; version 5 starts
-- after every earlier range has ended.
CREATE TABLE my_dates (
    "Version"  INTEGER,
    start_date DATE,
    end_date   DATE
);

-- "Version" is an INTEGER column, so it is given numeric literals, and the
-- date columns are given typed DATE literals, instead of relying on implicit
-- coercion from quoted strings.
INSERT INTO my_dates
    ("Version", start_date, end_date)
VALUES
    (1, DATE '2005-11-23', DATE '2005-11-23'),
    (2, DATE '2005-11-23', DATE '2005-11-23'),
    (3, DATE '2005-11-23', DATE '2008-10-23'),
    (4, DATE '2008-10-23', DATE '2010-05-18'),
    (5, DATE '2011-05-13', DATE '2012-05-19');

查询#1

-- Merge every run of overlapping versions into a single row. The merged row
-- keeps the "Version" and start_date of the first row of the run and the
-- latest end_date found anywhere in the run.
-- NOTE: overlap is detected only against the immediately preceding version
-- (LAG), so a row that overlaps an older, longer range but not its direct
-- predecessor still starts a new run.
WITH my_overlaps AS (
    -- overlap is TRUE when the previous version (ordered by "Version") ends on
    -- or after this row's start_date; it is NULL for the very first row.
    SELECT
        "Version",
        start_date,
        end_date,
        LAG(end_date) OVER (ORDER BY "Version") >= start_date AS overlap
    FROM my_dates
),
selected AS (
    -- Keep only the first row of each run. next_version is the first version
    -- of the following run, or NULL when this is the last run.
    SELECT
        "Version",
        start_date,
        end_date,
        LEAD("Version") OVER (ORDER BY "Version") AS next_version
    FROM my_overlaps
    WHERE overlap IS NOT TRUE
)
SELECT
    s."Version",
    s.start_date,
    -- Fall back to the run's own end_date if no member row supplies one.
    COALESCE(MAX(md.end_date), s.end_date) AS end_date
FROM selected AS s
LEFT JOIN my_dates AS md
    -- Every row from the start of this run up to, but not including, the next
    -- run. A range comparison is used so gaps in the version numbers and a
    -- trailing run (next_version IS NULL) are both handled.
    ON md."Version" >= s."Version"
    AND (s.next_version IS NULL OR md."Version" < s.next_version)
GROUP BY
    s."Version",
    s.start_date,
    s.end_date
ORDER BY
    s."Version";
Version start_date end_date
1 2005-11-23T00:00:00.000Z 2010-05-18T00:00:00.000Z
5 2011-05-13T00:00:00.000Z 2012-05-19T00:00:00.000Z

View on DB Fiddle

模式(PostgreSQL v13)

-- Sample data: one row per version with its [start_date, end_date] range.
-- Versions 1-4 form one overlapping run; version 5 stands alone.
CREATE TABLE my_dates (
    "Version"  INTEGER,
    start_date DATE,
    end_date   DATE
);

-- Numeric literals for the INTEGER column and typed DATE literals for the
-- date columns, rather than quoted strings that depend on implicit coercion.
INSERT INTO my_dates
    ("Version", start_date, end_date)
VALUES
    (1, DATE '2005-11-23', DATE '2005-11-23'),
    (2, DATE '2005-11-23', DATE '2005-11-23'),
    (3, DATE '2005-11-23', DATE '2008-10-23'),
    (4, DATE '2008-10-23', DATE '2010-05-18'),
    (5, DATE '2011-05-13', DATE '2012-05-19');

查询#1

-- Merge every run of overlapping versions into a single row and return the
-- dates as text. The merged row keeps the "Version" and start_date of the
-- first row of the run and the latest end_date found anywhere in the run.
-- NOTE: overlap is detected only against the immediately preceding version
-- (LAG), so a row that overlaps an older, longer range but not its direct
-- predecessor still starts a new run.
WITH my_overlaps AS (
    -- overlap is TRUE when the previous version (ordered by "Version") ends on
    -- or after this row's start_date; it is NULL for the very first row.
    SELECT
        "Version",
        start_date,
        end_date,
        LAG(end_date) OVER (ORDER BY "Version") >= start_date AS overlap
    FROM my_dates
),
selected AS (
    -- Keep only the first row of each run. next_version is the first version
    -- of the following run, or NULL when this is the last run.
    SELECT
        "Version",
        start_date,
        end_date,
        LEAD("Version") OVER (ORDER BY "Version") AS next_version
    FROM my_overlaps
    WHERE overlap IS NOT TRUE
)
SELECT
    s."Version",
    s.start_date::text AS start_date,
    -- Fall back to the run's own end_date if no member row supplies one.
    COALESCE(MAX(md.end_date), s.end_date)::text AS end_date
FROM selected AS s
LEFT JOIN my_dates AS md
    -- Every row from the start of this run up to, but not including, the next
    -- run. A range comparison is used so gaps in the version numbers and a
    -- trailing run (next_version IS NULL) are both handled.
    ON md."Version" >= s."Version"
    AND (s.next_version IS NULL OR md."Version" < s.next_version)
GROUP BY
    s."Version",
    s.start_date,
    s.end_date
ORDER BY
    s."Version";
Version start_date end_date
1 2005-11-23 2010-05-18
5 2011-05-13 2012-05-19

View on DB Fiddle

更新 1

现在为 LAG/LEAD 函数指定了默认值。

模式(PostgreSQL v13)

-- Sample data for the "trailing run" case: both versions start on the same
-- day, so they overlap and the last run in the table contains two rows.
CREATE TABLE my_dates (
    "Version"  INTEGER,
    start_date DATE,
    end_date   DATE
);

-- Numeric literals for the INTEGER column and typed DATE literals for the
-- date columns, rather than quoted strings that depend on implicit coercion.
INSERT INTO my_dates
    ("Version", start_date, end_date)
VALUES
    (1, DATE '2005-11-23', DATE '2005-11-23'),
    (2, DATE '2005-11-23', DATE '2012-05-19');

查询#1

-- Merge every run of overlapping versions into a single row and return the
-- dates as text. LAG/LEAD are given explicit defaults so that the last run in
-- the table is closed correctly even when it contains several rows.
-- NOTE: overlap is detected only against the immediately preceding version
-- (LAG), so a row that overlaps an older, longer range but not its direct
-- predecessor still starts a new run.
WITH my_overlaps AS (
    -- overlap is TRUE when the previous version (ordered by "Version") ends on
    -- or after this row's start_date; it is NULL for the very first row.
    SELECT
        "Version",
        start_date,
        end_date,
        LAG(end_date, 1, NULL) OVER (ORDER BY "Version") >= start_date AS overlap
    FROM my_dates
),
selected AS (
    -- Keep only the first row of each run. next_version is the first version
    -- of the following run; for the last run it defaults to one past the
    -- highest version in the table, computed from the data rather than
    -- hard-coded.
    SELECT
        "Version",
        start_date,
        end_date,
        LEAD("Version", 1, (SELECT MAX("Version") + 1 FROM my_dates))
            OVER (ORDER BY "Version") AS next_version
    FROM my_overlaps
    WHERE overlap IS NOT TRUE
)
SELECT
    s."Version",
    s.start_date::text AS start_date,
    -- Fall back to the run's own end_date if no member row supplies one.
    COALESCE(MAX(md.end_date), s.end_date)::text AS end_date
FROM selected AS s
LEFT JOIN my_dates AS md
    -- Every row from the start of this run up to, but not including, the next
    -- run; a range comparison tolerates gaps in the version numbers.
    ON md."Version" >= s."Version"
    AND md."Version" < s.next_version
GROUP BY
    s."Version",
    s.start_date,
    s.end_date
ORDER BY
    s."Version";
Version start_date end_date
1 2005-11-23 2012-05-19

View on DB Fiddle

使用原始数据集

模式(PostgreSQL v13)

-- Original sample data: one row per version with its [start_date, end_date]
-- range. Versions 1-4 form one overlapping run; version 5 stands alone.
CREATE TABLE my_dates (
    "Version"  INTEGER,
    start_date DATE,
    end_date   DATE
);

-- Numeric literals for the INTEGER column and typed DATE literals for the
-- date columns, rather than quoted strings that depend on implicit coercion.
INSERT INTO my_dates
    ("Version", start_date, end_date)
VALUES
    (1, DATE '2005-11-23', DATE '2005-11-23'),
    (2, DATE '2005-11-23', DATE '2005-11-23'),
    (3, DATE '2005-11-23', DATE '2008-10-23'),
    (4, DATE '2008-10-23', DATE '2010-05-18'),
    (5, DATE '2011-05-13', DATE '2012-05-19');

查询#1

-- Merge every run of overlapping versions into a single row and return the
-- dates as text. LAG/LEAD are given explicit defaults so that the last run in
-- the table is closed correctly even when it contains several rows.
-- NOTE: overlap is detected only against the immediately preceding version
-- (LAG), so a row that overlaps an older, longer range but not its direct
-- predecessor still starts a new run.
WITH my_overlaps AS (
    -- overlap is TRUE when the previous version (ordered by "Version") ends on
    -- or after this row's start_date; it is NULL for the very first row.
    SELECT
        "Version",
        start_date,
        end_date,
        LAG(end_date, 1, NULL) OVER (ORDER BY "Version") >= start_date AS overlap
    FROM my_dates
),
selected AS (
    -- Keep only the first row of each run. next_version is the first version
    -- of the following run; for the last run it defaults to one past the
    -- highest version in the table. A fixed default such as 3 would point the
    -- last run back at an unrelated earlier version on this data set.
    SELECT
        "Version",
        start_date,
        end_date,
        LEAD("Version", 1, (SELECT MAX("Version") + 1 FROM my_dates))
            OVER (ORDER BY "Version") AS next_version
    FROM my_overlaps
    WHERE overlap IS NOT TRUE
)
SELECT
    s."Version",
    s.start_date::text AS start_date,
    -- Fall back to the run's own end_date if no member row supplies one.
    COALESCE(MAX(md.end_date), s.end_date)::text AS end_date
FROM selected AS s
LEFT JOIN my_dates AS md
    -- Every row from the start of this run up to, but not including, the next
    -- run; a range comparison tolerates gaps in the version numbers.
    ON md."Version" >= s."Version"
    AND md."Version" < s.next_version
GROUP BY
    s."Version",
    s.start_date,
    s.end_date
ORDER BY
    s."Version";
Version start_date end_date
1 2005-11-23 2010-05-18
5 2011-05-13 2012-05-19

View on DB Fiddle

处理此问题的最安全方法——假设您可以对行建立稳定的排序(这里由 version 提供)——是使用累积最大值,而不是 lag():

-- Merge overlapping date ranges using a running maximum of end_date.
-- Rows are visited in (start_date, version) order; a row starts a new group
-- when no earlier row ends on or after its start_date. Each group is then
-- reported as one row: its lowest version, its earliest start_date and its
-- latest end_date.
with with_prev_max as (
    -- prev_max_end_date: the latest end_date among all rows that come before
    -- this one in (start_date, version) order; NULL for the first row.
    select version,
           start_date,
           end_date,
           max(end_date) over (order by start_date, version
                               rows between unbounded preceding and 1 preceding
                              ) as prev_max_end_date
    from t
),
with_grp as (
    -- grp: running count of "group start" rows, i.e. rows that do not overlap
    -- anything before them (a NULL prev_max_end_date also counts as a start).
    select version,
           start_date,
           end_date,
           sum(case when prev_max_end_date >= start_date then 0 else 1 end) over
               (order by start_date, version
                rows between unbounded preceding and current row
               ) as grp
    from with_prev_max
)
select min(version)    as version,
       min(start_date) as start_date,
       -- the merged range ends at the latest end_date of the group
       max(end_date)   as end_date
from with_grp
group by grp
order by grp;

这应该适用于任何(合理的)数据库。这里有一个 db<>fiddle 示例,它恰好使用的是 Postgres。

lag()/lead() 方法的问题在于:与某一行重叠的较早的行,不一定是它的“前一行”。例如,考虑下图(小写字母表示区间开始,大写字母表示区间结束):

---a----b--B----c--C----d--D--e---A--E--

E 与 A 重叠。但是,按照“前一行”的任何合理定义,A 都不是 E 的前一行。