Sql 过滤掉重叠日期的查询
Sql query to filter out the overlapping dates
Version
start_date
end_date
1
2005-11-23
2005-11-23
2
2005-11-23
2005-11-23
3
2005-11-23
2008-10-23
4
2008-10-23
2010-05-18
5
2011-05-13
2012-05-19
在上面的表中,版本 1、2、3、4 的日期范围相互重叠,因此不必分别保留这四个版本,只需保留版本 1,并将其范围设为“2005-11-23”至“2010-05-18”;版本 5 不与其他版本重叠,保持不变。
需要输出
...............
Version
start_date
end_date
1
2005-11-23
2010-05-18
5
2011-05-13
2012-05-19
针对这个场景,我们该如何编写 SQL 查询?
Hive 或 Postgresql
-- Sample fixture: one row per version together with its validity window.
-- "Version" is a quoted identifier, so it keeps its capital V and has to be
-- quoted the same way wherever it is referenced.
CREATE TABLE my_dates (
    "Version"  INTEGER,
    start_date DATE,
    end_date   DATE
);

-- Typed literals are used (integers unquoted, DATE '...') so the insert does
-- not depend on implicit text-to-integer / text-to-date coercion.
-- Versions 1-4 touch or overlap one another; version 5 starts after a gap.
INSERT INTO my_dates ("Version", start_date, end_date)
VALUES
    (1, DATE '2005-11-23', DATE '2005-11-23'),
    (2, DATE '2005-11-23', DATE '2005-11-23'),
    (3, DATE '2005-11-23', DATE '2008-10-23'),
    (4, DATE '2008-10-23', DATE '2010-05-18'),
    (5, DATE '2011-05-13', DATE '2012-05-19');
查询#1
-- Collapse runs of overlapping versions in my_dates into one row per run.
-- Output: "Version" and start_date of the row that opens each run, plus the
-- latest end_date found among the rows of that run.
--
-- Changes from the earlier form of this query:
--   * the closing row was looked up with next_version - 1 = "Version", which
--     only works when version numbers are consecutive and never extended the
--     final run (its next_version is NULL, so nothing joined);
--   * end_date is now the MAX() over the run instead of the value on the
--     run's last row.
--
-- Known limitation: each row is compared only with the row immediately before
-- it (LAG), so an overlap with an earlier, longer-lasting row is not detected.
WITH my_overlaps AS (
    -- overlap is NULL for the first row, TRUE when the previous row's end_date
    -- reaches this row's start_date, FALSE otherwise.
    SELECT
        "Version",
        start_date,
        end_date,
        LAG(end_date) OVER (ORDER BY "Version") >= start_date AS overlap
    FROM my_dates
),
selected AS (
    -- Keep only the rows that open a run. next_version is the "Version" that
    -- opens the following run, or NULL for the final run.
    SELECT
        "Version",
        start_date,
        end_date,
        LEAD("Version") OVER (ORDER BY "Version") AS next_version
    FROM my_overlaps
    WHERE overlap IS NOT TRUE
)
SELECT
    s."Version",
    s.start_date,
    COALESCE(MAX(md.end_date), s.end_date) AS end_date
FROM selected AS s
LEFT JOIN my_dates AS md
    -- every row from the run's opening version up to (not including) the next run
    ON md."Version" >= s."Version"
    AND (s.next_version IS NULL OR md."Version" < s.next_version)
GROUP BY
    s."Version",
    s.start_date,
    s.end_date
ORDER BY s."Version";
Version
start_date
end_date
1
2005-11-23T00:00:00.000Z
2010-05-18T00:00:00.000Z
5
2011-05-13T00:00:00.000Z
2012-05-19T00:00:00.000Z
模式(PostgreSQL v13)
-- Sample fixture: one row per version together with its validity window.
-- "Version" is a quoted identifier, so it keeps its capital V and has to be
-- quoted the same way wherever it is referenced.
CREATE TABLE my_dates (
    "Version"  INTEGER,
    start_date DATE,
    end_date   DATE
);

-- Typed literals are used (integers unquoted, DATE '...') so the insert does
-- not depend on implicit text-to-integer / text-to-date coercion.
-- Versions 1-4 touch or overlap one another; version 5 starts after a gap.
INSERT INTO my_dates ("Version", start_date, end_date)
VALUES
    (1, DATE '2005-11-23', DATE '2005-11-23'),
    (2, DATE '2005-11-23', DATE '2005-11-23'),
    (3, DATE '2005-11-23', DATE '2008-10-23'),
    (4, DATE '2008-10-23', DATE '2010-05-18'),
    (5, DATE '2011-05-13', DATE '2012-05-19');
查询#1
-- Collapse runs of overlapping versions in my_dates into one row per run and
-- return the dates as text (plain YYYY-MM-DD, without a time component).
-- Output: "Version" and start_date of the row that opens each run, plus the
-- latest end_date found among the rows of that run.
--
-- Changes from the earlier form of this query:
--   * the closing row was looked up with next_version - 1 = "Version", which
--     only works when version numbers are consecutive and never extended the
--     final run (its next_version is NULL, so nothing joined);
--   * end_date is now the MAX() over the run instead of the value on the
--     run's last row.
--
-- Known limitation: each row is compared only with the row immediately before
-- it (LAG), so an overlap with an earlier, longer-lasting row is not detected.
WITH my_overlaps AS (
    -- overlap is NULL for the first row, TRUE when the previous row's end_date
    -- reaches this row's start_date, FALSE otherwise.
    SELECT
        "Version",
        start_date,
        end_date,
        LAG(end_date) OVER (ORDER BY "Version") >= start_date AS overlap
    FROM my_dates
),
selected AS (
    -- Keep only the rows that open a run. next_version is the "Version" that
    -- opens the following run, or NULL for the final run.
    SELECT
        "Version",
        start_date,
        end_date,
        LEAD("Version") OVER (ORDER BY "Version") AS next_version
    FROM my_overlaps
    WHERE overlap IS NOT TRUE
)
SELECT
    s."Version",
    s.start_date::text AS start_date,
    COALESCE(MAX(md.end_date), s.end_date)::text AS end_date
FROM selected AS s
LEFT JOIN my_dates AS md
    -- every row from the run's opening version up to (not including) the next run
    ON md."Version" >= s."Version"
    AND (s.next_version IS NULL OR md."Version" < s.next_version)
GROUP BY
    s."Version",
    s.start_date,
    s.end_date
ORDER BY s."Version";
Version
start_date
end_date
1
2005-11-23
2010-05-18
5
2011-05-13
2012-05-19
更新 1:Lag/Lead 函数现在指定了默认值。
模式(PostgreSQL v13)
-- Two-row fixture: both versions start on the same day, and version 2 runs
-- far longer than version 1, so the two rows form a single overlapping run.
-- "Version" is a quoted identifier and must be quoted wherever it is used.
CREATE TABLE my_dates (
    "Version"  INTEGER,
    start_date DATE,
    end_date   DATE
);

-- Typed literals are used (integers unquoted, DATE '...') so the insert does
-- not depend on implicit text-to-integer / text-to-date coercion.
INSERT INTO my_dates ("Version", start_date, end_date)
VALUES
    (1, DATE '2005-11-23', DATE '2005-11-23'),
    (2, DATE '2005-11-23', DATE '2012-05-19');
查询#1
-- Collapse runs of overlapping versions in my_dates into one row per run,
-- using LAG/LEAD with explicit defaults, and return the dates as text.
--
-- Changes from the earlier form of this query:
--   * a stray ';' in front of ORDER BY ended the statement early and left the
--     ORDER BY clause as a syntax error;
--   * the LEAD default was the literal 3, which is only "one past the last
--     version" when the table holds exactly versions 1 and 2; it is now
--     derived from the data;
--   * the closing row was looked up with next_version - 1 = "Version", which
--     requires consecutive version numbers; the join now spans the whole run
--     and takes MAX(end_date).
--
-- Known limitation: each row is compared only with the row immediately before
-- it (LAG), so an overlap with an earlier, longer-lasting row is not detected.
WITH my_overlaps AS (
    -- overlap is NULL for the first row (LAG default), TRUE when the previous
    -- row's end_date reaches this row's start_date, FALSE otherwise.
    SELECT
        "Version",
        start_date,
        end_date,
        LAG(end_date, 1, NULL) OVER (ORDER BY "Version") >= start_date AS overlap
    FROM my_dates
),
version_bounds AS (
    -- Sentinel used as the LEAD default: one past the highest version.
    SELECT MAX("Version") + 1 AS past_last_version
    FROM my_dates
),
selected AS (
    -- Keep only the rows that open a run. next_version is the "Version" that
    -- opens the following run, or the sentinel for the final run.
    SELECT
        o."Version",
        o.start_date,
        o.end_date,
        LEAD(o."Version", 1, b.past_last_version)
            OVER (ORDER BY o."Version") AS next_version
    FROM my_overlaps AS o
    CROSS JOIN version_bounds AS b
    WHERE o.overlap IS NOT TRUE
)
SELECT
    s."Version",
    s.start_date::text AS start_date,
    COALESCE(MAX(md.end_date), s.end_date)::text AS end_date
FROM selected AS s
LEFT JOIN my_dates AS md
    -- every row from the run's opening version up to (not including) the next run
    ON md."Version" >= s."Version"
    AND md."Version" < s.next_version
GROUP BY
    s."Version",
    s.start_date,
    s.end_date
ORDER BY s."Version";
Version
start_date
end_date
1
2005-11-23
2012-05-19
有原始数据集
模式(PostgreSQL v13)
-- Sample fixture: one row per version together with its validity window.
-- "Version" is a quoted identifier, so it keeps its capital V and has to be
-- quoted the same way wherever it is referenced.
CREATE TABLE my_dates (
    "Version"  INTEGER,
    start_date DATE,
    end_date   DATE
);

-- Typed literals are used (integers unquoted, DATE '...') so the insert does
-- not depend on implicit text-to-integer / text-to-date coercion.
-- Versions 1-4 touch or overlap one another; version 5 starts after a gap.
INSERT INTO my_dates ("Version", start_date, end_date)
VALUES
    (1, DATE '2005-11-23', DATE '2005-11-23'),
    (2, DATE '2005-11-23', DATE '2005-11-23'),
    (3, DATE '2005-11-23', DATE '2008-10-23'),
    (4, DATE '2008-10-23', DATE '2010-05-18'),
    (5, DATE '2011-05-13', DATE '2012-05-19');
查询#1
-- Collapse runs of overlapping versions in my_dates into one row per run,
-- using LAG/LEAD with explicit defaults, and return the dates as text.
--
-- Changes from the earlier form of this query:
--   * the LEAD default was the literal 3. For the final run that made
--     next_version - 1 equal to 2, so the run was joined to version 2 and
--     reported that row's end_date (2005-11-23 for version 5 of the five-row
--     sample) instead of its own. The default is now derived from the data.
--   * the closing row was looked up with next_version - 1 = "Version", which
--     requires consecutive version numbers; the join now spans the whole run
--     and takes MAX(end_date).
-- For the five-row sample the result is
--   (1, 2005-11-23, 2010-05-18) and (5, 2011-05-13, 2012-05-19).
--
-- Known limitation: each row is compared only with the row immediately before
-- it (LAG), so an overlap with an earlier, longer-lasting row is not detected.
WITH my_overlaps AS (
    -- overlap is NULL for the first row (LAG default), TRUE when the previous
    -- row's end_date reaches this row's start_date, FALSE otherwise.
    SELECT
        "Version",
        start_date,
        end_date,
        LAG(end_date, 1, NULL) OVER (ORDER BY "Version") >= start_date AS overlap
    FROM my_dates
),
version_bounds AS (
    -- Sentinel used as the LEAD default: one past the highest version.
    SELECT MAX("Version") + 1 AS past_last_version
    FROM my_dates
),
selected AS (
    -- Keep only the rows that open a run. next_version is the "Version" that
    -- opens the following run, or the sentinel for the final run.
    SELECT
        o."Version",
        o.start_date,
        o.end_date,
        LEAD(o."Version", 1, b.past_last_version)
            OVER (ORDER BY o."Version") AS next_version
    FROM my_overlaps AS o
    CROSS JOIN version_bounds AS b
    WHERE o.overlap IS NOT TRUE
)
SELECT
    s."Version",
    s.start_date::text AS start_date,
    COALESCE(MAX(md.end_date), s.end_date)::text AS end_date
FROM selected AS s
LEFT JOIN my_dates AS md
    -- every row from the run's opening version up to (not including) the next run
    ON md."Version" >= s."Version"
    AND md."Version" < s.next_version
GROUP BY
    s."Version",
    s.start_date,
    s.end_date
ORDER BY s."Version";
Version
start_date
end_date
1
2005-11-23
2010-05-18
5
2011-05-13
2005-11-23
处理此问题最安全的方法是使用累积最大值,而不是 lag()——前提是可以对行建立稳定的排序(version 列可以提供这种排序)。
-- Merge overlapping date ranges in t into one row per group, using a running
-- maximum of end_date rather than the previous row's end_date.
-- t is read for the columns version, start_date and end_date.
--
-- Fix: the merged range must end at the latest end_date in the group, so the
-- third column is MAX(end_date); the earlier MIN(end_date) returned the
-- earliest end instead. Result columns are now named explicitly.
WITH with_running_max AS (
    -- Latest end_date among all rows that sort before the current one;
    -- NULL for the first row.
    SELECT
        version,
        start_date,
        end_date,
        MAX(end_date) OVER (
            ORDER BY start_date, version
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS prev_max_end_date
    FROM t
),
with_group AS (
    -- A row opens a new group when no earlier row reaches its start_date
    -- (this includes the first row, where the comparison yields NULL).
    -- The running count of such rows is the group number.
    SELECT
        version,
        start_date,
        end_date,
        SUM(CASE WHEN prev_max_end_date >= start_date THEN 0 ELSE 1 END) OVER (
            ORDER BY start_date, version
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS grp
    FROM with_running_max
)
SELECT
    MIN(version)    AS version,
    MIN(start_date) AS start_date,
    MAX(end_date)   AS end_date
FROM with_group
GROUP BY grp
ORDER BY grp;
这应该适用于任何(合理的)数据库。这里是一个 db<>fiddle 示例,它恰好使用 Postgres。
lag()/lead() 方法的问题在于:与之前某一行的重叠不一定发生在紧邻的“前一”行上。例如,考虑下图(小写字母表示开始,大写字母表示结束):
---a----b--B----c--C----d--D--e---A--E--
E 与 A 重叠。但是,按照“前一行”的任何合理定义,A 都不是 E 的前一行。
Version | start_date | end_date |
---|---|---|
1 | 2005-11-23 | 2005-11-23 |
2 | 2005-11-23 | 2005-11-23 |
3 | 2005-11-23 | 2008-10-23 |
4 | 2008-10-23 | 2010-05-18 |
5 | 2011-05-13 | 2012-05-19 |
在上面的表中,版本 1、2、3、4 的日期范围相互重叠,因此不必分别保留这四个版本,只需保留版本 1,并将其范围设为“2005-11-23”至“2010-05-18”;版本 5 不与其他版本重叠,保持不变。
需要输出 ...............
Version | start_date | end_date |
---|---|---|
1 | 2005-11-23 | 2010-05-18 |
5 | 2011-05-13 | 2012-05-19 |
针对这个场景,我们该如何编写 SQL 查询?
Hive 或 Postgresql
-- Sample fixture: one row per version together with its validity window.
-- "Version" is a quoted identifier, so it keeps its capital V and has to be
-- quoted the same way wherever it is referenced.
CREATE TABLE my_dates (
    "Version"  INTEGER,
    start_date DATE,
    end_date   DATE
);

-- Typed literals are used (integers unquoted, DATE '...') so the insert does
-- not depend on implicit text-to-integer / text-to-date coercion.
-- Versions 1-4 touch or overlap one another; version 5 starts after a gap.
INSERT INTO my_dates ("Version", start_date, end_date)
VALUES
    (1, DATE '2005-11-23', DATE '2005-11-23'),
    (2, DATE '2005-11-23', DATE '2005-11-23'),
    (3, DATE '2005-11-23', DATE '2008-10-23'),
    (4, DATE '2008-10-23', DATE '2010-05-18'),
    (5, DATE '2011-05-13', DATE '2012-05-19');
查询#1
-- Collapse runs of overlapping versions in my_dates into one row per run.
-- Output: "Version" and start_date of the row that opens each run, plus the
-- latest end_date found among the rows of that run.
--
-- Changes from the earlier form of this query:
--   * the closing row was looked up with next_version - 1 = "Version", which
--     only works when version numbers are consecutive and never extended the
--     final run (its next_version is NULL, so nothing joined);
--   * end_date is now the MAX() over the run instead of the value on the
--     run's last row.
--
-- Known limitation: each row is compared only with the row immediately before
-- it (LAG), so an overlap with an earlier, longer-lasting row is not detected.
WITH my_overlaps AS (
    -- overlap is NULL for the first row, TRUE when the previous row's end_date
    -- reaches this row's start_date, FALSE otherwise.
    SELECT
        "Version",
        start_date,
        end_date,
        LAG(end_date) OVER (ORDER BY "Version") >= start_date AS overlap
    FROM my_dates
),
selected AS (
    -- Keep only the rows that open a run. next_version is the "Version" that
    -- opens the following run, or NULL for the final run.
    SELECT
        "Version",
        start_date,
        end_date,
        LEAD("Version") OVER (ORDER BY "Version") AS next_version
    FROM my_overlaps
    WHERE overlap IS NOT TRUE
)
SELECT
    s."Version",
    s.start_date,
    COALESCE(MAX(md.end_date), s.end_date) AS end_date
FROM selected AS s
LEFT JOIN my_dates AS md
    -- every row from the run's opening version up to (not including) the next run
    ON md."Version" >= s."Version"
    AND (s.next_version IS NULL OR md."Version" < s.next_version)
GROUP BY
    s."Version",
    s.start_date,
    s.end_date
ORDER BY s."Version";
Version | start_date | end_date |
---|---|---|
1 | 2005-11-23T00:00:00.000Z | 2010-05-18T00:00:00.000Z |
5 | 2011-05-13T00:00:00.000Z | 2012-05-19T00:00:00.000Z |
模式(PostgreSQL v13)
-- Sample fixture: one row per version together with its validity window.
-- "Version" is a quoted identifier, so it keeps its capital V and has to be
-- quoted the same way wherever it is referenced.
CREATE TABLE my_dates (
    "Version"  INTEGER,
    start_date DATE,
    end_date   DATE
);

-- Typed literals are used (integers unquoted, DATE '...') so the insert does
-- not depend on implicit text-to-integer / text-to-date coercion.
-- Versions 1-4 touch or overlap one another; version 5 starts after a gap.
INSERT INTO my_dates ("Version", start_date, end_date)
VALUES
    (1, DATE '2005-11-23', DATE '2005-11-23'),
    (2, DATE '2005-11-23', DATE '2005-11-23'),
    (3, DATE '2005-11-23', DATE '2008-10-23'),
    (4, DATE '2008-10-23', DATE '2010-05-18'),
    (5, DATE '2011-05-13', DATE '2012-05-19');
查询#1
-- Collapse runs of overlapping versions in my_dates into one row per run and
-- return the dates as text (plain YYYY-MM-DD, without a time component).
-- Output: "Version" and start_date of the row that opens each run, plus the
-- latest end_date found among the rows of that run.
--
-- Changes from the earlier form of this query:
--   * the closing row was looked up with next_version - 1 = "Version", which
--     only works when version numbers are consecutive and never extended the
--     final run (its next_version is NULL, so nothing joined);
--   * end_date is now the MAX() over the run instead of the value on the
--     run's last row.
--
-- Known limitation: each row is compared only with the row immediately before
-- it (LAG), so an overlap with an earlier, longer-lasting row is not detected.
WITH my_overlaps AS (
    -- overlap is NULL for the first row, TRUE when the previous row's end_date
    -- reaches this row's start_date, FALSE otherwise.
    SELECT
        "Version",
        start_date,
        end_date,
        LAG(end_date) OVER (ORDER BY "Version") >= start_date AS overlap
    FROM my_dates
),
selected AS (
    -- Keep only the rows that open a run. next_version is the "Version" that
    -- opens the following run, or NULL for the final run.
    SELECT
        "Version",
        start_date,
        end_date,
        LEAD("Version") OVER (ORDER BY "Version") AS next_version
    FROM my_overlaps
    WHERE overlap IS NOT TRUE
)
SELECT
    s."Version",
    s.start_date::text AS start_date,
    COALESCE(MAX(md.end_date), s.end_date)::text AS end_date
FROM selected AS s
LEFT JOIN my_dates AS md
    -- every row from the run's opening version up to (not including) the next run
    ON md."Version" >= s."Version"
    AND (s.next_version IS NULL OR md."Version" < s.next_version)
GROUP BY
    s."Version",
    s.start_date,
    s.end_date
ORDER BY s."Version";
Version | start_date | end_date |
---|---|---|
1 | 2005-11-23 | 2010-05-18 |
5 | 2011-05-13 | 2012-05-19 |
更新 1:Lag/Lead 函数现在指定了默认值。
模式(PostgreSQL v13)
-- Two-row fixture: both versions start on the same day, and version 2 runs
-- far longer than version 1, so the two rows form a single overlapping run.
-- "Version" is a quoted identifier and must be quoted wherever it is used.
CREATE TABLE my_dates (
    "Version"  INTEGER,
    start_date DATE,
    end_date   DATE
);

-- Typed literals are used (integers unquoted, DATE '...') so the insert does
-- not depend on implicit text-to-integer / text-to-date coercion.
INSERT INTO my_dates ("Version", start_date, end_date)
VALUES
    (1, DATE '2005-11-23', DATE '2005-11-23'),
    (2, DATE '2005-11-23', DATE '2012-05-19');
查询#1
-- Collapse runs of overlapping versions in my_dates into one row per run,
-- using LAG/LEAD with explicit defaults, and return the dates as text.
--
-- Changes from the earlier form of this query:
--   * a stray ';' in front of ORDER BY ended the statement early and left the
--     ORDER BY clause as a syntax error;
--   * the LEAD default was the literal 3, which is only "one past the last
--     version" when the table holds exactly versions 1 and 2; it is now
--     derived from the data;
--   * the closing row was looked up with next_version - 1 = "Version", which
--     requires consecutive version numbers; the join now spans the whole run
--     and takes MAX(end_date).
--
-- Known limitation: each row is compared only with the row immediately before
-- it (LAG), so an overlap with an earlier, longer-lasting row is not detected.
WITH my_overlaps AS (
    -- overlap is NULL for the first row (LAG default), TRUE when the previous
    -- row's end_date reaches this row's start_date, FALSE otherwise.
    SELECT
        "Version",
        start_date,
        end_date,
        LAG(end_date, 1, NULL) OVER (ORDER BY "Version") >= start_date AS overlap
    FROM my_dates
),
version_bounds AS (
    -- Sentinel used as the LEAD default: one past the highest version.
    SELECT MAX("Version") + 1 AS past_last_version
    FROM my_dates
),
selected AS (
    -- Keep only the rows that open a run. next_version is the "Version" that
    -- opens the following run, or the sentinel for the final run.
    SELECT
        o."Version",
        o.start_date,
        o.end_date,
        LEAD(o."Version", 1, b.past_last_version)
            OVER (ORDER BY o."Version") AS next_version
    FROM my_overlaps AS o
    CROSS JOIN version_bounds AS b
    WHERE o.overlap IS NOT TRUE
)
SELECT
    s."Version",
    s.start_date::text AS start_date,
    COALESCE(MAX(md.end_date), s.end_date)::text AS end_date
FROM selected AS s
LEFT JOIN my_dates AS md
    -- every row from the run's opening version up to (not including) the next run
    ON md."Version" >= s."Version"
    AND md."Version" < s.next_version
GROUP BY
    s."Version",
    s.start_date,
    s.end_date
ORDER BY s."Version";
Version | start_date | end_date |
---|---|---|
1 | 2005-11-23 | 2012-05-19 |
有原始数据集 模式(PostgreSQL v13)
-- Sample fixture: one row per version together with its validity window.
-- "Version" is a quoted identifier, so it keeps its capital V and has to be
-- quoted the same way wherever it is referenced.
CREATE TABLE my_dates (
    "Version"  INTEGER,
    start_date DATE,
    end_date   DATE
);

-- Typed literals are used (integers unquoted, DATE '...') so the insert does
-- not depend on implicit text-to-integer / text-to-date coercion.
-- Versions 1-4 touch or overlap one another; version 5 starts after a gap.
INSERT INTO my_dates ("Version", start_date, end_date)
VALUES
    (1, DATE '2005-11-23', DATE '2005-11-23'),
    (2, DATE '2005-11-23', DATE '2005-11-23'),
    (3, DATE '2005-11-23', DATE '2008-10-23'),
    (4, DATE '2008-10-23', DATE '2010-05-18'),
    (5, DATE '2011-05-13', DATE '2012-05-19');
查询#1
-- Collapse runs of overlapping versions in my_dates into one row per run,
-- using LAG/LEAD with explicit defaults, and return the dates as text.
--
-- Changes from the earlier form of this query:
--   * the LEAD default was the literal 3. For the final run that made
--     next_version - 1 equal to 2, so the run was joined to version 2 and
--     reported that row's end_date (2005-11-23 for version 5 of the five-row
--     sample) instead of its own. The default is now derived from the data.
--   * the closing row was looked up with next_version - 1 = "Version", which
--     requires consecutive version numbers; the join now spans the whole run
--     and takes MAX(end_date).
-- For the five-row sample the result is
--   (1, 2005-11-23, 2010-05-18) and (5, 2011-05-13, 2012-05-19).
--
-- Known limitation: each row is compared only with the row immediately before
-- it (LAG), so an overlap with an earlier, longer-lasting row is not detected.
WITH my_overlaps AS (
    -- overlap is NULL for the first row (LAG default), TRUE when the previous
    -- row's end_date reaches this row's start_date, FALSE otherwise.
    SELECT
        "Version",
        start_date,
        end_date,
        LAG(end_date, 1, NULL) OVER (ORDER BY "Version") >= start_date AS overlap
    FROM my_dates
),
version_bounds AS (
    -- Sentinel used as the LEAD default: one past the highest version.
    SELECT MAX("Version") + 1 AS past_last_version
    FROM my_dates
),
selected AS (
    -- Keep only the rows that open a run. next_version is the "Version" that
    -- opens the following run, or the sentinel for the final run.
    SELECT
        o."Version",
        o.start_date,
        o.end_date,
        LEAD(o."Version", 1, b.past_last_version)
            OVER (ORDER BY o."Version") AS next_version
    FROM my_overlaps AS o
    CROSS JOIN version_bounds AS b
    WHERE o.overlap IS NOT TRUE
)
SELECT
    s."Version",
    s.start_date::text AS start_date,
    COALESCE(MAX(md.end_date), s.end_date)::text AS end_date
FROM selected AS s
LEFT JOIN my_dates AS md
    -- every row from the run's opening version up to (not including) the next run
    ON md."Version" >= s."Version"
    AND md."Version" < s.next_version
GROUP BY
    s."Version",
    s.start_date,
    s.end_date
ORDER BY s."Version";
Version | start_date | end_date |
---|---|---|
1 | 2005-11-23 | 2010-05-18 |
5 | 2011-05-13 | 2005-11-23 |
处理此问题最安全的方法是使用累积最大值,而不是 lag()——前提是可以对行建立稳定的排序(version 列可以提供这种排序)。
-- Merge overlapping date ranges in t into one row per group, using a running
-- maximum of end_date rather than the previous row's end_date.
-- t is read for the columns version, start_date and end_date.
--
-- Fix: the merged range must end at the latest end_date in the group, so the
-- third column is MAX(end_date); the earlier MIN(end_date) returned the
-- earliest end instead. Result columns are now named explicitly.
WITH with_running_max AS (
    -- Latest end_date among all rows that sort before the current one;
    -- NULL for the first row.
    SELECT
        version,
        start_date,
        end_date,
        MAX(end_date) OVER (
            ORDER BY start_date, version
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS prev_max_end_date
    FROM t
),
with_group AS (
    -- A row opens a new group when no earlier row reaches its start_date
    -- (this includes the first row, where the comparison yields NULL).
    -- The running count of such rows is the group number.
    SELECT
        version,
        start_date,
        end_date,
        SUM(CASE WHEN prev_max_end_date >= start_date THEN 0 ELSE 1 END) OVER (
            ORDER BY start_date, version
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS grp
    FROM with_running_max
)
SELECT
    MIN(version)    AS version,
    MIN(start_date) AS start_date,
    MAX(end_date)   AS end_date
FROM with_group
GROUP BY grp
ORDER BY grp;
这应该适用于任何(合理的)数据库。这里是一个 db<>fiddle 示例,它恰好使用 Postgres。
lag()/lead() 方法的问题在于:与之前某一行的重叠不一定发生在紧邻的“前一”行上。例如,考虑下图(小写字母表示开始,大写字母表示结束):
---a----b--B----c--C----d--D--e---A--E--
E 与 A 重叠。但是,按照“前一行”的任何合理定义,A 都不是 E 的前一行。