Microsoft SQL Server 2016,T-SQL:根据各个日期获取数据集的日期范围
Microsoft SQL Server 2016, T-SQL : obtain date range for a dataset based on individual dates
我在 SQL Server 2016 中有一个有趣的情况。我正在使用 T-SQL 语言。
我有一个名为 (#dataset) 的数据集:
名为 ContinuousDates 的最后一列将始终具有连续的日期值,例如 2021 年 1 月 1 日至 2021 年 12 月 31 日。它永远不会有相同 ID 或姓名的重复日期,即给定的一个人一天只能有一行数据。 (在这个例子中,我只显示了一个人,ID = 1 和 Name = X。在我的实际数据中,我有多个人)。
请注意,纽约市出现在数据集中较早的位置,并在最后 4 行重复出现。
我需要根据日期范围获取以下数据集:
我尝试在数据集上直接使用 MIN 和 MAX,但发现有时会得到错误的输出,如下所示:
我使用 RANK() 和 DENSE_RANK() 函数尝试了一些选项,但无法找到解决方案。有人可以为我提供帮助吗?
我在这里附上代码:
-- Demo fixture: one person (ID = 1, Name = 'X') with one row per day.
-- NYC appears twice, as two separate stays (Jan 1-3 and Jan 9-12).
-- Re-runnable: any #dataset left over in the session is dropped first
-- (DROP TABLE IF EXISTS is available from SQL Server 2016).
DROP TABLE IF EXISTS #dataset;

CREATE TABLE #dataset
(
    ID              int,
    Name            varchar(20),
    City            varchar(20),
    ContinuousDates date
);

-- Dates are written as yyyymmdd so they parse identically under every
-- SET LANGUAGE / SET DATEFORMAT setting (m/d/yyyy literals do not).
INSERT INTO #dataset (ID, Name, City, ContinuousDates)
VALUES
    (1, 'X', 'NYC', '20210101'),
    (1, 'X', 'NYC', '20210102'),
    (1, 'X', 'NYC', '20210103'),
    (1, 'X', 'SFO', '20210104'),
    (1, 'X', 'SFO', '20210105'),
    (1, 'X', 'PHY', '20210106'),
    (1, 'X', 'PHY', '20210107'),
    (1, 'X', 'PHY', '20210108'),
    (1, 'X', 'NYC', '20210109'),
    (1, 'X', 'NYC', '20210110'),
    (1, 'X', 'NYC', '20210111'),
    (1, 'X', 'NYC', '20210112');

SELECT
    ID,
    Name,
    City,
    ContinuousDates
FROM #dataset
ORDER BY ContinuousDates;
我有一套新的代码,为了更好的演示:
-- Extended demo fixture: four people, including one (ID = 4) whose City is NULL.
-- Re-runnable: any #dataset left over in the session is dropped first
-- (DROP TABLE IF EXISTS is available from SQL Server 2016).
DROP TABLE IF EXISTS #dataset;

CREATE TABLE #dataset
(
    ID              int,
    Name            varchar(20),
    City            varchar(20),
    ContinuousDates date
);

-- Dates are written as yyyymmdd so they parse identically under every
-- SET LANGUAGE / SET DATEFORMAT setting (m/d/yyyy literals do not).
INSERT INTO #dataset (ID, Name, City, ContinuousDates)
VALUES
    (1, 'X', 'NYC', '20210101'),
    (1, 'X', 'NYC', '20210102'),
    (1, 'X', 'NYC', '20210103'),
    (1, 'X', 'SFO', '20210104'),
    (1, 'X', 'SFO', '20210105'),
    (1, 'X', 'PHY', '20210106'),
    (1, 'X', 'PHY', '20210107'),
    (1, 'X', 'PHY', '20210108'),
    (1, 'X', 'NYC', '20210109'),
    (1, 'X', 'NYC', '20210110'),
    (1, 'X', 'NYC', '20210111'),
    (1, 'X', 'NYC', '20210112'),
    (2, 'Y', 'MEL', '20210113'),
    (3, 'Z', 'SYD', '20210114'),
    (3, 'Z', 'SYD', '20210115'),
    (3, 'Z', 'PER', '20210116'),
    (4, 'A', NULL,  '20210116'),
    (4, 'A', NULL,  '20210117');

SELECT
    ID,
    Name,
    City,
    ContinuousDates
FROM #dataset
ORDER BY ID, ContinuousDates;
这是一个典型的"间隔与岛屿"(gaps-and-islands)问题。
解决方案有很多种,下面是其中比较简单的一种:
- 使用 LAG 找出每个"岛"(即同一城市的一段连续日期)的起始行
- 对起始标记做累计条件计数,从而为每个岛生成一个 ID
- 然后按该 ID(以及其他分区列)分组即可
-- Gaps-and-islands via LAG: collapses each person's run of consecutive days
-- in the same city into one row carrying the first and last date of the run.
WITH Lagged AS (
    -- City of the same person's previous row by date (NULL on the first row).
    SELECT
        ds.ID,
        ds.Name,
        ds.City,
        ds.ContinuousDates,
        PrevCity = LAG(ds.City) OVER (PARTITION BY ds.ID ORDER BY ds.ContinuousDates)
    FROM #dataset AS ds
),
StartPoints AS (
    -- IsStart = 1 on every row whose city differs from the previous row's.
    -- The comparison is spelled out NULL-safely: a plain "<>" yields UNKNOWN
    -- when either side is NULL, which would silently glue a NULL-city day onto
    -- the neighbouring stay (SQL Server 2016 has no IS DISTINCT FROM).
    SELECT
        l.ID,
        l.Name,
        l.City,
        l.ContinuousDates,
        IsStart = CASE
                      WHEN l.PrevCity = l.City                      THEN NULL
                      WHEN l.PrevCity IS NULL AND l.City IS NULL    THEN NULL
                      ELSE 1
                  END
    FROM Lagged AS l
),
Groups AS (
    -- Running count of start markers = island number within the person.
    -- COUNT ignores the NULL markers; ROWS (not the default RANGE) keeps the
    -- frame strictly row-by-row.
    SELECT
        sp.ID,
        sp.Name,
        sp.City,
        sp.ContinuousDates,
        GroupId = COUNT(sp.IsStart) OVER (PARTITION BY sp.ID
                                          ORDER BY sp.ContinuousDates
                                          ROWS UNBOUNDED PRECEDING)
    FROM StartPoints AS sp
)
SELECT
    g.ID,
    g.Name,
    City      = MIN(g.City),            -- every row of an island has the same City
    DateStart = MIN(g.ContinuousDates),
    DateEnd   = MAX(g.ContinuousDates)
FROM Groups AS g
GROUP BY
    g.ID,
    g.Name,
    g.GroupId
ORDER BY
    g.ID,
    DateStart;
解决步骤:
- 在每个 ID 和 Name 的分区内按日期排序编号,得到 row_id
- 在每个 ID、Name 和 City 的分区内按日期排序编号,得到 p_row_id
- 计算 row_id - p_row_id
这样,同一城市的每一段连续日期都会得到相同的差值;不同城市的差值可能相同(见下表中的 PHY 与第二段 NYC),因此该差值需要与 City 一起才能唯一标识一个时间段。
最后只需按该差值、ID、Name 和 City 分组即可。
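ID | Name | City | ContinuousDates | p_row_id | row_id | row_id - p_row_id |
---|---|---|---|---|---|---|
1 | X | NYC | 2021-01-01 | 1 | 1 | 0 |
1 | X | NYC | 2021-01-02 | 2 | 2 | 0 |
1 | X | NYC | 2021-01-03 | 3 | 3 | 0 |
1 | X | SFO | 2021-01-04 | 1 | 4 | 3 |
1 | X | SFO | 2021-01-05 | 2 | 5 | 3 |
1 | X | PHY | 2021-01-06 | 1 | 6 | 5 |
1 | X | PHY | 2021-01-07 | 2 | 7 | 5 |
1 | X | PHY | 2021-01-08 | 3 | 8 | 5 |
1 | X | NYC | 2021-01-09 | 4 | 9 | 5 |
1 | X | NYC | 2021-01-10 | 5 | 10 | 5 |
1 | X | NYC | 2021-01-11 | 6 | 11 | 5 |
1 | X | NYC | 2021-01-12 | 7 | 12 | 5 |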
-- Gaps-and-islands via the row-number difference.
-- For each row: (position within the person) - (position within person + city).
-- The difference is constant across a run of consecutive days in one city.
-- Different cities can share the same difference, so City stays in the
-- GROUP BY together with the difference.
SELECT
    numbered.ID,
    numbered.[Name],
    numbered.City,
    MIN(numbered.ContinuousDates) AS DateStart,
    MAX(numbered.ContinuousDates) AS DateEnd
FROM
(
    SELECT
        src.ID,
        src.[Name],
        src.City,
        src.ContinuousDates,
        ROW_NUMBER() OVER (PARTITION BY src.ID, src.[Name]
                           ORDER BY src.ContinuousDates)
      - ROW_NUMBER() OVER (PARTITION BY src.ID, src.[Name], src.City
                           ORDER BY src.ContinuousDates) AS island_key
    FROM #dataset AS src
) AS numbered
GROUP BY
    numbered.island_key,
    numbered.ID,
    numbered.[Name],
    numbered.City
ORDER BY
    DateStart
多列模板:
-- Template: the row-number-difference technique generalised to several columns.
--   GroupColumn1..N : columns that partition row_id (ID and Name in the
--                     concrete query).
--   Column1..N      : columns added to the p_row_id partition (City in the
--                     concrete query); a run of consecutive rows with the
--                     same values in all of them forms one period.
-- Replace each "-- ,..." placeholder line with any additional columns.
select
     CD.GroupColumn1
    ,CD.GroupColumn2
    -- ,... further GroupColumn entries
    ,CD.Column1
    ,CD.Column2
    ,CD.Column3
    ,CD.Column4
    -- ,... further Column entries
    ,min(CD.ContinuousDates) as DateStart
    ,max(CD.ContinuousDates) as DateEnd
from
(
    select *
        -- position within (GroupColumn..., Column...)
        ,row_number() over(partition by
             CD.GroupColumn1
            ,CD.GroupColumn2
            -- ,... further GroupColumn entries
            ,CD.Column1
            ,CD.Column2
            ,CD.Column3
            ,CD.Column4
            -- ,... further Column entries
            order by CD.ContinuousDates) as p_row_id
        -- position within (GroupColumn...)
        ,row_number() over(partition by
             CD.GroupColumn1
            ,CD.GroupColumn2
            -- ,... further GroupColumn entries
            order by CD.ContinuousDates) as row_id
    from #dataset CD
) CD
group by
     CD.row_id - CD.p_row_id
    ,CD.GroupColumn1
    ,CD.GroupColumn2
    -- ,... further GroupColumn entries
    ,CD.Column1
    ,CD.Column2
    ,CD.Column3
    ,CD.Column4
    -- ,... further Column entries
order by DateStart
我在 SQL Server 2016 中有一个有趣的情况。我正在使用 T-SQL 语言。
我有一个名为 (#dataset) 的数据集:
名为 ContinuousDates 的最后一列将始终具有连续的日期值,例如 2021 年 1 月 1 日至 2021 年 12 月 31 日。它永远不会有相同 ID 或姓名的重复日期,即给定的一个人一天只能有一行数据。 (在这个例子中,我只显示了一个人,ID = 1 和 Name = X。在我的实际数据中,我有多个人)。
请注意,纽约市出现在数据集中较早的位置,并在最后 4 行重复出现。
我需要根据日期范围获取以下数据集:
我尝试在数据集上直接使用 MIN 和 MAX,但发现有时会得到错误的输出,如下所示:
我使用 RANK() 和 DENSE_RANK() 函数尝试了一些选项,但无法找到解决方案。有人可以为我提供帮助吗?
我在这里附上代码:
-- Demo fixture: one person (ID = 1, Name = 'X') with one row per day.
-- NYC appears twice, as two separate stays (Jan 1-3 and Jan 9-12).
-- Re-runnable: any #dataset left over in the session is dropped first
-- (DROP TABLE IF EXISTS is available from SQL Server 2016).
DROP TABLE IF EXISTS #dataset;

CREATE TABLE #dataset
(
    ID              int,
    Name            varchar(20),
    City            varchar(20),
    ContinuousDates date
);

-- Dates are written as yyyymmdd so they parse identically under every
-- SET LANGUAGE / SET DATEFORMAT setting (m/d/yyyy literals do not).
INSERT INTO #dataset (ID, Name, City, ContinuousDates)
VALUES
    (1, 'X', 'NYC', '20210101'),
    (1, 'X', 'NYC', '20210102'),
    (1, 'X', 'NYC', '20210103'),
    (1, 'X', 'SFO', '20210104'),
    (1, 'X', 'SFO', '20210105'),
    (1, 'X', 'PHY', '20210106'),
    (1, 'X', 'PHY', '20210107'),
    (1, 'X', 'PHY', '20210108'),
    (1, 'X', 'NYC', '20210109'),
    (1, 'X', 'NYC', '20210110'),
    (1, 'X', 'NYC', '20210111'),
    (1, 'X', 'NYC', '20210112');

SELECT
    ID,
    Name,
    City,
    ContinuousDates
FROM #dataset
ORDER BY ContinuousDates;
我有一套新的代码,为了更好的演示:
-- Extended demo fixture: four people, including one (ID = 4) whose City is NULL.
-- Re-runnable: any #dataset left over in the session is dropped first
-- (DROP TABLE IF EXISTS is available from SQL Server 2016).
DROP TABLE IF EXISTS #dataset;

CREATE TABLE #dataset
(
    ID              int,
    Name            varchar(20),
    City            varchar(20),
    ContinuousDates date
);

-- Dates are written as yyyymmdd so they parse identically under every
-- SET LANGUAGE / SET DATEFORMAT setting (m/d/yyyy literals do not).
INSERT INTO #dataset (ID, Name, City, ContinuousDates)
VALUES
    (1, 'X', 'NYC', '20210101'),
    (1, 'X', 'NYC', '20210102'),
    (1, 'X', 'NYC', '20210103'),
    (1, 'X', 'SFO', '20210104'),
    (1, 'X', 'SFO', '20210105'),
    (1, 'X', 'PHY', '20210106'),
    (1, 'X', 'PHY', '20210107'),
    (1, 'X', 'PHY', '20210108'),
    (1, 'X', 'NYC', '20210109'),
    (1, 'X', 'NYC', '20210110'),
    (1, 'X', 'NYC', '20210111'),
    (1, 'X', 'NYC', '20210112'),
    (2, 'Y', 'MEL', '20210113'),
    (3, 'Z', 'SYD', '20210114'),
    (3, 'Z', 'SYD', '20210115'),
    (3, 'Z', 'PER', '20210116'),
    (4, 'A', NULL,  '20210116'),
    (4, 'A', NULL,  '20210117');

SELECT
    ID,
    Name,
    City,
    ContinuousDates
FROM #dataset
ORDER BY ID, ContinuousDates;
这是一个典型的"间隔与岛屿"(gaps-and-islands)问题。
解决方案有很多种,下面是其中比较简单的一种:
- 使用 LAG 找出每个"岛"(即同一城市的一段连续日期)的起始行
- 对起始标记做累计条件计数,从而为每个岛生成一个 ID
- 然后按该 ID(以及其他分区列)分组即可
-- Gaps-and-islands via LAG: collapses each person's run of consecutive days
-- in the same city into one row carrying the first and last date of the run.
WITH Lagged AS (
    -- City of the same person's previous row by date (NULL on the first row).
    SELECT
        ds.ID,
        ds.Name,
        ds.City,
        ds.ContinuousDates,
        PrevCity = LAG(ds.City) OVER (PARTITION BY ds.ID ORDER BY ds.ContinuousDates)
    FROM #dataset AS ds
),
StartPoints AS (
    -- IsStart = 1 on every row whose city differs from the previous row's.
    -- The comparison is spelled out NULL-safely: a plain "<>" yields UNKNOWN
    -- when either side is NULL, which would silently glue a NULL-city day onto
    -- the neighbouring stay (SQL Server 2016 has no IS DISTINCT FROM).
    SELECT
        l.ID,
        l.Name,
        l.City,
        l.ContinuousDates,
        IsStart = CASE
                      WHEN l.PrevCity = l.City                      THEN NULL
                      WHEN l.PrevCity IS NULL AND l.City IS NULL    THEN NULL
                      ELSE 1
                  END
    FROM Lagged AS l
),
Groups AS (
    -- Running count of start markers = island number within the person.
    -- COUNT ignores the NULL markers; ROWS (not the default RANGE) keeps the
    -- frame strictly row-by-row.
    SELECT
        sp.ID,
        sp.Name,
        sp.City,
        sp.ContinuousDates,
        GroupId = COUNT(sp.IsStart) OVER (PARTITION BY sp.ID
                                          ORDER BY sp.ContinuousDates
                                          ROWS UNBOUNDED PRECEDING)
    FROM StartPoints AS sp
)
SELECT
    g.ID,
    g.Name,
    City      = MIN(g.City),            -- every row of an island has the same City
    DateStart = MIN(g.ContinuousDates),
    DateEnd   = MAX(g.ContinuousDates)
FROM Groups AS g
GROUP BY
    g.ID,
    g.Name,
    g.GroupId
ORDER BY
    g.ID,
    DateStart;
解决步骤:
- 在每个 ID 和 Name 的分区内按日期排序编号,得到 row_id
- 在每个 ID、Name 和 City 的分区内按日期排序编号,得到 p_row_id
- 计算 row_id - p_row_id
这样,同一城市的每一段连续日期都会得到相同的差值;不同城市的差值可能相同(见下表中的 PHY 与第二段 NYC),因此该差值需要与 City 一起才能唯一标识一个时间段。
最后只需按该差值、ID、Name 和 City 分组即可。
ID | Name | City | ContinuousDates | p_row_id | row_id | row_id - p_row_id |
---|---|---|---|---|---|---|
1 | X | NYC | 2021-01-01 | 1 | 1 | 0 |
1 | X | NYC | 2021-01-02 | 2 | 2 | 0 |
1 | X | NYC | 2021-01-03 | 3 | 3 | 0 |
1 | X | SFO | 2021-01-04 | 1 | 4 | 3 |
1 | X | SFO | 2021-01-05 | 2 | 5 | 3 |
1 | X | PHY | 2021-01-06 | 1 | 6 | 5 |
1 | X | PHY | 2021-01-07 | 2 | 7 | 5 |
1 | X | PHY | 2021-01-08 | 3 | 8 | 5 |
1 | X | NYC | 2021-01-09 | 4 | 9 | 5 |
1 | X | NYC | 2021-01-10 | 5 | 10 | 5 |
1 | X | NYC | 2021-01-11 | 6 | 11 | 5 |
1 | X | NYC | 2021-01-12 | 7 | 12 | 5 |
-- Gaps-and-islands via the row-number difference.
-- For each row: (position within the person) - (position within person + city).
-- The difference is constant across a run of consecutive days in one city.
-- Different cities can share the same difference, so City stays in the
-- GROUP BY together with the difference.
SELECT
    numbered.ID,
    numbered.[Name],
    numbered.City,
    MIN(numbered.ContinuousDates) AS DateStart,
    MAX(numbered.ContinuousDates) AS DateEnd
FROM
(
    SELECT
        src.ID,
        src.[Name],
        src.City,
        src.ContinuousDates,
        ROW_NUMBER() OVER (PARTITION BY src.ID, src.[Name]
                           ORDER BY src.ContinuousDates)
      - ROW_NUMBER() OVER (PARTITION BY src.ID, src.[Name], src.City
                           ORDER BY src.ContinuousDates) AS island_key
    FROM #dataset AS src
) AS numbered
GROUP BY
    numbered.island_key,
    numbered.ID,
    numbered.[Name],
    numbered.City
ORDER BY
    DateStart
多列模板:
-- Template: the row-number-difference technique generalised to several columns.
--   GroupColumn1..N : columns that partition row_id (ID and Name in the
--                     concrete query).
--   Column1..N      : columns added to the p_row_id partition (City in the
--                     concrete query); a run of consecutive rows with the
--                     same values in all of them forms one period.
-- Replace each "-- ,..." placeholder line with any additional columns.
select
     CD.GroupColumn1
    ,CD.GroupColumn2
    -- ,... further GroupColumn entries
    ,CD.Column1
    ,CD.Column2
    ,CD.Column3
    ,CD.Column4
    -- ,... further Column entries
    ,min(CD.ContinuousDates) as DateStart
    ,max(CD.ContinuousDates) as DateEnd
from
(
    select *
        -- position within (GroupColumn..., Column...)
        ,row_number() over(partition by
             CD.GroupColumn1
            ,CD.GroupColumn2
            -- ,... further GroupColumn entries
            ,CD.Column1
            ,CD.Column2
            ,CD.Column3
            ,CD.Column4
            -- ,... further Column entries
            order by CD.ContinuousDates) as p_row_id
        -- position within (GroupColumn...)
        ,row_number() over(partition by
             CD.GroupColumn1
            ,CD.GroupColumn2
            -- ,... further GroupColumn entries
            order by CD.ContinuousDates) as row_id
    from #dataset CD
) CD
group by
     CD.row_id - CD.p_row_id
    ,CD.GroupColumn1
    ,CD.GroupColumn2
    -- ,... further GroupColumn entries
    ,CD.Column1
    ,CD.Column2
    ,CD.Column3
    ,CD.Column4
    -- ,... further Column entries
order by DateStart