SQL 根据条件拆分岛屿
SQL Split Island On Criteria
我有一个 SQL 表,其中包含 From 和 To 两个日期列,数据如下:
Row From To
--------------------------------------------------
1 2017-10-28 00:00:00 2017-10-30 00:00:00
2 2017-10-30 00:00:00 2017-10-31 00:00:00
3 2017-10-31 00:00:00 2017-10-31 07:30:00
4 2017-10-31 14:41:00 2017-10-31 15:14:00
5 2017-10-31 17:13:00 2017-11-01 00:00:00
6 2017-11-01 00:00:00 2017-11-01 23:45:00
7 2017-11-02 03:13:00 2017-11-02 07:56:00
我需要将连续的数据分组到孤岛中。数据不重叠。使用此查询可以轻松完成此操作:
-- Collapse contiguous [From]/[To] rows into islands: one output row per island.
;WITH flagged AS
(
    -- is_island_start = 0 when the row starts at or before the previous row's
    -- [To]; otherwise 1 (this includes the first row, where LAG yields NULL).
    SELECT
        [From],
        [To],
        CASE
            WHEN [From] <= LAG([To]) OVER (ORDER BY [From]) THEN 0
            ELSE 1
        END AS is_island_start
    FROM dbo.DateTable
),
numbered AS
(
    -- The running total of the start flags gives every island its own number.
    SELECT
        [From],
        [To],
        SUM(is_island_start) OVER (ORDER BY [From]) AS island_no
    FROM flagged
),
Islands AS
(
    -- Earliest start and latest end of each island.
    SELECT
        MIN([From]) AS [From],
        MAX([To]) AS [To]
    FROM numbered
    GROUP BY island_no
)
SELECT
    [From],
    [To]
FROM Islands
然后给我这些结果:
From To Rows
-----------------------------------------------------
2017-10-28 00:00:00 2017-10-31 07:30:00 1-3
2017-10-31 14:41:00 2017-10-31 15:14:00 4
2017-10-31 17:13:00 2017-11-01 23:45:00 5-6
2017-11-02 03:13:00 2017-11-02 07:56:00 7
我遇到的问题是:我需要修改查询,使岛屿在累计的记录达到某个总持续时间后就被截断/拆分(cap/split)。这个时长是一个输入值或硬编码值。拆分以整条记录为单位,不会在某条记录的 From-To 范围中间进行拆分。
例如,我需要按 27 小时拆分岛屿。这将给出以下结果:
From To Rows
-----------------------------------------------------
2017-10-29 00:00:00 2017-10-30 00:00:00 1
2017-10-30 00:00:00 2017-10-31 07:30:00 2-3
2017-10-31 17:13:00 2017-11-01 23:45:00 5-6
第一个岛被拆分是因为第 1 行和第 2 行单独创建了 27 小时的时间段。第 4 行和第 7 行不足以创建一个岛,因此将其忽略。
我尝试在内部 select 中使用 lag 函数来计算跨行的 "rolling duration"(滚动持续时间),但它不适用于跨越 2 行以上的岛屿:它只能取到上一行的持续时间,无法把累计值继续向后"carry"(结转)。
-- Attempted rolling duration, kept as an illustration of the limitation:
-- a row that continues an island adds only the immediately preceding row's
-- duration to its own, so islands of three or more rows are not accumulated.
SELECT
    [From],
    [To],
    CASE
        WHEN [From] <= LAG([To]) OVER (ORDER BY [From])
            THEN DATEDIFF(minute, [From], [To])
                 + LAG(DATEDIFF(minute, [From], [To])) OVER (ORDER BY [From])
        ELSE DATEDIFF(minute, [From], [To])
    END AS RollingDuration,
    -- 1 marks the first row of an island, 0 a row that continues one.
    CASE
        WHEN [From] <= LAG([To]) OVER (ORDER BY [From]) THEN 0
        ELSE 1
    END AS StartGroup
FROM dbo.DateTable
我能想到的 "least worst" 方法是 "quirky update"。 (Google这个,老实说不是我编的。)
- http://www.sqlservercentral.com/articles/T-SQL/68467/
- 将数据复制到一个新的 table 中,其中包含一个或多个附加(空白)字段
- 使用 CLUSTERED PRIMARY KEY 确保以正确的顺序更新行
- 使用 UPDATE 和用户变量逐行遍历并存储计算结果
如果出现间隙,或者累计总时长达到 27 小时,我就可以用它开启一个新组。然后照常处理。
-- Working table for the quirky update
----------------------------------------------------------------------
-- group_start : extra column that identifies the group a row belongs to;
--               it starts at the default 0 (implicitly converted to
--               DATETIME) and is overwritten by the update step
-- The clustered primary key (group_start, start) fixes the order in which
-- the rows are visited and is reused by the final grouping step.
-- The key columns are spelled NOT NULL explicitly; SQL Server applies the
-- same nullability to primary-key columns when it is left unspecified.
----------------------------------------------------------------------
CREATE TABLE sample
(
    id          INT,
    start       DATETIME NOT NULL,
    cease       DATETIME,
    group_start DATETIME NOT NULL DEFAULT (0),
    PRIMARY KEY CLUSTERED (group_start, start)
);
-- Load the seven sample ranges; [group_start] is left to its column default.
-- Timestamps use the ISO 8601 'YYYY-MM-DDThh:mm:ss' form, which SQL Server
-- reads the same way for DATETIME under every language / DATEFORMAT session
-- setting (the space-separated form is interpreted per session settings).
INSERT INTO sample (id, start, cease)
VALUES
    (1, '2017-10-28T00:00:00', '2017-10-30T00:00:00'),
    (2, '2017-10-30T00:00:00', '2017-10-31T00:00:00'),
    (3, '2017-10-31T00:00:00', '2017-10-31T07:30:00'),
    (4, '2017-10-31T14:41:00', '2017-10-31T15:14:00'),
    (5, '2017-10-31T17:13:00', '2017-11-01T00:00:00'),
    (6, '2017-11-01T00:00:00', '2017-11-01T23:45:00'),
    (7, '2017-11-02T03:13:00', '2017-11-02T07:56:00')
;
-- Quirky Update
----------------------------------------------------------------------
-- Purpose: stamp every row of [sample] with the start time of the group it
-- belongs to, carrying the current group's start from row to row in the
-- variable @grp_start.
-- Update [group_start] to the start of the current group
--    -> new group if gap since previous row
--    -> new group if previous row took group to 27 hours
--    -> else same group as previous row
-- NOTE(review): the carried value is only meaningful if the rows are visited
-- in (group_start, start) order. This script counts on the clustered primary
-- key and MAXDOP 1 for that; confirm that this ordering may be relied upon.
----------------------------------------------------------------------
-- Seed for the carried group start; the integer 0 is implicitly converted
-- to DATETIME.
DECLARE @grp_start DATETIME = 0;
WITH
lagged AS
(
-- Put the previous row's [cease] (in group_start, start order) beside each row.
SELECT *, LAG(cease) OVER (ORDER BY group_start, start) AS lag_cease FROM sample
)
UPDATE
lagged
SET
-- Double assignment: the CASE result is stored in the column and in the
-- variable, so the next row can compare against it.
-- On the first row lag_cease is NULL, so the "<>" test is not true and the
-- 27-hour test against the seed value decides.
@grp_start
= group_start
= CASE WHEN start <> lag_cease THEN start
WHEN start >= DATEADD(hour, 27, @grp_start) THEN start
ELSE @grp_start END
OPTION
(MAXDOP 1)
;
-- Report the rows of every group that spans at least 12 hours
----------------------------------------------------------------------
-- group_cease : latest [cease] among the rows sharing a [group_start]
-- Groups whose group_cease is less than 12 hours after group_start are
-- filtered out.
----------------------------------------------------------------------
WITH bounded_groups AS
(
    SELECT
        id,
        start,
        cease,
        group_start,
        MAX(cease) OVER (PARTITION BY group_start) AS group_cease
    FROM sample
)
SELECT
    id,
    start,
    cease,
    group_start,
    group_cease
FROM bounded_groups
WHERE DATEADD(hour, 12, group_start) <= group_cease
;
http://dbfiddle.uk/?rdbms=sqlserver_2017&fiddle=1bec5b3fe920c1affd58f23a11e280a0
我有一个 SQL 表,其中包含 From 和 To 两个日期列,数据如下:
Row From To
--------------------------------------------------
1 2017-10-28 00:00:00 2017-10-30 00:00:00
2 2017-10-30 00:00:00 2017-10-31 00:00:00
3 2017-10-31 00:00:00 2017-10-31 07:30:00
4 2017-10-31 14:41:00 2017-10-31 15:14:00
5 2017-10-31 17:13:00 2017-11-01 00:00:00
6 2017-11-01 00:00:00 2017-11-01 23:45:00
7 2017-11-02 03:13:00 2017-11-02 07:56:00
我需要将连续的数据分组到孤岛中。数据不重叠。使用此查询可以轻松完成此操作:
-- Collapse contiguous [From]/[To] rows into islands: one output row per island.
;WITH flagged AS
(
    -- is_island_start = 0 when the row starts at or before the previous row's
    -- [To]; otherwise 1 (this includes the first row, where LAG yields NULL).
    SELECT
        [From],
        [To],
        CASE
            WHEN [From] <= LAG([To]) OVER (ORDER BY [From]) THEN 0
            ELSE 1
        END AS is_island_start
    FROM dbo.DateTable
),
numbered AS
(
    -- The running total of the start flags gives every island its own number.
    SELECT
        [From],
        [To],
        SUM(is_island_start) OVER (ORDER BY [From]) AS island_no
    FROM flagged
),
Islands AS
(
    -- Earliest start and latest end of each island.
    SELECT
        MIN([From]) AS [From],
        MAX([To]) AS [To]
    FROM numbered
    GROUP BY island_no
)
SELECT
    [From],
    [To]
FROM Islands
然后给我这些结果:
From To Rows
-----------------------------------------------------
2017-10-28 00:00:00 2017-10-31 07:30:00 1-3
2017-10-31 14:41:00 2017-10-31 15:14:00 4
2017-10-31 17:13:00 2017-11-01 23:45:00 5-6
2017-11-02 03:13:00 2017-11-02 07:56:00 7
我遇到的问题是:我需要修改查询,使岛屿在累计的记录达到某个总持续时间后就被截断/拆分(cap/split)。这个时长是一个输入值或硬编码值。拆分以整条记录为单位,不会在某条记录的 From-To 范围中间进行拆分。
例如,我需要按 27 小时拆分岛屿。这将给出以下结果:
From To Rows
-----------------------------------------------------
2017-10-29 00:00:00 2017-10-30 00:00:00 1
2017-10-30 00:00:00 2017-10-31 07:30:00 2-3
2017-10-31 17:13:00 2017-11-01 23:45:00 5-6
第一个岛被拆分是因为第 1 行和第 2 行单独创建了 27 小时的时间段。第 4 行和第 7 行不足以创建一个岛,因此将其忽略。
我尝试在内部 select 中使用 lag 函数来计算跨行的 "rolling duration"(滚动持续时间),但它不适用于跨越 2 行以上的岛屿:它只能取到上一行的持续时间,无法把累计值继续向后"carry"(结转)。
-- Attempted rolling duration, kept as an illustration of the limitation:
-- a row that continues an island adds only the immediately preceding row's
-- duration to its own, so islands of three or more rows are not accumulated.
SELECT
    [From],
    [To],
    CASE
        WHEN [From] <= LAG([To]) OVER (ORDER BY [From])
            THEN DATEDIFF(minute, [From], [To])
                 + LAG(DATEDIFF(minute, [From], [To])) OVER (ORDER BY [From])
        ELSE DATEDIFF(minute, [From], [To])
    END AS RollingDuration,
    -- 1 marks the first row of an island, 0 a row that continues one.
    CASE
        WHEN [From] <= LAG([To]) OVER (ORDER BY [From]) THEN 0
        ELSE 1
    END AS StartGroup
FROM dbo.DateTable
我能想到的 "least worst" 方法是 "quirky update"。 (Google这个,老实说不是我编的。)
- http://www.sqlservercentral.com/articles/T-SQL/68467/
- 将数据复制到一个新的 table 中,其中包含一个或多个附加(空白)字段
- 使用 CLUSTERED PRIMARY KEY 确保以正确的顺序更新行
- 使用 UPDATE 和用户变量逐行遍历并存储计算结果
如果出现间隙,或者累计总时长达到 27 小时,我就可以用它开启一个新组。然后照常处理。
-- Working table for the quirky update
----------------------------------------------------------------------
-- group_start : extra column that identifies the group a row belongs to;
--               it starts at the default 0 (implicitly converted to
--               DATETIME) and is overwritten by the update step
-- The clustered primary key (group_start, start) fixes the order in which
-- the rows are visited and is reused by the final grouping step.
-- The key columns are spelled NOT NULL explicitly; SQL Server applies the
-- same nullability to primary-key columns when it is left unspecified.
----------------------------------------------------------------------
CREATE TABLE sample
(
    id          INT,
    start       DATETIME NOT NULL,
    cease       DATETIME,
    group_start DATETIME NOT NULL DEFAULT (0),
    PRIMARY KEY CLUSTERED (group_start, start)
);
-- Load the seven sample ranges; [group_start] is left to its column default.
-- Timestamps use the ISO 8601 'YYYY-MM-DDThh:mm:ss' form, which SQL Server
-- reads the same way for DATETIME under every language / DATEFORMAT session
-- setting (the space-separated form is interpreted per session settings).
INSERT INTO sample (id, start, cease)
VALUES
    (1, '2017-10-28T00:00:00', '2017-10-30T00:00:00'),
    (2, '2017-10-30T00:00:00', '2017-10-31T00:00:00'),
    (3, '2017-10-31T00:00:00', '2017-10-31T07:30:00'),
    (4, '2017-10-31T14:41:00', '2017-10-31T15:14:00'),
    (5, '2017-10-31T17:13:00', '2017-11-01T00:00:00'),
    (6, '2017-11-01T00:00:00', '2017-11-01T23:45:00'),
    (7, '2017-11-02T03:13:00', '2017-11-02T07:56:00')
;
-- Quirky Update
----------------------------------------------------------------------
-- Purpose: stamp every row of [sample] with the start time of the group it
-- belongs to, carrying the current group's start from row to row in the
-- variable @grp_start.
-- Update [group_start] to the start of the current group
--    -> new group if gap since previous row
--    -> new group if previous row took group to 27 hours
--    -> else same group as previous row
-- NOTE(review): the carried value is only meaningful if the rows are visited
-- in (group_start, start) order. This script counts on the clustered primary
-- key and MAXDOP 1 for that; confirm that this ordering may be relied upon.
----------------------------------------------------------------------
-- Seed for the carried group start; the integer 0 is implicitly converted
-- to DATETIME.
DECLARE @grp_start DATETIME = 0;
WITH
lagged AS
(
-- Put the previous row's [cease] (in group_start, start order) beside each row.
SELECT *, LAG(cease) OVER (ORDER BY group_start, start) AS lag_cease FROM sample
)
UPDATE
lagged
SET
-- Double assignment: the CASE result is stored in the column and in the
-- variable, so the next row can compare against it.
-- On the first row lag_cease is NULL, so the "<>" test is not true and the
-- 27-hour test against the seed value decides.
@grp_start
= group_start
= CASE WHEN start <> lag_cease THEN start
WHEN start >= DATEADD(hour, 27, @grp_start) THEN start
ELSE @grp_start END
OPTION
(MAXDOP 1)
;
-- Report the rows of every group that spans at least 12 hours
----------------------------------------------------------------------
-- group_cease : latest [cease] among the rows sharing a [group_start]
-- Groups whose group_cease is less than 12 hours after group_start are
-- filtered out.
----------------------------------------------------------------------
WITH bounded_groups AS
(
    SELECT
        id,
        start,
        cease,
        group_start,
        MAX(cease) OVER (PARTITION BY group_start) AS group_cease
    FROM sample
)
SELECT
    id,
    start,
    cease,
    group_start,
    group_cease
FROM bounded_groups
WHERE DATEADD(hour, 12, group_start) <= group_cease
;
http://dbfiddle.uk/?rdbms=sqlserver_2017&fiddle=1bec5b3fe920c1affd58f23a11e280a0