SQL Server - 使用行号进行分组 - 间隙与孤岛(Gaps and Islands)问题
SQL Server - group by with row number - Gaps and Islands
我有一张表,其中的数据如下:
+-------------+--------------+------------+----------------+
| CustomerSID | StartDateSID | EndDateSID | MarketingOptIn |
+-------------+--------------+------------+----------------+
| 12345 | 20101019 | 20131016 | Y |
| 12345 | 20131017 | 20140413 | Y |
| 12345 | 20140414 | 20140817 | N |
| 12345 | 20140818 | 20141228 | N |
| 12345 | 20141229 | 20150125 | Y |
| 12345 | 20150126 | 0 | Y |
+-------------+--------------+------------+----------------+
我需要在这张表之上创建一个视图,把数据整理成下面的格式:每一行表示标志(MarketingOptIn)连续为 Y 或 N 的一段持续时间。
(EndDateSID 为 0 表示该记录当前仍处于活动状态,因此结束日期应取今天的日期。)
+-------------+--------------+------------+----------------+
| CustomerSID | StartDateSID | EndDateSID | MarketingOptIn |
+-------------+--------------+------------+----------------+
| 12345 | 20101019 | 20140413 | Y |
| 12345 | 20140414 | 20141228 | N |
| 12345 | 20141229 | 20150825 | Y |
+-------------+--------------+------------+----------------+
大多数客户的标志只更改过一次,因此下面的查询对这些客户有效:
-- Collapse each customer's history to one row per MarketingOptIn value.
-- Only correct when a customer's flag changes at most once: separate periods
-- that share the same flag value are merged into a single row.
SELECT
    hist.CustomerSID,
    MIN(hist.StartDateSID) AS StartDate,
    -- EndDateSID = 0 marks the currently active period; substitute today's
    -- date as a yyyymmdd integer (style 112) before taking the maximum.
    MAX(
        COALESCE(
            NULLIF(hist.EndDateSID, 0),
            CONVERT(INT, CONVERT(VARCHAR, GETDATE(), 112))
        )
    ) AS EndDate,
    hist.MarketingOptIn
FROM DWH.DimCustomerHistory AS hist
GROUP BY
    hist.CustomerSID,
    hist.MarketingOptIn
ORDER BY
    hist.CustomerSID,
    hist.MarketingOptIn
对于像上面这样多次更改标志的客户,我怎样才能得到预期的输出?
您可以使用以下查询:
-- Gaps-and-islands: collapse consecutive rows that share the same
-- MarketingOptIn value into one row per customer and island.
SELECT CustomerSID,
       MIN(StartDateSID) AS StartDate,
       -- EndDateSID = 0 marks the currently active period; substitute today's
       -- date as a yyyymmdd integer (style 112) before taking the maximum.
       MAX(ISNULL(NULLIF(EndDateSID, 0),
                  CONVERT(INT, CONVERT(VARCHAR(8), GETDATE(), 112)))) AS EndDate,
       MarketingOptIn
FROM (
    SELECT CustomerSID, StartDateSID, EndDateSID, MarketingOptIn,
           -- Both row numbers must restart for every customer. If the first one
           -- is computed over the whole table, rows of different customers
           -- interleave and the difference no longer stays constant within an
           -- island, so one island would be split into several groups.
           ROW_NUMBER() OVER (PARTITION BY CustomerSID
                              ORDER BY StartDateSID) -
           ROW_NUMBER() OVER (PARTITION BY CustomerSID, MarketingOptIn
                              ORDER BY StartDateSID) AS grp
    FROM DimCustomerHistory ) AS t
-- grp is only unique together with CustomerSID and MarketingOptIn.
GROUP BY CustomerSID, MarketingOptIn, grp
-- CustomerSID is a tie-breaker so the row order is deterministic.
ORDER BY StartDate, CustomerSID
计算字段 grp 用于标识 MarketingOptIn 值相同的连续记录。
在外层查询中利用该字段进行 GROUP BY,就可以像原始查询那样应用 MIN 和 MAX 聚合函数。
这是一个间隙与孤岛(gaps and islands)问题。您需要使用 ROW_NUMBER() 来识别各个孤岛,因此第一步是:
-- Diagnostic query: show both row numbers and their difference (GroupID)
-- for every history row, so the island identifier can be inspected.
SELECT numbered.CustomerSID,
       numbered.StartDateSID,
       numbered.EndDateSID,
       numbered.MarketingOptIn,
       numbered.TotalRowNum,
       numbered.RowNumInGroup,
       -- Constant across consecutive rows that share the same flag value.
       numbered.TotalRowNum - numbered.RowNumInGroup AS GroupID
FROM (
    SELECT CustomerSID,
           StartDateSID,
           EndDateSID,
           MarketingOptIn,
           -- Position of the row within the customer's full history.
           ROW_NUMBER() OVER (PARTITION BY CustomerSID
                              ORDER BY StartDateSID) AS TotalRowNum,
           -- Position of the row among the customer's rows with the same flag.
           ROW_NUMBER() OVER (PARTITION BY CustomerSID, MarketingOptIn
                              ORDER BY StartDateSID) AS RowNumInGroup
    FROM dbo.YourTable
) AS numbered;
输出:
CustomerSID StartDateSID EndDateSID MarketingOptIn TotalRowNum RowNumInGroup GroupID
---------------------------------------------------------------------------------------------------
12345 20101019 20131016 Y 1 1 0
12345 20131017 20140413 Y 2 2 0
12345 20140414 20140817 N 3 1 2
12345 20140818 20141228 N 4 2 2
12345 20141229 20150125 Y 5 3 2
12345 20150126 0 Y 6 4 2
这里的关键是:用每一行在该客户全部记录中的行号,减去它在(客户、标志)分组内的行号,就能得到一个标识符;(GroupID + MarketingOptIn)的组合可以唯一标识每一个孤岛。接下来只需在聚合时按该标识符分组即可:
完整示例
-- Sample data: one customer whose MarketingOptIn flag goes Y -> N -> Y.
-- EndDateSID = 0 marks the currently active period.
DECLARE @T TABLE
(
    CustomerSID    INT,
    StartDateSID   INT,
    EndDateSID     INT,
    MarketingOptIn CHAR(1)
);

INSERT INTO @T (CustomerSID, StartDateSID, EndDateSID, MarketingOptIn)
VALUES (12345, 20101019, 20131016, 'Y'),
       (12345, 20131017, 20140413, 'Y'),
       (12345, 20140414, 20140817, 'N'),
       (12345, 20140818, 20141228, 'N'),
       (12345, 20141229, 20150125, 'Y'),
       (12345, 20150126, 0,        'Y');
-- Step 1: tag every row with an island identifier (GroupID).
-- Step 2: aggregate to one row per customer, flag value and island.
WITH Islands AS
(
    SELECT src.CustomerSID,
           src.StartDateSID,
           src.EndDateSID,
           src.MarketingOptIn,
           -- Row number within the customer minus row number within the
           -- (customer, flag) partition.
           ROW_NUMBER() OVER (PARTITION BY src.CustomerSID
                              ORDER BY src.StartDateSID)
         - ROW_NUMBER() OVER (PARTITION BY src.CustomerSID, src.MarketingOptIn
                              ORDER BY src.StartDateSID) AS GroupID
    FROM @T AS src
)
SELECT isl.CustomerSID,
       MIN(isl.StartDateSID) AS StartDateSID,
       -- An island containing an EndDateSID of 0 is still active, so it ends
       -- today (yyyymmdd integer, style 112).
       CASE
           WHEN MIN(isl.EndDateSID) = 0
               THEN CONVERT(INT, CONVERT(VARCHAR(8), GETDATE(), 112))
           ELSE MAX(isl.EndDateSID)
       END AS EndDateSID,
       isl.MarketingOptIn
FROM Islands AS isl
GROUP BY isl.CustomerSID, isl.MarketingOptIn, isl.GroupID
-- The unqualified StartDateSID refers to the aggregated output column.
ORDER BY isl.CustomerSID, StartDateSID;
我有一张表,其中的数据如下:
+-------------+--------------+------------+----------------+
| CustomerSID | StartDateSID | EndDateSID | MarketingOptIn |
+-------------+--------------+------------+----------------+
| 12345 | 20101019 | 20131016 | Y |
| 12345 | 20131017 | 20140413 | Y |
| 12345 | 20140414 | 20140817 | N |
| 12345 | 20140818 | 20141228 | N |
| 12345 | 20141229 | 20150125 | Y |
| 12345 | 20150126 | 0 | Y |
+-------------+--------------+------------+----------------+
我需要在这张表之上创建一个视图,把数据整理成下面的格式:每一行表示标志(MarketingOptIn)连续为 Y 或 N 的一段持续时间。(EndDateSID 为 0 表示该记录当前仍处于活动状态,因此结束日期应取今天的日期。)
+-------------+--------------+------------+----------------+
| CustomerSID | StartDateSID | EndDateSID | MarketingOptIn |
+-------------+--------------+------------+----------------+
| 12345 | 20101019 | 20140413 | Y |
| 12345 | 20140414 | 20141228 | N |
| 12345 | 20141229 | 20150825 | Y |
+-------------+--------------+------------+----------------+
大多数客户的标志只更改过一次,因此下面的查询对这些客户有效:
-- Collapse each customer's history to one row per MarketingOptIn value.
-- Only correct when a customer's flag changes at most once: separate periods
-- that share the same flag value are merged into a single row.
SELECT
    hist.CustomerSID,
    MIN(hist.StartDateSID) AS StartDate,
    -- EndDateSID = 0 marks the currently active period; substitute today's
    -- date as a yyyymmdd integer (style 112) before taking the maximum.
    MAX(
        COALESCE(
            NULLIF(hist.EndDateSID, 0),
            CONVERT(INT, CONVERT(VARCHAR, GETDATE(), 112))
        )
    ) AS EndDate,
    hist.MarketingOptIn
FROM DWH.DimCustomerHistory AS hist
GROUP BY
    hist.CustomerSID,
    hist.MarketingOptIn
ORDER BY
    hist.CustomerSID,
    hist.MarketingOptIn
对于像上面这样多次更改标志的客户,我怎样才能得到预期的输出?
您可以使用以下查询:
-- Gaps-and-islands: collapse consecutive rows that share the same
-- MarketingOptIn value into one row per customer and island.
SELECT CustomerSID,
       MIN(StartDateSID) AS StartDate,
       -- EndDateSID = 0 marks the currently active period; substitute today's
       -- date as a yyyymmdd integer (style 112) before taking the maximum.
       MAX(ISNULL(NULLIF(EndDateSID, 0),
                  CONVERT(INT, CONVERT(VARCHAR(8), GETDATE(), 112)))) AS EndDate,
       MarketingOptIn
FROM (
    SELECT CustomerSID, StartDateSID, EndDateSID, MarketingOptIn,
           -- Both row numbers must restart for every customer. If the first one
           -- is computed over the whole table, rows of different customers
           -- interleave and the difference no longer stays constant within an
           -- island, so one island would be split into several groups.
           ROW_NUMBER() OVER (PARTITION BY CustomerSID
                              ORDER BY StartDateSID) -
           ROW_NUMBER() OVER (PARTITION BY CustomerSID, MarketingOptIn
                              ORDER BY StartDateSID) AS grp
    FROM DimCustomerHistory ) AS t
-- grp is only unique together with CustomerSID and MarketingOptIn.
GROUP BY CustomerSID, MarketingOptIn, grp
-- CustomerSID is a tie-breaker so the row order is deterministic.
ORDER BY StartDate, CustomerSID
计算字段 grp 用于标识 MarketingOptIn 值相同的连续记录。
在外层查询中利用该字段进行 GROUP BY,就可以像原始查询那样应用 MIN 和 MAX 聚合函数。
这是一个间隙与孤岛(gaps and islands)问题。您需要使用 ROW_NUMBER() 来识别各个孤岛,因此第一步是:
-- Diagnostic query: show both row numbers and their difference (GroupID)
-- for every history row, so the island identifier can be inspected.
SELECT numbered.CustomerSID,
       numbered.StartDateSID,
       numbered.EndDateSID,
       numbered.MarketingOptIn,
       numbered.TotalRowNum,
       numbered.RowNumInGroup,
       -- Constant across consecutive rows that share the same flag value.
       numbered.TotalRowNum - numbered.RowNumInGroup AS GroupID
FROM (
    SELECT CustomerSID,
           StartDateSID,
           EndDateSID,
           MarketingOptIn,
           -- Position of the row within the customer's full history.
           ROW_NUMBER() OVER (PARTITION BY CustomerSID
                              ORDER BY StartDateSID) AS TotalRowNum,
           -- Position of the row among the customer's rows with the same flag.
           ROW_NUMBER() OVER (PARTITION BY CustomerSID, MarketingOptIn
                              ORDER BY StartDateSID) AS RowNumInGroup
    FROM dbo.YourTable
) AS numbered;
输出:
CustomerSID StartDateSID EndDateSID MarketingOptIn TotalRowNum RowNumInGroup GroupID
---------------------------------------------------------------------------------------------------
12345 20101019 20131016 Y 1 1 0
12345 20131017 20140413 Y 2 2 0
12345 20140414 20140817 N 3 1 2
12345 20140818 20141228 N 4 2 2
12345 20141229 20150125 Y 5 3 2
12345 20150126 0 Y 6 4 2
这里的关键是:用每一行在该客户全部记录中的行号,减去它在(客户、标志)分组内的行号,就能得到一个标识符;(GroupID + MarketingOptIn)的组合可以唯一标识每一个孤岛。接下来只需在聚合时按该标识符分组即可:
完整示例
-- Sample data: one customer whose MarketingOptIn flag goes Y -> N -> Y.
-- EndDateSID = 0 marks the currently active period.
DECLARE @T TABLE
(
    CustomerSID    INT,
    StartDateSID   INT,
    EndDateSID     INT,
    MarketingOptIn CHAR(1)
);

INSERT INTO @T (CustomerSID, StartDateSID, EndDateSID, MarketingOptIn)
VALUES (12345, 20101019, 20131016, 'Y'),
       (12345, 20131017, 20140413, 'Y'),
       (12345, 20140414, 20140817, 'N'),
       (12345, 20140818, 20141228, 'N'),
       (12345, 20141229, 20150125, 'Y'),
       (12345, 20150126, 0,        'Y');
-- Step 1: tag every row with an island identifier (GroupID).
-- Step 2: aggregate to one row per customer, flag value and island.
WITH Islands AS
(
    SELECT src.CustomerSID,
           src.StartDateSID,
           src.EndDateSID,
           src.MarketingOptIn,
           -- Row number within the customer minus row number within the
           -- (customer, flag) partition.
           ROW_NUMBER() OVER (PARTITION BY src.CustomerSID
                              ORDER BY src.StartDateSID)
         - ROW_NUMBER() OVER (PARTITION BY src.CustomerSID, src.MarketingOptIn
                              ORDER BY src.StartDateSID) AS GroupID
    FROM @T AS src
)
SELECT isl.CustomerSID,
       MIN(isl.StartDateSID) AS StartDateSID,
       -- An island containing an EndDateSID of 0 is still active, so it ends
       -- today (yyyymmdd integer, style 112).
       CASE
           WHEN MIN(isl.EndDateSID) = 0
               THEN CONVERT(INT, CONVERT(VARCHAR(8), GETDATE(), 112))
           ELSE MAX(isl.EndDateSID)
       END AS EndDateSID,
       isl.MarketingOptIn
FROM Islands AS isl
GROUP BY isl.CustomerSID, isl.MarketingOptIn, isl.GroupID
-- The unqualified StartDateSID refers to the aggregated output column.
ORDER BY isl.CustomerSID, StartDateSID;