基于 Redshift 中的连续标志分组(间隙和孤岛问题)
Group By based on consequtive flag in Redshift (Gaps and Islands problem)
我正在尝试解决“差距和孤岛”并将连续检查组合在一起。我的数据看起来像这样
site_id date_id location_id reservation_id revenue
5 20210101 125 792727 100
5 20210101 126 792728 90
5 20210101 228 792757 200
5 20210102 217 792977 50
5 20210102 218 792978 120
5 20210102 219 792979 100
我想在同一日期和 site_id 内按连续的 location_id 和连续的 reservation_id(两者应该分别是连续的)分组,然后计算收入总和。所以对于上面的例子,输出应该是:
site_id date_id location_id reservation_id revenue
5 20210101 125 792727 190
5 20210101 228 792757 200
5 20210102 217 792977 270
Location_id 和 reservation_id 除了这个特定任务外并不重要,因此这两列的简单 MAX() 或 MIN() 将起作用。
试试这个:
-- Gaps-and-islands: within each (site_id, date_id), collapse runs of rows in
-- which BOTH location_id and reservation_id increase by exactly 1 from the
-- previous row, and sum revenue per run.
with mytable as (
    -- Inline sample rows standing in for the real source table.
    select 5 as site_id, '20210101' as date_id, 125 as location_id, 792727 as reservation_id, 100 as revenue union all
    select 5, '20210101', 126, 792728, 90 union all
    select 5, '20210101', 228, 792757, 200 union all
    select 5, '20210102', 217, 792977, 50 union all
    select 5, '20210102', 218, 792978, 120 union all
    select 5, '20210102', 219, 792979, 100
),
-- is_new_group = 0 only when the previous row (same site and date) is exactly
-- one less in both ids; otherwise 1. The first row of a partition has no
-- predecessor: lag() yields NULL, the comparison is not true, so it gets 1.
flagged as (
    select
        site_id,
        date_id,
        location_id,
        reservation_id,
        revenue,
        case
            when lag(location_id) over (
                     partition by site_id, date_id
                     order by location_id, reservation_id
                 ) = location_id - 1
             and lag(reservation_id) over (
                     partition by site_id, date_id
                     order by location_id, reservation_id
                 ) = reservation_id - 1
            then 0
            else 1
        end as is_new_group
    from mytable
),
-- Running total of the flags gives every island its own number per
-- (site_id, date_id). The explicit ROWS frame makes it a row-by-row total.
numbered as (
    select
        site_id,
        date_id,
        location_id,
        reservation_id,
        revenue,
        sum(is_new_group) over (
            partition by site_id, date_id
            order by location_id, reservation_id
            rows unbounded preceding
        ) as new_group_id
    from flagged
)
select
    site_id,
    date_id,
    min(location_id) as location_id,
    min(reservation_id) as reservation_id,
    sum(revenue) as revenue
from numbered
group by site_id, date_id, new_group_id
order by site_id, date_id, new_group_id
尝试会话化:
两个嵌套查询。首先,计数器在条件为假时为 0,为真时为 1;在我们的例子中,之前的预订 ID 不正好比当前的小 1。
第二个查询查询第一个查询,对之前得到的counter进行运行求和。这给出了一个会话 ID。
然后,根据site id、date id和获取的session id进行分组
-- Sessionization approach to gaps-and-islands: flag each row whose
-- reservation_id does not directly follow the previous one, turn the flags
-- into a running session number, then aggregate revenue per session.
WITH
-- Inline sample rows standing in for the real source table.
indata(site_id,date_id,location_id,reservation_id,revenue) AS (
            SELECT 5,DATE '2021-01-01',125,792727,100
  UNION ALL SELECT 5,DATE '2021-01-01',126,792728,90
  UNION ALL SELECT 5,DATE '2021-01-01',228,792757,200
  UNION ALL SELECT 5,DATE '2021-01-02',217,792977,50
  UNION ALL SELECT 5,DATE '2021-01-02',218,792978,120
  UNION ALL SELECT 5,DATE '2021-01-02',219,792979,100
)
,
-- counter is 1 when the gap to the previous reservation_id (same site,
-- ordered by date then reservation) is greater than 1, else 0. The first row
-- of a site has no predecessor: LAG yields NULL, the comparison is not true,
-- so the row falls through to ELSE 0.
with_counter AS (
  SELECT
    site_id
  , date_id
  , location_id
  , reservation_id
  , revenue
  , CASE
      WHEN reservation_id - LAG(reservation_id) OVER(
             PARTITION BY site_id ORDER BY date_id,reservation_id
           ) > 1
      THEN 1
      ELSE 0
    END AS counter
  FROM indata
)
,
-- Running sum of counter = session number. The frame clause is spelled out
-- because Redshift requires one on an aggregate window function that has an
-- ORDER BY; ROWS UNBOUNDED PRECEDING makes it a row-by-row running total.
with_session AS (
  SELECT
    site_id
  , date_id
  , location_id
  , reservation_id
  , revenue
  , SUM(counter) OVER(
      PARTITION BY site_id ORDER BY date_id,reservation_id
      ROWS UNBOUNDED PRECEDING
    ) AS session_id
  FROM with_counter
  -- test output ...
  -- out  site_id |  date_id   | location_id | reservation_id | revenue | session_id
  -- out ---------+------------+-------------+----------------+---------+------------
  -- out        5 | 2021-01-01 |         125 |         792727 |     100 |          0
  -- out        5 | 2021-01-01 |         126 |         792728 |      90 |          0
  -- out        5 | 2021-01-01 |         228 |         792757 |     200 |          1
  -- out        5 | 2021-01-02 |         217 |         792977 |      50 |          2
  -- out        5 | 2021-01-02 |         218 |         792978 |     120 |          2
  -- out        5 | 2021-01-02 |         219 |         792979 |     100 |          2
)
-- One output row per (site, date, session); date_id stays in the GROUP BY so
-- a session never spans two dates.
SELECT
  site_id
, date_id
, MIN(location_id   ) AS location_id
, MIN(reservation_id) AS reservation_id
, SUM(revenue       ) AS revenue
FROM with_session
GROUP BY
  site_id
, date_id
, session_id
ORDER BY
  site_id
, date_id
, session_id  -- tie-breaker: several sessions can share one date
;
-- out site_id | date_id | location_id | reservation_id | revenue
-- out ---------+------------+-------------+----------------+---------
-- out 5 | 2021-01-01 | 125 | 792727 | 190
-- out 5 | 2021-01-01 | 228 | 792757 | 200
-- out 5 | 2021-01-02 | 217 | 792977 | 270
我正在尝试解决“差距和孤岛”并将连续检查组合在一起。我的数据看起来像这样
site_id date_id location_id reservation_id revenue
5 20210101 125 792727 100
5 20210101 126 792728 90
5 20210101 228 792757 200
5 20210102 217 792977 50
5 20210102 218 792978 120
5 20210102 219 792979 100
我想在同一日期和 site_id 内按连续的 location_id 和连续的 reservation_id(两者应该分别是连续的)分组,然后计算收入总和。所以对于上面的例子,输出应该是:
site_id date_id location_id reservation_id revenue
5 20210101 125 792727 190
5 20210101 228 792757 200
5 20210102 217 792977 270
Location_id 和 reservation_id 除了这个特定任务外并不重要,因此这两列的简单 MAX() 或 MIN() 将起作用。
试试这个:
-- Gaps-and-islands: within each (site_id, date_id), collapse runs of rows in
-- which BOTH location_id and reservation_id increase by exactly 1 from the
-- previous row, and sum revenue per run.
with mytable as (
    -- Inline sample rows standing in for the real source table.
    select 5 as site_id, '20210101' as date_id, 125 as location_id, 792727 as reservation_id, 100 as revenue union all
    select 5, '20210101', 126, 792728, 90 union all
    select 5, '20210101', 228, 792757, 200 union all
    select 5, '20210102', 217, 792977, 50 union all
    select 5, '20210102', 218, 792978, 120 union all
    select 5, '20210102', 219, 792979, 100
),
-- is_new_group = 0 only when the previous row (same site and date) is exactly
-- one less in both ids; otherwise 1. The first row of a partition has no
-- predecessor: lag() yields NULL, the comparison is not true, so it gets 1.
flagged as (
    select
        site_id,
        date_id,
        location_id,
        reservation_id,
        revenue,
        case
            when lag(location_id) over (
                     partition by site_id, date_id
                     order by location_id, reservation_id
                 ) = location_id - 1
             and lag(reservation_id) over (
                     partition by site_id, date_id
                     order by location_id, reservation_id
                 ) = reservation_id - 1
            then 0
            else 1
        end as is_new_group
    from mytable
),
-- Running total of the flags gives every island its own number per
-- (site_id, date_id). The explicit ROWS frame makes it a row-by-row total.
numbered as (
    select
        site_id,
        date_id,
        location_id,
        reservation_id,
        revenue,
        sum(is_new_group) over (
            partition by site_id, date_id
            order by location_id, reservation_id
            rows unbounded preceding
        ) as new_group_id
    from flagged
)
select
    site_id,
    date_id,
    min(location_id) as location_id,
    min(reservation_id) as reservation_id,
    sum(revenue) as revenue
from numbered
group by site_id, date_id, new_group_id
order by site_id, date_id, new_group_id
尝试会话化:
两个嵌套查询。首先,计数器在条件为假时为 0,为真时为 1;在我们的例子中,之前的预订 ID 不正好比当前的小 1。
第二个查询查询第一个查询,对之前得到的counter进行运行求和。这给出了一个会话 ID。
然后,根据site id、date id和获取的session id进行分组
-- Sessionization approach to gaps-and-islands: flag each row whose
-- reservation_id does not directly follow the previous one, turn the flags
-- into a running session number, then aggregate revenue per session.
WITH
-- Inline sample rows standing in for the real source table.
indata(site_id,date_id,location_id,reservation_id,revenue) AS (
            SELECT 5,DATE '2021-01-01',125,792727,100
  UNION ALL SELECT 5,DATE '2021-01-01',126,792728,90
  UNION ALL SELECT 5,DATE '2021-01-01',228,792757,200
  UNION ALL SELECT 5,DATE '2021-01-02',217,792977,50
  UNION ALL SELECT 5,DATE '2021-01-02',218,792978,120
  UNION ALL SELECT 5,DATE '2021-01-02',219,792979,100
)
,
-- counter is 1 when the gap to the previous reservation_id (same site,
-- ordered by date then reservation) is greater than 1, else 0. The first row
-- of a site has no predecessor: LAG yields NULL, the comparison is not true,
-- so the row falls through to ELSE 0.
with_counter AS (
  SELECT
    site_id
  , date_id
  , location_id
  , reservation_id
  , revenue
  , CASE
      WHEN reservation_id - LAG(reservation_id) OVER(
             PARTITION BY site_id ORDER BY date_id,reservation_id
           ) > 1
      THEN 1
      ELSE 0
    END AS counter
  FROM indata
)
,
-- Running sum of counter = session number. The frame clause is spelled out
-- because Redshift requires one on an aggregate window function that has an
-- ORDER BY; ROWS UNBOUNDED PRECEDING makes it a row-by-row running total.
with_session AS (
  SELECT
    site_id
  , date_id
  , location_id
  , reservation_id
  , revenue
  , SUM(counter) OVER(
      PARTITION BY site_id ORDER BY date_id,reservation_id
      ROWS UNBOUNDED PRECEDING
    ) AS session_id
  FROM with_counter
  -- test output ...
  -- out  site_id |  date_id   | location_id | reservation_id | revenue | session_id
  -- out ---------+------------+-------------+----------------+---------+------------
  -- out        5 | 2021-01-01 |         125 |         792727 |     100 |          0
  -- out        5 | 2021-01-01 |         126 |         792728 |      90 |          0
  -- out        5 | 2021-01-01 |         228 |         792757 |     200 |          1
  -- out        5 | 2021-01-02 |         217 |         792977 |      50 |          2
  -- out        5 | 2021-01-02 |         218 |         792978 |     120 |          2
  -- out        5 | 2021-01-02 |         219 |         792979 |     100 |          2
)
-- One output row per (site, date, session); date_id stays in the GROUP BY so
-- a session never spans two dates.
SELECT
  site_id
, date_id
, MIN(location_id   ) AS location_id
, MIN(reservation_id) AS reservation_id
, SUM(revenue       ) AS revenue
FROM with_session
GROUP BY
  site_id
, date_id
, session_id
ORDER BY
  site_id
, date_id
, session_id  -- tie-breaker: several sessions can share one date
;
-- out site_id | date_id | location_id | reservation_id | revenue
-- out ---------+------------+-------------+----------------+---------
-- out 5 | 2021-01-01 | 125 | 792727 | 190
-- out 5 | 2021-01-01 | 228 | 792757 | 200
-- out 5 | 2021-01-02 | 217 | 792977 | 270