逐列获取最后一个非空值,其中列按日期排序
Get last non null value columnwise where column is sorted by date
-- Show the sample input rows (edate, userid, status).
SELECT
    edate,
    userid,
    status
FROM example;
edate userid status
2022-05-01 abc123 true
2022-05-02 abc123 (null)
2022-05-03 abc123 (null)
2022-05-04 abc123 (null)
2022-05-05 abc123 false
2022-05-06 abc123 (null)
2022-05-07 abc123 (null)
2022-05-08 abc123 (null)
2022-05-09 abc123 true
2022-05-10 abc123 (null)
我想写一个新字段,'status_backfilled' 基于 userId 的最新数据点。
在示例数据中,用户状态在 5 月 1 日为 true,然后直到 5 月 5 日为 null。因此,我希望新字段在 5 月 1 日至 5 月 4 日之间为真。然后状态切换为 false。这个值直到 5 月 9 日都没有变化,所以我想要在 5 月 5 日到 8 日之间为 false,然后再次为 true。
期望的输出:
-- Show the desired output rows (edate, userid, status_backfilled).
SELECT
    edate,
    userid,
    status_backfilled
FROM example_desired;
edate userid status_backfilled
2022-05-01 abc123 true
2022-05-02 abc123 true
2022-05-03 abc123 true
2022-05-04 abc123 true
2022-05-05 abc123 false
2022-05-06 abc123 false
2022-05-07 abc123 false
2022-05-08 abc123 false
2022-05-09 abc123 true
2022-05-10 abc123 true
如何按列合并以获得数据已排序的用户的最新非空状态,在本例中是按日期?
您可以通过使用一些 window 函数来实现您想要的结果 -
-- Forward-fill status per user: every row receives the most recent non-null
-- status recorded for the same userid at or before its own edate. Rows that
-- precede a user's first non-null status stay NULL.
WITH grp AS (
    SELECT
        edate,
        userid,
        status,
        -- COUNT(status) ignores NULLs, so this running count only advances on
        -- rows that carry a value; each NULL row therefore shares a group id
        -- with the last non-null row before it.
        -- PARTITION BY userid keeps one user's status from leaking into
        -- another user's rows, and the explicit ROWS frame avoids the default
        -- RANGE frame, which would treat rows with equal edate as peers.
        COUNT(status) OVER (
            PARTITION BY userid
            ORDER BY edate
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS grp_id
    FROM example
)
SELECT
    edate,
    userid,
    -- Within a (userid, grp_id) group the earliest row is the one that holds
    -- the non-null status (or NULL for the leading group with grp_id = 0).
    FIRST_VALUE(status) OVER (
        PARTITION BY userid, grp_id
        ORDER BY edate
    ) AS status_backfilled
FROM grp
ORDER BY userid, edate;
实际上,甚至更好:
-- Forward-fill status per user with a lateral lookup: when a row's own status
-- is NULL, take the status of the latest earlier row of the same user that
-- has a non-null status.
SELECT
    e1.edate,
    e1.userid,
    COALESCE(e1.status, t.status) AS status
FROM example AS e1
-- LEFT JOIN LATERAL ... ON TRUE keeps rows for which the subquery finds no
-- earlier non-null status (for example a user's first date). CROSS JOIN
-- LATERAL would silently drop those rows from the result.
LEFT JOIN LATERAL (
    SELECT e2.status
    FROM example AS e2
    WHERE e2.userid = e1.userid
      AND e2.edate < e1.edate
      AND e2.status IS NOT NULL
    ORDER BY e2.edate DESC
    LIMIT 1
) AS t ON TRUE;
这是另一种方式:
-- Forward-fill status per user via a self-join: pair every row with all
-- earlier non-null rows of the same user, rank those candidates from newest
-- to oldest, and keep only the newest one.
WITH known AS (
    -- Rows that actually carry a status value.
    SELECT
        edate,
        userid,
        status
    FROM example
    WHERE status IS NOT NULL
),
ranked AS (
    SELECT
        cur.edate,
        cur.userid,
        cur.status,
        prev.edate AS s_edate,
        prev.status AS s_status,
        -- rn = 1 marks the closest earlier non-null row; a row with no match
        -- keeps a single NULL-extended candidate thanks to the LEFT JOIN.
        ROW_NUMBER() OVER (
            PARTITION BY cur.userid, cur.edate
            ORDER BY prev.edate DESC
        ) AS rn
    FROM example AS cur
    LEFT JOIN known AS prev
        ON prev.userid = cur.userid
        AND prev.edate < cur.edate
)
SELECT
    edate,
    userid,
    COALESCE(status, s_status) AS status
FROM ranked
WHERE rn = 1
-- Show the sample input rows (edate, userid, status).
SELECT
    edate,
    userid,
    status
FROM example;
edate userid status
2022-05-01 abc123 true
2022-05-02 abc123 (null)
2022-05-03 abc123 (null)
2022-05-04 abc123 (null)
2022-05-05 abc123 false
2022-05-06 abc123 (null)
2022-05-07 abc123 (null)
2022-05-08 abc123 (null)
2022-05-09 abc123 true
2022-05-10 abc123 (null)
我想写一个新字段,'status_backfilled' 基于 userId 的最新数据点。
在示例数据中,用户状态在 5 月 1 日为 true,然后直到 5 月 5 日为 null。因此,我希望新字段在 5 月 1 日至 5 月 4 日之间为真。然后状态切换为 false。这个值直到 5 月 9 日都没有变化,所以我想要在 5 月 5 日到 8 日之间为 false,然后再次为 true。
期望的输出:
-- Show the desired output rows (edate, userid, status_backfilled).
SELECT
    edate,
    userid,
    status_backfilled
FROM example_desired;
edate userid status_backfilled
2022-05-01 abc123 true
2022-05-02 abc123 true
2022-05-03 abc123 true
2022-05-04 abc123 true
2022-05-05 abc123 false
2022-05-06 abc123 false
2022-05-07 abc123 false
2022-05-08 abc123 false
2022-05-09 abc123 true
2022-05-10 abc123 true
如何按列合并以获得数据已排序的用户的最新非空状态,在本例中是按日期?
您可以通过使用一些 window 函数来实现您想要的结果 -
-- Forward-fill status per user: every row receives the most recent non-null
-- status recorded for the same userid at or before its own edate. Rows that
-- precede a user's first non-null status stay NULL.
WITH grp AS (
    SELECT
        edate,
        userid,
        status,
        -- COUNT(status) ignores NULLs, so this running count only advances on
        -- rows that carry a value; each NULL row therefore shares a group id
        -- with the last non-null row before it.
        -- PARTITION BY userid keeps one user's status from leaking into
        -- another user's rows, and the explicit ROWS frame avoids the default
        -- RANGE frame, which would treat rows with equal edate as peers.
        COUNT(status) OVER (
            PARTITION BY userid
            ORDER BY edate
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS grp_id
    FROM example
)
SELECT
    edate,
    userid,
    -- Within a (userid, grp_id) group the earliest row is the one that holds
    -- the non-null status (or NULL for the leading group with grp_id = 0).
    FIRST_VALUE(status) OVER (
        PARTITION BY userid, grp_id
        ORDER BY edate
    ) AS status_backfilled
FROM grp
ORDER BY userid, edate;
实际上,甚至更好:
-- Forward-fill status per user with a lateral lookup: when a row's own status
-- is NULL, take the status of the latest earlier row of the same user that
-- has a non-null status.
SELECT
    e1.edate,
    e1.userid,
    COALESCE(e1.status, t.status) AS status
FROM example AS e1
-- LEFT JOIN LATERAL ... ON TRUE keeps rows for which the subquery finds no
-- earlier non-null status (for example a user's first date). CROSS JOIN
-- LATERAL would silently drop those rows from the result.
LEFT JOIN LATERAL (
    SELECT e2.status
    FROM example AS e2
    WHERE e2.userid = e1.userid
      AND e2.edate < e1.edate
      AND e2.status IS NOT NULL
    ORDER BY e2.edate DESC
    LIMIT 1
) AS t ON TRUE;
这是另一种方式:
-- Forward-fill status per user via a self-join: pair every row with all
-- earlier non-null rows of the same user, rank those candidates from newest
-- to oldest, and keep only the newest one.
WITH known AS (
    -- Rows that actually carry a status value.
    SELECT
        edate,
        userid,
        status
    FROM example
    WHERE status IS NOT NULL
),
ranked AS (
    SELECT
        cur.edate,
        cur.userid,
        cur.status,
        prev.edate AS s_edate,
        prev.status AS s_status,
        -- rn = 1 marks the closest earlier non-null row; a row with no match
        -- keeps a single NULL-extended candidate thanks to the LEFT JOIN.
        ROW_NUMBER() OVER (
            PARTITION BY cur.userid, cur.edate
            ORDER BY prev.edate DESC
        ) AS rn
    FROM example AS cur
    LEFT JOIN known AS prev
        ON prev.userid = cur.userid
        AND prev.edate < cur.edate
)
SELECT
    edate,
    userid,
    COALESCE(status, s_status) AS status
FROM ranked
WHERE rn = 1