如何填写缺失的日期
How to fill in missing dates
我有一个table如下:
-- Sample table from the question: one row per user and day on which a score was sent.
-- A column-definition list must not be introduced with AS (that keyword belongs to
-- CREATE TABLE ... AS SELECT), and the insert names its target columns explicitly.
create table users (
    sent_at date,
    user_email varchar(128),
    score int,
    quota int
);

insert into users (sent_at, user_email, score, quota)
select cast('2022-01-18' as date), 'user@email.com', 50, 200
union all  -- the rows are distinct literals, so no de-duplication pass is needed
select cast('2022-01-15' as date), 'user@email.com', 34, 400;
我希望将结果集设置为:
-- Desired result set, newest date first. Days with no row in users appear with
-- score 0 and the quota of the line above them (the closest later date).
-- NOTE(review): the 2022-01-19 row is not part of the two-row insert shown for
-- users; presumably it already exists in the real table - confirm.
select '2022-01-19' as sent_at, 'user@email.com' as user_email, 85 as score, 100 as quota union all
select '2022-01-18', 'user@email.com', 50, 200 union all
select '2022-01-17', 'user@email.com', 0, 200 union all  -- missing date: score 0, quota from prior line
select '2022-01-16', 'user@email.com', 0, 200 union all  -- missing date: score 0, quota from prior line
select '2022-01-15', 'user@email.com', 34, 400
order by sent_at desc;  -- ISO-formatted date strings sort chronologically
我已经有一个包含所有日期的 table,例如schema.dates。
table 中有成千上万的用户需要我填补这些空白。我尝试了一堆解决方案,但没有任何效果。
我怎样才能做到这一点?
这是一次尝试。在只有一个用户的情况下,它确实有效;我需要包含多个用户、且 sent_at 日期各不相同的示例数据,才能验证它是否适用于多个用户。
WITH
-- your users table after inserting the two rows you insert,
-- and with a previous row dated '2022-01-19' - don't use in query
users(sent_at,user_email,score,quota) AS (
SELECT DATE '2022-01-19','user@email.com',85,100
UNION ALL SELECT DATE '2022-01-18','user@email.com',50,200
UNION ALL SELECT DATE '2022-01-15','user@email.com',34,400
)
,
-- a cutout of the calendar table you mentioned - don't use in query either
calendar(dt) AS (
SELECT DATE '2022-01-14'
UNION ALL SELECT DATE '2022-01-15'
UNION ALL SELECT DATE '2022-01-16'
UNION ALL SELECT DATE '2022-01-17'
UNION ALL SELECT DATE '2022-01-18'
UNION ALL SELECT DATE '2022-01-19'
UNION ALL SELECT DATE '2022-01-20'
)
-- real query starts here, replace following comma with "WITH" ...
,
-- first and last sent_at of EVERY user: gaps are only filled inside that
-- user's own date range, so many users can be handled in one pass
limits AS (
SELECT
  user_email
, MIN(sent_at) AS mindt
, MAX(sent_at) AS maxdt
FROM users
GROUP BY user_email
)
,
-- one row per user and calendar day inside the user's range;
-- score and quota stay NULL on days for which the user has no row
gaps_filled AS (
SELECT
  calendar.dt AS sent_at
, limits.user_email
, users.score
, users.quota
FROM limits
INNER JOIN calendar
  ON calendar.dt BETWEEN limits.mindt AND limits.maxdt
LEFT JOIN users
  ON  users.sent_at    = calendar.dt
  AND users.user_email = limits.user_email
)
-- use LAST_VALUE( .. IGNORE NULLS) to fill the NULL quota gaps: walking each
-- user's days newest-first, take the latest non-NULL quota seen so far.
-- PARTITION BY keeps one user's quota from leaking into another user's rows;
-- the ROWS frame is spelled out so the result does not depend on an
-- engine-specific default window frame.
SELECT
  sent_at
, user_email
, COALESCE(score,0) AS score
, LAST_VALUE(quota IGNORE NULLS) OVER(
    PARTITION BY user_email
    ORDER BY sent_at DESC
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  ) AS quota
FROM gaps_filled
ORDER BY user_email, sent_at DESC;
-- out Null display is "(null)".
-- out sent_at | user_email | score | quota
-- out ------------+----------------+-------+-------
-- out 2022-01-19 | user@email.com | 85 | 100
-- out 2022-01-18 | user@email.com | 50 | 200
-- out 2022-01-17 | user@email.com | 0 | 200
-- out 2022-01-16 | user@email.com | 0 | 200
-- out 2022-01-15 | user@email.com | 34 | 400
这是一个可行的查询。首先对日期和用户做交叉连接,得到所有组合(可根据需要添加过滤条件),然后左连接 users 表,并使用 last_value() 函数计算配额(请注意,如果您使用的是 Snowflake,则必须按照其官方文档的说明,显式指定窗口范围 “rows between unbounded preceding and current row”):
-- Build a (date, user) grid, attach each user's real rows to it, then carry the
-- quota across the days that have no row.
with date_user_grid as (
    -- every date of the dates table paired with every distinct user
    select
        date,
        user
    from dates
    cross join (
        select distinct user_email as user
        from users
    )
),
grid_with_scores as (
    -- bring in the real rows; grid cells without a match keep NULL score/quota
    select
        g.date,
        g.user,
        u.sent_at,
        u.user_email,
        u.score,
        u.quota
    from date_user_grid g
    left join users u
        on u.user_email = g.user
        and u.sent_at = g.date
)
select
    date,
    user,
    nvl(score, 0) as score,
    -- latest non-NULL quota seen so far when walking each user's dates newest-first
    last_value(quota) ignore nulls over (
        partition by user
        order by date desc
        rows between unbounded preceding and current row
    ) as quota
from grid_with_scores
order by date desc;
Maja 的解决方案使用窗口函数 (window function) LAST_VALUE,效果很好;Snowflake 还有另一种更简短的写法,即 LAG ... IGNORE NULLS OVER 的形式。
WITH dates AS (
    -- stand-in calendar: 50 consecutive days, the first one being 2022-01-02
    SELECT DATEADD(day, ROW_NUMBER() OVER(ORDER BY TRUE),'2022-01-01')::date AS date
    FROM TABLE(generator(ROWCOUNT => 50))
), new_data AS (
    -- stand-in for the users table; the date text is converted so that MIN/MAX
    -- and the joins below compare DATE values rather than strings
    SELECT
        TO_DATE(v.date) AS date,
        v.user_email,
        v.score,
        v.quota
    FROM VALUES
        ('2022-01-19','user@email.com',85,100),
        ('2022-01-18','user@email.com',50,200),
        ('2022-01-15','user@email.com',34,400)
        v(date, user_email, score , quota )
), users_ranges AS (
    -- first and last date of every user
    SELECT
        user_email,
        min(date) as min_date,
        max(date) as max_date
    FROM new_data
    GROUP BY user_email
), dates_for_users AS (
    -- one row per user and calendar day inside that user's range
    SELECT d.date,
        u.user_email
    FROM users_ranges AS u
    JOIN dates AS d
        ON d.date between u.min_date and u.max_date
)
SELECT du.date,
    du.user_email,
    ZEROIFNULL(u.score) as score,
    -- The window is ordered newest-first, so a gap day takes the quota of the
    -- closest LATER day that has one (200 for 2022-01-16/17), which is what the
    -- desired result asks for; ordering ascending would pick up 400 instead.
    NVL(u.quota, LAG(u.quota) IGNORE NULLS OVER(partition by du.user_email order by du.date desc)) as quota
FROM dates_for_users AS du
LEFT JOIN new_data AS u
    ON du.date = u.date AND du.user_email = u.user_email
ORDER BY du.date desc, du.user_email;
给出:
DATE | USER_EMAIL | SCORE | QUOTA |
---|---|---|---|
2022-01-19 | user@email.com | 85 | 100 |
2022-01-18 | user@email.com | 50 | 200 |
2022-01-17 | user@email.com | 0 | 200 |
2022-01-16 | user@email.com | 0 | 200 |
2022-01-15 | user@email.com | 34 | 400 |
按照你问题的写法,你只给出了两行数据,却希望得到一个更大的日期范围;但我认为这不是问题的核心,你真正想要的只是 LAG / LAST_VALUE 的逻辑。
示例数据由 dates 和 new_data 提供,它们只是真实表的替身。然后 users_ranges 用于求出每个用户的日期范围,dates_for_users 再用这个范围与 dates 表做连接。这样每个用户的日期空缺就都补齐了,接着左连接到 new_data 取得分数(为 NULL 时取 0),并使用忽略 NULL 的 LAG 取得前一个 QUOTA;外层的 NVL 保证只有在当前行没有值时才使用 LAG 的值(这里也可以用 COALESCE)。
我有一个table如下:
-- Sample table from the question: one row per user and day on which a score was sent.
-- A column-definition list must not be introduced with AS (that keyword belongs to
-- CREATE TABLE ... AS SELECT), and the insert names its target columns explicitly.
create table users (
    sent_at date,
    user_email varchar(128),
    score int,
    quota int
);

insert into users (sent_at, user_email, score, quota)
select cast('2022-01-18' as date), 'user@email.com', 50, 200
union all  -- the rows are distinct literals, so no de-duplication pass is needed
select cast('2022-01-15' as date), 'user@email.com', 34, 400;
我希望将结果集设置为:
-- Desired result set, newest date first. Days with no row in users appear with
-- score 0 and the quota of the line above them (the closest later date).
-- NOTE(review): the 2022-01-19 row is not part of the two-row insert shown for
-- users; presumably it already exists in the real table - confirm.
select '2022-01-19' as sent_at, 'user@email.com' as user_email, 85 as score, 100 as quota union all
select '2022-01-18', 'user@email.com', 50, 200 union all
select '2022-01-17', 'user@email.com', 0, 200 union all  -- missing date: score 0, quota from prior line
select '2022-01-16', 'user@email.com', 0, 200 union all  -- missing date: score 0, quota from prior line
select '2022-01-15', 'user@email.com', 34, 400
order by sent_at desc;  -- ISO-formatted date strings sort chronologically
我已经有一个包含所有日期的 table,例如schema.dates。 table 中有成千上万的用户需要我填补这些空白。我尝试了一堆解决方案,但没有任何效果。 我怎样才能做到这一点?
这是一次尝试。在只有一个用户的情况下,它确实有效;我需要包含多个用户、且 sent_at 日期各不相同的示例数据,才能验证它是否适用于多个用户。
WITH
-- your users table after inserting the two rows you insert,
-- and with a previous row dated '2022-01-19' - don't use in query
users(sent_at,user_email,score,quota) AS (
SELECT DATE '2022-01-19','user@email.com',85,100
UNION ALL SELECT DATE '2022-01-18','user@email.com',50,200
UNION ALL SELECT DATE '2022-01-15','user@email.com',34,400
)
,
-- a cutout of the calendar table you mentioned - don't use in query either
calendar(dt) AS (
SELECT DATE '2022-01-14'
UNION ALL SELECT DATE '2022-01-15'
UNION ALL SELECT DATE '2022-01-16'
UNION ALL SELECT DATE '2022-01-17'
UNION ALL SELECT DATE '2022-01-18'
UNION ALL SELECT DATE '2022-01-19'
UNION ALL SELECT DATE '2022-01-20'
)
-- real query starts here, replace following comma with "WITH" ...
,
-- first and last sent_at of EVERY user: gaps are only filled inside that
-- user's own date range, so many users can be handled in one pass
limits AS (
SELECT
  user_email
, MIN(sent_at) AS mindt
, MAX(sent_at) AS maxdt
FROM users
GROUP BY user_email
)
,
-- one row per user and calendar day inside the user's range;
-- score and quota stay NULL on days for which the user has no row
gaps_filled AS (
SELECT
  calendar.dt AS sent_at
, limits.user_email
, users.score
, users.quota
FROM limits
INNER JOIN calendar
  ON calendar.dt BETWEEN limits.mindt AND limits.maxdt
LEFT JOIN users
  ON  users.sent_at    = calendar.dt
  AND users.user_email = limits.user_email
)
-- use LAST_VALUE( .. IGNORE NULLS) to fill the NULL quota gaps: walking each
-- user's days newest-first, take the latest non-NULL quota seen so far.
-- PARTITION BY keeps one user's quota from leaking into another user's rows;
-- the ROWS frame is spelled out so the result does not depend on an
-- engine-specific default window frame.
SELECT
  sent_at
, user_email
, COALESCE(score,0) AS score
, LAST_VALUE(quota IGNORE NULLS) OVER(
    PARTITION BY user_email
    ORDER BY sent_at DESC
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  ) AS quota
FROM gaps_filled
ORDER BY user_email, sent_at DESC;
-- out Null display is "(null)".
-- out sent_at | user_email | score | quota
-- out ------------+----------------+-------+-------
-- out 2022-01-19 | user@email.com | 85 | 100
-- out 2022-01-18 | user@email.com | 50 | 200
-- out 2022-01-17 | user@email.com | 0 | 200
-- out 2022-01-16 | user@email.com | 0 | 200
-- out 2022-01-15 | user@email.com | 34 | 400
这是一个可行的查询。首先对日期和用户做交叉连接,得到所有组合(可根据需要添加过滤条件),然后左连接 users 表,并使用 last_value() 函数计算配额。请注意,如果您使用的是 Snowflake,则必须按照其官方文档的说明,
指定“无限制的前一行和当前行之间的行”
-- Build a (date, user) grid, attach each user's real rows to it, then carry the
-- quota across the days that have no row.
with date_user_grid as (
    -- every date of the dates table paired with every distinct user
    select
        date,
        user
    from dates
    cross join (
        select distinct user_email as user
        from users
    )
),
grid_with_scores as (
    -- bring in the real rows; grid cells without a match keep NULL score/quota
    select
        g.date,
        g.user,
        u.sent_at,
        u.user_email,
        u.score,
        u.quota
    from date_user_grid g
    left join users u
        on u.user_email = g.user
        and u.sent_at = g.date
)
select
    date,
    user,
    nvl(score, 0) as score,
    -- latest non-NULL quota seen so far when walking each user's dates newest-first
    last_value(quota) ignore nulls over (
        partition by user
        order by date desc
        rows between unbounded preceding and current row
    ) as quota
from grid_with_scores
order by date desc;
Maja 的解决方案使用窗口函数 (window function) LAST_VALUE,效果很好;Snowflake 还有另一种更简短的写法,即 LAG ... IGNORE NULLS OVER 的形式。
WITH dates AS (
    -- stand-in calendar: 50 consecutive days, the first one being 2022-01-02
    SELECT DATEADD(day, ROW_NUMBER() OVER(ORDER BY TRUE),'2022-01-01')::date AS date
    FROM TABLE(generator(ROWCOUNT => 50))
), new_data AS (
    -- stand-in for the users table; the date text is converted so that MIN/MAX
    -- and the joins below compare DATE values rather than strings
    SELECT
        TO_DATE(v.date) AS date,
        v.user_email,
        v.score,
        v.quota
    FROM VALUES
        ('2022-01-19','user@email.com',85,100),
        ('2022-01-18','user@email.com',50,200),
        ('2022-01-15','user@email.com',34,400)
        v(date, user_email, score , quota )
), users_ranges AS (
    -- first and last date of every user
    SELECT
        user_email,
        min(date) as min_date,
        max(date) as max_date
    FROM new_data
    GROUP BY user_email
), dates_for_users AS (
    -- one row per user and calendar day inside that user's range
    SELECT d.date,
        u.user_email
    FROM users_ranges AS u
    JOIN dates AS d
        ON d.date between u.min_date and u.max_date
)
SELECT du.date,
    du.user_email,
    ZEROIFNULL(u.score) as score,
    -- The window is ordered newest-first, so a gap day takes the quota of the
    -- closest LATER day that has one (200 for 2022-01-16/17), which is what the
    -- desired result asks for; ordering ascending would pick up 400 instead.
    NVL(u.quota, LAG(u.quota) IGNORE NULLS OVER(partition by du.user_email order by du.date desc)) as quota
FROM dates_for_users AS du
LEFT JOIN new_data AS u
    ON du.date = u.date AND du.user_email = u.user_email
ORDER BY du.date desc, du.user_email;
给出:
DATE | USER_EMAIL | SCORE | QUOTA |
---|---|---|---|
2022-01-19 | user@email.com | 85 | 100 |
2022-01-18 | user@email.com | 50 | 200 |
2022-01-17 | user@email.com | 0 | 200 |
2022-01-16 | user@email.com | 0 | 200 |
2022-01-15 | user@email.com | 34 | 400 |
按照你问题的写法,你只给出了两行数据,却希望得到一个更大的日期范围;但我认为这不是问题的核心,你真正想要的只是 LAG / LAST_VALUE 的逻辑。
示例数据由 dates 和 new_data 提供,它们只是真实表的替身。然后 users_ranges 用于求出每个用户的日期范围,dates_for_users 再用这个范围与 dates 表做连接。这样每个用户的日期空缺就都补齐了,接着左连接到 new_data 取得分数(为 NULL 时取 0),并使用忽略 NULL 的 LAG 取得前一个 QUOTA;外层的 NVL 保证只有在当前行没有值时才使用 LAG 的值(这里也可以用 COALESCE)。