如何填写缺失的日期

Question

我有一个table如下：

-- One row per user per day on which something was sent.
-- Note: no "as" before the column list; "create table ... as" expects a query (CTAS).
create table users (
    sent_at    date,
    user_email varchar(128),
    score      int,
    quota      int
);

-- Sample data: two rows for the same user with a two-day gap (01-16, 01-17) between them.
-- The target columns are listed explicitly so the insert does not depend on column order,
-- and "union all" is used because the rows are distinct and need no de-duplication.
insert into users (sent_at, user_email, score, quota)
select date '2022-01-18', 'user@email.com', 50, 200
union all
select date '2022-01-15', 'user@email.com', 34, 400;

我希望得到如下的结果集：

-- Desired result, listed newest date first.
-- 2022-01-17 and 2022-01-16 are the missing dates: their score is 0 and their
-- quota is copied from the line listed just before them (200).
select '2022-01-19', 'user@email.com', 85, 100
union
select '2022-01-18', 'user@email.com', 50, 200
union
select '2022-01-17', 'user@email.com', 0, 200
union
select '2022-01-16', 'user@email.com', 0, 200
union
select '2022-01-15', 'user@email.com', 34, 400;

我已经有一个包含所有日期的 table，例如schema.dates。 table 中有成千上万的用户需要我填补这些空白。我尝试了一堆解决方案，但没有任何效果。我怎样才能做到这一点？

Answer 1

这是一次尝试。在只有一个用户的情况下，它确实有效；要确认它是否也适用于多个用户，我需要包含多个用户、且各自 sent_at 日期不同的示例数据。

WITH
-- Sample stand-in for the users table: the two inserted rows plus one extra
-- row dated 2022-01-19. Do not include this CTE when querying the real table.
users (sent_at, user_email, score, quota) AS (
            SELECT DATE '2022-01-19', 'user@email.com', 85, 100
  UNION ALL SELECT DATE '2022-01-18', 'user@email.com', 50, 200
  UNION ALL SELECT DATE '2022-01-15', 'user@email.com', 34, 400
)
,
-- Sample slice of the existing calendar table. Do not include it either.
calendar (dt) AS (
            SELECT DATE '2022-01-14'
  UNION ALL SELECT DATE '2022-01-15'
  UNION ALL SELECT DATE '2022-01-16'
  UNION ALL SELECT DATE '2022-01-17'
  UNION ALL SELECT DATE '2022-01-18'
  UNION ALL SELECT DATE '2022-01-19'
  UNION ALL SELECT DATE '2022-01-20'
)
-- The real query starts here; replace the following comma with "WITH".
,
-- First and last date present in users; used below to trim the calendar.
limits AS (
  SELECT
    MIN(sent_at) AS mindt
  , MAX(sent_at) AS maxdt
  FROM users
)
,
-- One row per calendar date. Dates without a users row come back with a
-- NULL user_email and quota, and with score defaulted to 0.
-- NOTE: the join is on the date only, so this handles a single user.
gaps_filled AS (
  SELECT
    calendar.dt               AS sent_at
  , users.user_email
  , COALESCE(users.score, 0)  AS score
  , users.quota
  FROM calendar
  LEFT JOIN users
    ON calendar.dt = users.sent_at
)
-- Fill the NULL user_email / quota of each gap row with the value of the
-- closest later date: walking the dates in descending order, take the last
-- non-NULL value seen so far.
-- The frame is spelled out because engines disagree on the default frame of
-- LAST_VALUE (Snowflake, for one, uses the whole window), which would return
-- the oldest row's value for every row.
-- The windows have no PARTITION BY, which again limits this to a single user.
SELECT
  gaps_filled.sent_at AS sent_at
, LAST_VALUE(gaps_filled.user_email IGNORE NULLS)
    OVER (
      ORDER BY gaps_filled.sent_at DESC
      ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS user_email
, gaps_filled.score AS score
, LAST_VALUE(gaps_filled.quota IGNORE NULLS)
    OVER (
      ORDER BY gaps_filled.sent_at DESC
      ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS quota
FROM gaps_filled
CROSS JOIN limits  -- single-row table, only supplies the two filter dates
WHERE gaps_filled.sent_at BETWEEN limits.mindt AND limits.maxdt
ORDER BY user_email, sent_at DESC;
-- out Null display is "(null)".
-- out   sent_at   |   user_email   | score | quota 
-- out ------------+----------------+-------+-------
-- out  2022-01-19 | user@email.com |    85 |   100
-- out  2022-01-18 | user@email.com |    50 |   200
-- out  2022-01-17 | user@email.com |     0 |   200
-- out  2022-01-16 | user@email.com |     0 |   200
-- out  2022-01-15 | user@email.com |    34 |   400

Answer 2

这是一个可行的查询。首先交叉连接日期和用户，得到所有组合（可根据需要添加过滤条件），然后左连接用户 table，并使用 last_value() 函数计算配额（请注意，如果您使用的是 Snowflake，则必须按照其文档的说明，显式指定 `rows between unbounded preceding and current row`）：

with date_user_grid as (
    -- every date of the dates table paired with every distinct user e-mail
    select
        dates.date,
        distinct_users.user
    from dates
    cross join (
        select distinct user_email as user
        from users
    ) distinct_users
),
grid_with_scores as (
    -- attach the score and quota of the matching users row, if there is one;
    -- grid rows without a match keep NULL in both columns
    select
        g.date,
        g.user,
        u.score,
        u.quota
    from date_user_grid g
    left join users u
        on u.sent_at = g.date
        and u.user_email = g.user
)
-- score defaults to 0; quota is the last non-NULL quota met while walking each
-- user's dates from newest to oldest, up to and including the current row
select
    date,
    user,
    coalesce(score, 0) as score,
    last_value(quota) ignore nulls over (
        partition by user
        order by date desc
        rows between unbounded preceding and current row
    ) as quota
from grid_with_scores
order by date desc;

Answer 3

Maja 的解决方案使用窗口函数（WINDOW FUNCTION）LAST_VALUE，效果很好；Snowflake 还有另一种更简短的写法，即 LAG(...) IGNORE NULLS OVER (...) 形式。

WITH dates AS (
    -- Stand-in for the real calendar table: 50 consecutive days.
    -- ROW_NUMBER() starts at 1, so the first generated day is 2022-01-02.
    SELECT DATEADD(day, ROW_NUMBER() OVER (ORDER BY TRUE), '2022-01-01'::date) AS date
    FROM TABLE(generator(ROWCOUNT => 50))
), new_data AS (
    -- Stand-in for the real users table. The date strings are cast once here so
    -- that every later comparison is DATE to DATE instead of an implicit coercion.
    SELECT
        v.date::date AS date,
        v.user_email,
        v.score,
        v.quota
    FROM VALUES
        ('2022-01-19', 'user@email.com', 85, 100),
        ('2022-01-18', 'user@email.com', 50, 200),
        ('2022-01-15', 'user@email.com', 34, 400)
        AS v (date, user_email, score, quota)
), users_ranges AS (
    -- first and last date with data, per user
    SELECT
        user_email,
        MIN(date) AS min_date,
        MAX(date) AS max_date
    FROM new_data
    GROUP BY user_email
), dates_for_users AS (
    -- one row per user and calendar day inside that user's own date range
    SELECT
        d.date,
        u.user_email
    FROM users_ranges AS u
    INNER JOIN dates AS d
        ON d.date BETWEEN u.min_date AND u.max_date
)
-- Left join the data back onto the gap-free grid: score defaults to 0, and a
-- missing quota is taken from the nearest non-NULL quota that LAG finds.
-- NOTE: the window is ordered by ascending date, so a gap row inherits the
-- quota of the closest EARLIER date (2022-01-16/17 get 400 from 2022-01-15).
-- Order the window by du.date DESC to take it from the closest later date
-- instead (200 from 2022-01-18), which is what the question's expected
-- result shows.
SELECT
    du.date,
    du.user_email,
    ZEROIFNULL(u.score) AS score,
    NVL(
        u.quota,
        LAG(u.quota) IGNORE NULLS OVER (PARTITION BY du.user_email ORDER BY du.date)
    ) AS quota
FROM dates_for_users AS du
LEFT JOIN new_data AS u
    ON du.date = u.date
    AND du.user_email = u.user_email
ORDER BY du.date DESC, du.user_email;

给出：

DATE	USER_EMAIL	SCORE	QUOTA
2022-01-19	user@email.com	85	100
2022-01-18	user@email.com	50	200
2022-01-17	user@email.com	0	400
2022-01-16	user@email.com	0	400
2022-01-15	user@email.com	34	400

按照你问题中的描述，你只给出了两行数据，却希望得到一个更大的日期范围；不过我认为这并不是问题的核心，你真正想要的只是 LAG/LAST_VALUE 这部分逻辑。

上面的 dates 和 new_data 只是实际 table 的替身。users_ranges 用于找出每个用户的日期范围，dates_for_users 再用这个范围去连接 dates table。这样每个用户的所有日期间隙都已补齐，接着左连接到 new_data 以取得分数（为 NULL 时用 ZEROIFNULL 置为 0），并用 LAG 在忽略 NULL 的情况下取得先前的 QUOTA；外层的 NVL 保证仅在当前行没有值时才使用 LAG 的值（这里也可以用 COALESCE）。

如何填写缺失的日期

How to fill in missing dates

sql

snowflake-schema

snowflake-cloud-data-platform