用以前的非空值替换空值
Replace nulls with the previous non-null value
我正在使用基于 Presto 0.172 的 Amazon Athena 引擎版本 1。
考虑示例数据集:
id | date_column | col1 |
---|---|---|
1 | 01/03/2021 | NULL |
1 | 02/03/2021 | 1 |
1 | 15/03/2021 | 2 |
1 | 16/03/2021 | NULL |
1 | 17/03/2021 | NULL |
1 | 30/03/2021 | NULL |
1 | 30/03/2021 | 1 |
1 | 31/03/2021 | NULL |
我想把表中的每个 NULL 替换为它之前最近的一个非 NULL 值,即我想得到:
id | date_column | col1 |
---|---|---|
1 | 01/03/2021 | NULL |
1 | 02/03/2021 | 1 |
1 | 15/03/2021 | 2 |
1 | 16/03/2021 | 2 |
1 | 17/03/2021 | 2 |
1 | 30/03/2021 | 1 |
1 | 30/03/2021 | 1 |
1 | 31/03/2021 | 1 |
我原本考虑使用带有 IGNORE NULLS 选项的 lag 函数,但不幸的是,Athena 引擎版本 1 不支持 IGNORE NULLS(基于 Presto 0.217 的 Athena 引擎版本 2 同样不支持)。
如何在不使用 IGNORE NULLS 选项的情况下得到所需的结果?
下面是用于生成示例表的模板查询:
-- Template query: builds the sample table inline and returns it unchanged.
-- The rows cover a leading NULL, a run of consecutive NULLs, and two rows that
-- share the date 2021-03-30 (one NULL, one non-NULL).
WITH source1 AS (
    SELECT
        id
        , date_col
        , col1
    FROM (
        VALUES
            (1, date('2021-03-01'), NULL),
            (1, date('2021-03-02'), 1),
            (1, date('2021-03-15'), 2),
            (1, date('2021-03-16'), NULL),
            (1, date('2021-03-17'), NULL),
            (1, date('2021-03-30'), NULL),
            (1, date('2021-03-30'), 1),
            (1, date('2021-03-31'), NULL)
    ) AS t (id, date_col, col1)
)
SELECT
    id
    , date_col
    , col1
    -- Intended forward-fill, kept commented out because IGNORE NULLS is not
    -- supported on this engine. Where the option is supported, the
    -- null-treatment clause is written before OVER.
    -- , CASE
    --     WHEN col1 IS NOT NULL THEN col1
    --     ELSE lag(col1) IGNORE NULLS OVER (PARTITION BY id ORDER BY date_col)
    -- END AS col1_lag_nulls_ignored
FROM
    source1
ORDER BY
    date_col
在查看了 SO 上的类似问题(here 和 here)之后,我发现以下解决方案适用于所有列类型(包括字符串和日期):
-- Forward-fill NULLs in col1 with the most recent non-NULL value per id,
-- without relying on IGNORE NULLS.
--
-- Sample data: includes two rows on 2021-03-30 (one NULL, one non-NULL) so the
-- same-date tie case described below is actually exercised.
WITH source1 AS (
    SELECT
        id
        , date_col
        , col1
    FROM (
        VALUES
            (1, date('2021-03-01'), NULL),
            (1, date('2021-03-02'), 1),
            (1, date('2021-03-15'), 2),
            (1, date('2021-03-16'), NULL),
            (1, date('2021-03-17'), NULL),
            (1, date('2021-03-30'), NULL),
            (1, date('2021-03-30'), 1),
            (1, date('2021-03-31'), NULL)
    ) AS t (id, date_col, col1)
)
-- Assign a group number to every row: the running count of non-NULL col1
-- values seen so far within the id.
, grouped AS (
    SELECT
        id
        , date_col
        , col1
        -- A row with a value starts a new group; the NULL rows that follow it
        -- (up to the next non-NULL value) share its group number.
        -- The RANGE frame treats rows with the same date_col as peers, so a
        -- NULL row and a non-NULL row on the same date get the same grp.
        , sum(CASE WHEN col1 IS NULL THEN 0 ELSE 1 END) OVER (
            PARTITION BY id
            ORDER BY date_col
            RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS grp
    FROM
        source1
)
SELECT
    id
    , date_col
    , col1
    -- max is used instead of first_value: when several rows share a date and
    -- some of them are NULL, first_value may still return a NULL, whereas max
    -- skips NULLs. Rows before the first non-NULL value (grp = 0) stay NULL.
    , max(col1) OVER (PARTITION BY id, grp ORDER BY date_col) AS col1_filled
    , grp
FROM
    grouped
ORDER BY
    date_col
我正在使用基于 Presto 0.172 的 Amazon Athena 引擎版本 1。
考虑示例数据集:
id | date_column | col1 |
---|---|---|
1 | 01/03/2021 | NULL |
1 | 02/03/2021 | 1 |
1 | 15/03/2021 | 2 |
1 | 16/03/2021 | NULL |
1 | 17/03/2021 | NULL |
1 | 30/03/2021 | NULL |
1 | 30/03/2021 | 1 |
1 | 31/03/2021 | NULL |
我想把表中的每个 NULL 替换为它之前最近的一个非 NULL 值,即我想得到:
id | date_column | col1 |
---|---|---|
1 | 01/03/2021 | NULL |
1 | 02/03/2021 | 1 |
1 | 15/03/2021 | 2 |
1 | 16/03/2021 | 2 |
1 | 17/03/2021 | 2 |
1 | 30/03/2021 | 1 |
1 | 30/03/2021 | 1 |
1 | 31/03/2021 | 1 |
我原本考虑使用带有 IGNORE NULLS 选项的 lag 函数,但不幸的是,Athena 引擎版本 1 不支持 IGNORE NULLS(基于 Presto 0.217 的 Athena 引擎版本 2 同样不支持)。
如何在不使用 IGNORE NULLS 选项的情况下得到所需的结果?
下面是用于生成示例表的模板查询:
-- Template query: builds the sample table inline and returns it unchanged.
-- The rows cover a leading NULL, a run of consecutive NULLs, and two rows that
-- share the date 2021-03-30 (one NULL, one non-NULL).
WITH source1 AS (
    SELECT
        id
        , date_col
        , col1
    FROM (
        VALUES
            (1, date('2021-03-01'), NULL),
            (1, date('2021-03-02'), 1),
            (1, date('2021-03-15'), 2),
            (1, date('2021-03-16'), NULL),
            (1, date('2021-03-17'), NULL),
            (1, date('2021-03-30'), NULL),
            (1, date('2021-03-30'), 1),
            (1, date('2021-03-31'), NULL)
    ) AS t (id, date_col, col1)
)
SELECT
    id
    , date_col
    , col1
    -- Intended forward-fill, kept commented out because IGNORE NULLS is not
    -- supported on this engine. Where the option is supported, the
    -- null-treatment clause is written before OVER.
    -- , CASE
    --     WHEN col1 IS NOT NULL THEN col1
    --     ELSE lag(col1) IGNORE NULLS OVER (PARTITION BY id ORDER BY date_col)
    -- END AS col1_lag_nulls_ignored
FROM
    source1
ORDER BY
    date_col
在查看了 SO 上的类似问题(here 和 here)之后,我发现以下解决方案适用于所有列类型(包括字符串和日期):
-- Forward-fill NULLs in col1 with the most recent non-NULL value per id,
-- without relying on IGNORE NULLS.
--
-- Sample data: includes two rows on 2021-03-30 (one NULL, one non-NULL) so the
-- same-date tie case described below is actually exercised.
WITH source1 AS (
    SELECT
        id
        , date_col
        , col1
    FROM (
        VALUES
            (1, date('2021-03-01'), NULL),
            (1, date('2021-03-02'), 1),
            (1, date('2021-03-15'), 2),
            (1, date('2021-03-16'), NULL),
            (1, date('2021-03-17'), NULL),
            (1, date('2021-03-30'), NULL),
            (1, date('2021-03-30'), 1),
            (1, date('2021-03-31'), NULL)
    ) AS t (id, date_col, col1)
)
-- Assign a group number to every row: the running count of non-NULL col1
-- values seen so far within the id.
, grouped AS (
    SELECT
        id
        , date_col
        , col1
        -- A row with a value starts a new group; the NULL rows that follow it
        -- (up to the next non-NULL value) share its group number.
        -- The RANGE frame treats rows with the same date_col as peers, so a
        -- NULL row and a non-NULL row on the same date get the same grp.
        , sum(CASE WHEN col1 IS NULL THEN 0 ELSE 1 END) OVER (
            PARTITION BY id
            ORDER BY date_col
            RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS grp
    FROM
        source1
)
SELECT
    id
    , date_col
    , col1
    -- max is used instead of first_value: when several rows share a date and
    -- some of them are NULL, first_value may still return a NULL, whereas max
    -- skips NULLs. Rows before the first non-NULL value (grp = 0) stay NULL.
    , max(col1) OVER (PARTITION BY id, grp ORDER BY date_col) AS col1_filled
    , grp
FROM
    grouped
ORDER BY
    date_col