SQL - Vertica:如何使用最近的日期数据生成每日行
SQL - Vertica: How to generate daily rows with most previous date data
我的基础 table 如下所示:
-- Sample base table from the question, written as a CTE fragment
-- (it needs a leading WITH and a following SELECT to run on its own).
-- One row per update date and URL; Score is a string value.
score_upd (Upd_dt,Url,Score) AS (
SELECT DATE '2019-07-26','A','x'
UNION ALL SELECT DATE '2019-07-26','B','alpha'
UNION ALL SELECT DATE '2019-08-01','A','y'
UNION ALL SELECT DATE '2019-08-01','B','beta'
UNION ALL SELECT DATE '2019-08-03','A','z'
UNION ALL SELECT DATE '2019-08-03','B','gamma'
)
Upd_dt URL Score
2019-07-26 A x
2019-07-26 B alpha
2019-08-01 A y
2019-08-01 B beta
2019-08-03 A z
2019-08-03 B gamma
并且我想在 daily-url 级别创建一个 table,使用最近日期的新行值,结果应如下所示:
-- Desired result from the question, in the same CTE-fragment form:
-- one row for every calendar day and URL. Days without an update repeat the
-- score of the latest earlier update (e.g. 07-27 .. 07-31 keep 'x' / 'alpha').
score_upd (Upd_dt,Url,Score) AS (
SELECT DATE '2019-07-26','A','x'
UNION ALL SELECT DATE '2019-07-26','B','alpha'
UNION ALL SELECT DATE '2019-07-27','A','x'
UNION ALL SELECT DATE '2019-07-27','B','alpha'
UNION ALL SELECT DATE '2019-07-28','A','x'
UNION ALL SELECT DATE '2019-07-28','B','alpha'
UNION ALL SELECT DATE '2019-07-29','A','x'
UNION ALL SELECT DATE '2019-07-29','B','alpha'
UNION ALL SELECT DATE '2019-07-30','A','x'
UNION ALL SELECT DATE '2019-07-30','B','alpha'
UNION ALL SELECT DATE '2019-07-31','A','x'
UNION ALL SELECT DATE '2019-07-31','B','alpha'
UNION ALL SELECT DATE '2019-08-01','A','y'
UNION ALL SELECT DATE '2019-08-01','B','beta'
UNION ALL SELECT DATE '2019-08-02','A','y'
UNION ALL SELECT DATE '2019-08-02','B','beta'
UNION ALL SELECT DATE '2019-08-03','A','z'
UNION ALL SELECT DATE '2019-08-03','B','gamma'
UNION ALL SELECT DATE '2019-08-04','A','z'
UNION ALL SELECT DATE '2019-08-04','B','gamma'
UNION ALL SELECT DATE '2019-08-05','A','z'
UNION ALL SELECT DATE '2019-08-05','B','gamma'
)
看起来像:
Upd_dt URL Score
2019-07-26 A x
2019-07-26 B alpha
2019-07-27 A x
2019-07-27 B alpha
2019-07-28 A x
2019-07-28 B alpha
2019-07-29 A x
2019-07-29 B alpha
2019-07-30 A x
2019-07-30 B alpha
2019-07-31 A x
2019-07-31 B alpha
2019-08-01 A y
2019-08-01 B beta
2019-08-02 A y
2019-08-02 B beta
2019-08-03 A z
2019-08-03 B gamma
2019-08-04 A z
2019-08-04 B gamma
2019-08-05 A z
2019-08-05 B gamma
.
.
.
我目前的做法是:
我建立了一个从 2019 年 7 月 26 日到今天的每日维度表:
/*
SELECT CAST(slice_time AS DATE) AS "日期"
FROM test_calendar mtc
TIMESERIES slice_time AS '1 day'
OVER (ORDER BY CAST(mtc.dates AS TIMESTAMP));
*/
所以我得到:
日期
2019-07-26
2019-07-27
2019-07-28
2019-07-29
.
.
.
2019-10-12(今天)
我在想,是否可以用 "interpolate previous value" 之类的功能,按日期把这个日历表与我的第一个表做连接(JOIN),用最近一个较早日期的数据来填充缺失的日期,但没有成功。
结果没有生成缺失天数的行。
如果有人对此有更好的想法,请告诉我。
谢谢!
先提个醒:只有在确实非常必要时才存储“每日快照”(daily photograph)。我以前遇到过每年多出 364 行冗余数据的情况,因为那些值一年只变化一次。在 Vertica 中,这会耗费许可证额度、CPU,以及连接和分组所需的运行时间……
不过除此之外,这是个不错的开始。
但是您可以应用 TIMESERIES 而无需构建日历。
诀窍是:对无法自动插值(INTERPOLATE)的部分,手动进行“外推”(extrapolate)。
添加一个内联的“填充”(padding)表,其中包含每个 URL 的最新值,但日期取 CURRENT_DATE 而不是实际的最新日期——借助 Vertica 特有的分析型 LIMIT 子句 LIMIT 1 OVER(PARTITION BY url ORDER BY upd_dt DESC)。
用 UNION ALL 将该填充表与您的输入合并,然后对合并后的结果应用 TIMESERIES 子句。
像这样:
-- Expand score_upd to one row per url and calendar day, from each url's first
-- update through CURRENT_DATE, repeating a score until the next update.
WITH
-- your input ...
score_upd (Upd_dt,Url,Score) AS (
            SELECT DATE '2019-07-26','A','x'
  UNION ALL SELECT DATE '2019-07-26','B','alpha'
  UNION ALL SELECT DATE '2019-08-01','A','y'
  UNION ALL SELECT DATE '2019-08-01','B','beta'
  UNION ALL SELECT DATE '2019-08-03','A','z'
  UNION ALL SELECT DATE '2019-08-03','B','gamma'
)
-- real WITH clause would start here ...
,
-- Newest row per url, re-dated to CURRENT_DATE, so the series gets an end point
-- at today. The analytic LIMIT keeps the first row of each url partition when
-- ordered by upd_dt descending.
-- The date column gets its own alias (pad_dt) so it can be selected by name and
-- cannot be confused with the upd_dt used inside the OVER() ordering.
-- NOTE(review): assumes at most one row per (url, upd_dt); with ties the row
-- that is kept is not determined by this ordering.
pad_newest AS (
  SELECT
    CURRENT_DATE AS pad_dt
  , url
  , score
  FROM score_upd
  LIMIT 1 OVER(PARTITION BY url ORDER BY upd_dt DESC)
)
,
-- Input rows plus one padding row per url. Columns are listed explicitly so the
-- UNION ALL does not depend on the column order of either source.
with_newest AS (
  SELECT
    upd_dt
  , url
  , score
  FROM score_upd
  UNION ALL
  SELECT
    pad_dt
  , url
  , score
  FROM pad_newest
)
-- One output row per url and 1-day slice; 'const' (the documented default)
-- repeats the last known score for days that have no input row.
SELECT
  ts_dt::DATE                    AS upd_dt
, url                            AS url
, TS_FIRST_VALUE(score, 'const') AS score
FROM with_newest
TIMESERIES ts_dt AS '1 day' OVER (
  PARTITION BY url ORDER BY upd_dt::TIMESTAMP
)
-- Sort by the output columns (day, then url) by name rather than by position.
ORDER BY upd_dt, url
;
我的基础 table 如下所示:
-- Sample base table from the question, written as a CTE fragment
-- (it needs a leading WITH and a following SELECT to run on its own).
-- One row per update date and URL; Score is a string value.
score_upd (Upd_dt,Url,Score) AS (
SELECT DATE '2019-07-26','A','x'
UNION ALL SELECT DATE '2019-07-26','B','alpha'
UNION ALL SELECT DATE '2019-08-01','A','y'
UNION ALL SELECT DATE '2019-08-01','B','beta'
UNION ALL SELECT DATE '2019-08-03','A','z'
UNION ALL SELECT DATE '2019-08-03','B','gamma'
)
Upd_dt URL Score
2019-07-26 A x
2019-07-26 B alpha
2019-08-01 A y
2019-08-01 B beta
2019-08-03 A z
2019-08-03 B gamma
并且我想在 daily-url 级别创建一个 table,使用最近日期的新行值,结果应如下所示:
-- Desired result from the question, in the same CTE-fragment form:
-- one row for every calendar day and URL. Days without an update repeat the
-- score of the latest earlier update (e.g. 07-27 .. 07-31 keep 'x' / 'alpha').
score_upd (Upd_dt,Url,Score) AS (
SELECT DATE '2019-07-26','A','x'
UNION ALL SELECT DATE '2019-07-26','B','alpha'
UNION ALL SELECT DATE '2019-07-27','A','x'
UNION ALL SELECT DATE '2019-07-27','B','alpha'
UNION ALL SELECT DATE '2019-07-28','A','x'
UNION ALL SELECT DATE '2019-07-28','B','alpha'
UNION ALL SELECT DATE '2019-07-29','A','x'
UNION ALL SELECT DATE '2019-07-29','B','alpha'
UNION ALL SELECT DATE '2019-07-30','A','x'
UNION ALL SELECT DATE '2019-07-30','B','alpha'
UNION ALL SELECT DATE '2019-07-31','A','x'
UNION ALL SELECT DATE '2019-07-31','B','alpha'
UNION ALL SELECT DATE '2019-08-01','A','y'
UNION ALL SELECT DATE '2019-08-01','B','beta'
UNION ALL SELECT DATE '2019-08-02','A','y'
UNION ALL SELECT DATE '2019-08-02','B','beta'
UNION ALL SELECT DATE '2019-08-03','A','z'
UNION ALL SELECT DATE '2019-08-03','B','gamma'
UNION ALL SELECT DATE '2019-08-04','A','z'
UNION ALL SELECT DATE '2019-08-04','B','gamma'
UNION ALL SELECT DATE '2019-08-05','A','z'
UNION ALL SELECT DATE '2019-08-05','B','gamma'
)
看起来像:
Upd_dt URL Score
2019-07-26 A x
2019-07-26 B alpha
2019-07-27 A x
2019-07-27 B alpha
2019-07-28 A x
2019-07-28 B alpha
2019-07-29 A x
2019-07-29 B alpha
2019-07-30 A x
2019-07-30 B alpha
2019-07-31 A x
2019-07-31 B alpha
2019-08-01 A y
2019-08-01 B beta
2019-08-02 A y
2019-08-02 B beta
2019-08-03 A z
2019-08-03 B gamma
2019-08-04 A z
2019-08-04 B gamma
2019-08-05 A z
2019-08-05 B gamma
.
.
.
我目前的做法是:我建立了一个从 2019 年 7 月 26 日到今天的每日维度表:
/* SELECT CAST(slice_time AS DATE) AS "日期" FROM test_calendar mtc TIMESERIES slice_time AS '1 day' OVER (ORDER BY CAST(mtc.dates AS TIMESTAMP)); */
所以我得到:
日期
2019-07-26
2019-07-27
2019-07-28
2019-07-29
.
.
.
2019-10-12(今天)
我在想,是否可以用 "interpolate previous value" 之类的功能,按日期把这个日历表与我的第一个表做连接(JOIN),用最近一个较早日期的数据来填充缺失的日期,但没有成功。
结果没有生成缺失天数的行。
如果有人对此有更好的想法,请告诉我。
谢谢!
先提个醒:只有在确实非常必要时才存储“每日快照”(daily photograph)。我以前遇到过每年多出 364 行冗余数据的情况,因为那些值一年只变化一次。在 Vertica 中,这会耗费许可证额度、CPU,以及连接和分组所需的运行时间……
不过除此之外,这是个不错的开始。
但是您可以应用 TIMESERIES 而无需构建日历。
诀窍是:对无法自动插值(INTERPOLATE)的部分,手动进行“外推”(extrapolate)。
添加一个内联的“填充”(padding)表,其中包含每个 URL 的最新值,但日期取 CURRENT_DATE 而不是实际的最新日期——借助 Vertica 特有的分析型 LIMIT 子句 LIMIT 1 OVER(PARTITION BY url ORDER BY upd_dt DESC)。
用 UNION ALL 将该填充表与您的输入合并,然后对合并后的结果应用 TIMESERIES 子句。
像这样:
-- Expand score_upd to one row per url and calendar day, from each url's first
-- update through CURRENT_DATE, repeating a score until the next update.
WITH
-- your input ...
score_upd (Upd_dt,Url,Score) AS (
            SELECT DATE '2019-07-26','A','x'
  UNION ALL SELECT DATE '2019-07-26','B','alpha'
  UNION ALL SELECT DATE '2019-08-01','A','y'
  UNION ALL SELECT DATE '2019-08-01','B','beta'
  UNION ALL SELECT DATE '2019-08-03','A','z'
  UNION ALL SELECT DATE '2019-08-03','B','gamma'
)
-- real WITH clause would start here ...
,
-- Newest row per url, re-dated to CURRENT_DATE, so the series gets an end point
-- at today. The analytic LIMIT keeps the first row of each url partition when
-- ordered by upd_dt descending.
-- The date column gets its own alias (pad_dt) so it can be selected by name and
-- cannot be confused with the upd_dt used inside the OVER() ordering.
-- NOTE(review): assumes at most one row per (url, upd_dt); with ties the row
-- that is kept is not determined by this ordering.
pad_newest AS (
  SELECT
    CURRENT_DATE AS pad_dt
  , url
  , score
  FROM score_upd
  LIMIT 1 OVER(PARTITION BY url ORDER BY upd_dt DESC)
)
,
-- Input rows plus one padding row per url. Columns are listed explicitly so the
-- UNION ALL does not depend on the column order of either source.
with_newest AS (
  SELECT
    upd_dt
  , url
  , score
  FROM score_upd
  UNION ALL
  SELECT
    pad_dt
  , url
  , score
  FROM pad_newest
)
-- One output row per url and 1-day slice; 'const' (the documented default)
-- repeats the last known score for days that have no input row.
SELECT
  ts_dt::DATE                    AS upd_dt
, url                            AS url
, TS_FIRST_VALUE(score, 'const') AS score
FROM with_newest
TIMESERIES ts_dt AS '1 day' OVER (
  PARTITION BY url ORDER BY upd_dt::TIMESTAMP
)
-- Sort by the output columns (day, then url) by name rather than by position.
ORDER BY upd_dt, url
;