SQL:在时间窗口范围内进行左连接(LEFT JOIN)
SQL Left Join Within Timeframe Window
我有两个数据集:
dataset_a
time_stamp user group value
2021-06-20 12:48:24.521 A video 1
2021-06-15 12:50:24.521 A video 1
2021-06-10 12:48:24.521 A video 1
dataset_b
time_stamp user group label
2021-06-20 09:40:24.521 A video BA
2021-06-19 13:30:24.521 A video BB
2021-06-13 12:48:24.521 A video BC
2021-06-09 12:55:24.521 A video BD
我想创建一个数据集:当 dataset_b 中某行与 dataset_a 中某行的用户和组相同,并且其时间戳与 dataset_a 的时间戳相差不超过 1 天时,就视为匹配。以前有没有人做过类似的事情,例如 left join on dataset_b.timestamp between dataset_a.timestamp and date_add(dataset_a.timestamp,-1)?
我还希望写法足够灵活、便于修改,以便将来也可以测试 -7 天的时间窗口。
预期输出如下:
dataset_a
time_stamp user group value timestamp_b label
2021-06-20 12:48:24.521 A video 0.5 2021-06-20 09:40:24.521 BA
2021-06-20 12:48:24.521 A video 0.5 2021-06-19 13:30:24.521 BB
2021-06-15 12:50:24.521 A video 1 NULL NULL
2021-06-10 12:48:24.521 A video 1 2021-06-09 12:55:24.521 BD
JOIN 条件不必只使用相等运算符,因此:
-- Keep every dataset_a row and attach each dataset_b row for the same
-- user/group whose time_stamp lies in the closed one-day window that ends at
-- the dataset_a time_stamp; dataset_a rows without a match get NULLs.
SELECT *
FROM dataset_a AS a
LEFT JOIN dataset_b AS b
    ON  b.user = a.user
    AND b.group = a.group
    -- Both bounds are inclusive, exactly like BETWEEN low AND high.
    AND b.time_stamp >= a.time_stamp - INTERVAL '1 day'
    AND b.time_stamp <= a.time_stamp;
是一个有效的连接。
做了一些调整……与 Lukasz 的做法一样,您现在可以直接复制粘贴到 Snowflake 中运行 :-)
-- Self-contained demo: both inputs are built inline, so the statement can be
-- pasted and run as-is.
WITH dataset_a AS (
    SELECT CAST('2021-06-20 12:48:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 1 AS value
    UNION ALL
    SELECT CAST('2021-06-15 12:50:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 1 AS value
    UNION ALL
    SELECT CAST('2021-06-10 12:48:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 1 AS value
),
dataset_b AS (
    SELECT CAST('2021-06-19 13:30:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BB' AS label
    UNION ALL
    SELECT CAST('2021-06-13 12:48:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BC' AS label
    UNION ALL
    SELECT CAST('2021-06-09 12:55:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BD' AS label
    UNION ALL
    SELECT CAST('2021-06-20 09:40:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BA' AS label
)
-- Every dataset_a row is kept; dataset_b rows are attached when user and
-- groups agree and the dataset_b time_stamp falls in the closed one-day window
-- ending at the dataset_a time_stamp.
SELECT *
FROM dataset_a AS a
LEFT JOIN dataset_b AS b
    ON  b.user = a.user
    AND b.groups = a.groups
    AND b.time_stamp >= a.time_stamp - INTERVAL '1 day'
    AND b.time_stamp <= a.time_stamp;
添加了 avg(value) 来清理重复行(dups)……或者只需按您的键列添加窗口化的 avg:avg(dataset_a.value) over (partition by dataset_a.time_stamp, dataset_a.user, dataset_a.groups , dataset_b.user)
-- Self-contained demo: both inputs are built inline, so the statement can be
-- pasted and run as-is. To test a different look-back (e.g. 7 days), change the
-- INTERVAL literal in the join condition.
WITH dataset_a AS (
    SELECT CAST('2021-06-20 12:48:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 1 AS value
    UNION ALL
    SELECT CAST('2021-06-15 12:50:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 1 AS value
    UNION ALL
    SELECT CAST('2021-06-10 12:48:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 1 AS value
),
dataset_b AS (
    SELECT CAST('2021-06-19 13:30:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BB' AS label
    UNION ALL
    SELECT CAST('2021-06-13 12:48:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BC' AS label
    UNION ALL
    SELECT CAST('2021-06-09 12:55:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BD' AS label
    UNION ALL
    SELECT CAST('2021-06-20 09:40:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BA' AS label
)
SELECT
    a.time_stamp,
    a.user,
    a.groups,
    -- Split value evenly across the joined rows of each dataset_a row. An AVG
    -- grouped by every dataset_b column sees a single row per group and hands
    -- value back unchanged (1), whereas the expected output shows 0.5 for the
    -- row with two matches. An unmatched dataset_a row still yields one
    -- NULL-extended row, so the count is 1 and value is kept as-is.
    -- Assumes (time_stamp, user, groups) identifies a dataset_a row -- confirm
    -- for real data.
    a.value / COUNT(*) OVER (PARTITION BY a.time_stamp, a.user, a.groups) AS value,
    b.time_stamp AS timestamp_b,
    b.user,
    b.groups,
    b.label
FROM dataset_a AS a
LEFT JOIN dataset_b AS b
    ON  b.user = a.user
    AND b.groups = a.groups
    -- Closed one-day window ending at the dataset_a time_stamp.
    AND b.time_stamp >= a.time_stamp - INTERVAL '1 day'
    AND b.time_stamp <= a.time_stamp
-- Newest first, so the row order is deterministic.
ORDER BY a.time_stamp DESC, b.time_stamp DESC
我有两个数据集:
dataset_a
time_stamp user group value
2021-06-20 12:48:24.521 A video 1
2021-06-15 12:50:24.521 A video 1
2021-06-10 12:48:24.521 A video 1
dataset_b
time_stamp user group label
2021-06-20 09:40:24.521 A video BA
2021-06-19 13:30:24.521 A video BB
2021-06-13 12:48:24.521 A video BC
2021-06-09 12:55:24.521 A video BD
我想创建一个数据集:当 dataset_b 中某行与 dataset_a 中某行的用户和组相同,并且其时间戳与 dataset_a 的时间戳相差不超过 1 天时,就视为匹配。以前有没有人做过类似的事情,例如 left join on dataset_b.timestamp between dataset_a.timestamp and date_add(dataset_a.timestamp,-1)?
我还希望写法足够灵活、便于修改,以便将来也可以测试 -7 天的时间窗口。
预期输出如下:
dataset_a
time_stamp user group value timestamp_b label
2021-06-20 12:48:24.521 A video 0.5 2021-06-20 09:40:24.521 BA
2021-06-20 12:48:24.521 A video 0.5 2021-06-19 13:30:24.521 BB
2021-06-15 12:50:24.521 A video 1 NULL NULL
2021-06-10 12:48:24.521 A video 1 2021-06-09 12:55:24.521 BD
JOIN 条件不必只使用相等运算符,因此:
-- Keep every dataset_a row and attach each dataset_b row for the same
-- user/group whose time_stamp lies in the closed one-day window that ends at
-- the dataset_a time_stamp; dataset_a rows without a match get NULLs.
SELECT *
FROM dataset_a AS a
LEFT JOIN dataset_b AS b
    ON  b.user = a.user
    AND b.group = a.group
    -- Both bounds are inclusive, exactly like BETWEEN low AND high.
    AND b.time_stamp >= a.time_stamp - INTERVAL '1 day'
    AND b.time_stamp <= a.time_stamp;
是一个有效的连接。
做了一些调整……与 Lukasz 的做法一样,您现在可以直接复制粘贴到 Snowflake 中运行 :-)
-- Self-contained demo: both inputs are built inline, so the statement can be
-- pasted and run as-is.
WITH dataset_a AS (
    SELECT CAST('2021-06-20 12:48:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 1 AS value
    UNION ALL
    SELECT CAST('2021-06-15 12:50:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 1 AS value
    UNION ALL
    SELECT CAST('2021-06-10 12:48:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 1 AS value
),
dataset_b AS (
    SELECT CAST('2021-06-19 13:30:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BB' AS label
    UNION ALL
    SELECT CAST('2021-06-13 12:48:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BC' AS label
    UNION ALL
    SELECT CAST('2021-06-09 12:55:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BD' AS label
    UNION ALL
    SELECT CAST('2021-06-20 09:40:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BA' AS label
)
-- Every dataset_a row is kept; dataset_b rows are attached when user and
-- groups agree and the dataset_b time_stamp falls in the closed one-day window
-- ending at the dataset_a time_stamp.
SELECT *
FROM dataset_a AS a
LEFT JOIN dataset_b AS b
    ON  b.user = a.user
    AND b.groups = a.groups
    AND b.time_stamp >= a.time_stamp - INTERVAL '1 day'
    AND b.time_stamp <= a.time_stamp;
添加了 avg(value) 来清理重复行(dups)……或者只需按您的键列添加窗口化的 avg:avg(dataset_a.value) over (partition by dataset_a.time_stamp, dataset_a.user, dataset_a.groups , dataset_b.user)
-- Self-contained demo: both inputs are built inline, so the statement can be
-- pasted and run as-is. To test a different look-back (e.g. 7 days), change the
-- INTERVAL literal in the join condition.
WITH dataset_a AS (
    SELECT CAST('2021-06-20 12:48:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 1 AS value
    UNION ALL
    SELECT CAST('2021-06-15 12:50:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 1 AS value
    UNION ALL
    SELECT CAST('2021-06-10 12:48:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 1 AS value
),
dataset_b AS (
    SELECT CAST('2021-06-19 13:30:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BB' AS label
    UNION ALL
    SELECT CAST('2021-06-13 12:48:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BC' AS label
    UNION ALL
    SELECT CAST('2021-06-09 12:55:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BD' AS label
    UNION ALL
    SELECT CAST('2021-06-20 09:40:24.521' AS TIMESTAMP_LTZ) AS time_stamp, 'A' AS user, 'video' AS groups, 'BA' AS label
)
SELECT
    a.time_stamp,
    a.user,
    a.groups,
    -- Split value evenly across the joined rows of each dataset_a row. An AVG
    -- grouped by every dataset_b column sees a single row per group and hands
    -- value back unchanged (1), whereas the expected output shows 0.5 for the
    -- row with two matches. An unmatched dataset_a row still yields one
    -- NULL-extended row, so the count is 1 and value is kept as-is.
    -- Assumes (time_stamp, user, groups) identifies a dataset_a row -- confirm
    -- for real data.
    a.value / COUNT(*) OVER (PARTITION BY a.time_stamp, a.user, a.groups) AS value,
    b.time_stamp AS timestamp_b,
    b.user,
    b.groups,
    b.label
FROM dataset_a AS a
LEFT JOIN dataset_b AS b
    ON  b.user = a.user
    AND b.groups = a.groups
    -- Closed one-day window ending at the dataset_a time_stamp.
    AND b.time_stamp >= a.time_stamp - INTERVAL '1 day'
    AND b.time_stamp <= a.time_stamp
-- Newest first, so the row order is deterministic.
ORDER BY a.time_stamp DESC, b.time_stamp DESC