SQL:根据最大完成时间戳选出正确的 user_id + parent_id 配对
SQL: select the correct user_id + parent_id pairing based on the MAX completed timestamp
以上是我的表中数据的示例。由于输入设计不佳,使用同一个电子邮件地址的人会有多个 user_id,而且这种情况会在多个电子邮件地址上重复出现。上面的例子只展示了其中一个电子邮件地址的情况。
有没有简单的 SQL 解决方案,可以根据最新的时间戳选出 user_id + selected_equivalent_parent_id?
以上面的示例为例,理想的 SQL 输出应该只返回第 1 行和第 2 行:对于 2 个不同的 selected_equivalent_parent_id,这两行中的 user_id 各自拥有最新的 max_completed_on_timestamp。
感谢任何关于如何最好地做到这一点的指导,谢谢!
不确定我是否应该使用另一个 MAX() 聚合、PARTITION BY,或者某种 LEFT JOIN 来排除所有其他行。
您分享的数据为:
-- Inline sample data: six rows for a single e-mail address, spread over three
-- user_ids and two selected_equivalent_parent_id values. Timestamps are string
-- literals; two rows have no completion timestamp (NULL).
-- Only the first SELECT carries aliases: UNION ALL takes its output column
-- names from the first branch.
WITH data AS (
    SELECT
        '10183244' AS user_id,
        'abc@gmail.com' AS dummy_email,
        'India' AS country_name,
        'IN' AS country_code,
        'APAC' AS region_name,
        '303806' AS selected_equivalent_parent_id,
        '2021-12-29 23:22:12.000+08:00' AS max_completed_on_timestamp
    UNION ALL
    SELECT '10183244', 'abc@gmail.com', 'India', 'IN', 'APAC', '308856', '2021-12-29 23:22:12.000+08:00'
    UNION ALL
    SELECT '3194303', 'abc@gmail.com', 'India', 'IN', 'APAC', '303806', '2021-12-29 19:41:12.000+08:00'
    UNION ALL
    SELECT '2980649', 'abc@gmail.com', 'India', 'IN', 'APAC', '303806', '2021-12-28 23:22:12.000+08:00'
    UNION ALL
    SELECT '2980649', 'abc@gmail.com', 'India', 'IN', 'APAC', '308856', NULL
    UNION ALL
    SELECT '3194303', 'abc@gmail.com', 'India', 'IN', 'APAC', '308856', NULL
)
我做了两个子查询:一个对所有相同的值进行分组,另一个只用于获取每个 selected_equivalent_parent_id 的最大时间戳:
-- d: one row per distinct (user, e-mail, location, parent id) combination,
--    carrying the greatest timestamp seen for that combination.
d AS (
    SELECT
        user_id,
        dummy_email,
        country_name,
        country_code,
        region_name,
        selected_equivalent_parent_id,
        MAX(max_completed_on_timestamp) AS max_completed_on_timestamp
    FROM data
    GROUP BY
        user_id,
        dummy_email,
        country_name,
        country_code,
        region_name,
        selected_equivalent_parent_id
),
-- t: greatest timestamp per selected_equivalent_parent_id.
t AS (
    SELECT
        selected_equivalent_parent_id,
        MAX(max_completed_on_timestamp) AS max_completed_on_timestamp
    FROM data
    GROUP BY selected_equivalent_parent_id
)
然后我加入了两个子查询,它们具有相同的完成时间戳和 selected_equivalent_parent_id
-- Keep only the rows of d whose parent id and timestamp both match a row of t.
-- Rows with a NULL timestamp on either side never satisfy the equality and
-- are therefore dropped.
SELECT
    d.user_id,
    d.dummy_email,
    d.country_name,
    d.country_code,
    d.region_name,
    d.selected_equivalent_parent_id,
    d.max_completed_on_timestamp
FROM t
INNER JOIN d
    ON d.selected_equivalent_parent_id = t.selected_equivalent_parent_id
    AND d.max_completed_on_timestamp = t.max_completed_on_timestamp
结果如下:
如果这些数据已经存放在表中,您的查询应该类似于下面这样:
-- Returns, for each selected_equivalent_parent_id, the user_id(s) whose
-- timestamp equals the greatest timestamp recorded for that parent id.
WITH
-- d: one row per distinct (user, e-mail, location, parent id) combination,
--    carrying the greatest timestamp seen for that combination.
d AS (
    SELECT
        user_id,
        dummy_email,
        country_name,
        country_code,
        region_name,
        selected_equivalent_parent_id,
        MAX(max_completed_on_timestamp) AS max_completed_on_timestamp
    FROM `dataset.table`
    GROUP BY
        user_id,
        dummy_email,
        country_name,
        country_code,
        region_name,
        selected_equivalent_parent_id
),
-- t: greatest timestamp per selected_equivalent_parent_id.
t AS (
    SELECT
        selected_equivalent_parent_id,
        MAX(max_completed_on_timestamp) AS max_completed_on_timestamp
    FROM `dataset.table`
    GROUP BY selected_equivalent_parent_id
)
-- An INNER JOIN filter gives the same rows whether it sits in ON or WHERE.
SELECT
    d.user_id,
    d.selected_equivalent_parent_id,
    d.max_completed_on_timestamp
FROM t
INNER JOIN d
    ON d.selected_equivalent_parent_id = t.selected_equivalent_parent_id
    AND d.max_completed_on_timestamp = t.max_completed_on_timestamp
该查询的结果如下:
编辑:
将虚拟电子邮件更改为:
-- Same query as before, run against sample rows whose dummy_email values
-- differ from row to row.
-- Only the first SELECT of the sample data carries aliases: UNION ALL takes
-- its output column names from the first branch.
WITH data AS (
    SELECT
        '10183244' AS user_id,
        '123abc@gmail.com' AS dummy_email,
        'India' AS country_name,
        'IN' AS country_code,
        'APAC' AS region_name,
        '303806' AS selected_equivalent_parent_id,
        '2021-12-29 23:22:12.000+08:00' AS max_completed_on_timestamp
    UNION ALL
    SELECT '10183244', 'abcxa@gmail.com', 'India', 'IN', 'APAC', '308856', '2021-12-29 23:22:12.000+08:00'
    UNION ALL
    SELECT '3194303', 'abcxa@gmail.com', 'India', 'IN', 'APAC', '303806', '2021-12-29 19:41:12.000+08:00'
    UNION ALL
    SELECT '2980649', 'abcasd@gmail.com', 'India', 'IN', 'APAC', '303806', '2021-12-28 23:22:12.000+08:00'
    UNION ALL
    SELECT '2980649', 'sadsabc@gmail.com', 'India', 'IN', 'APAC', '308856', NULL
    UNION ALL
    SELECT '3194303', 'tababc@gmail.com', 'India', 'IN', 'APAC', '308856', NULL
),
-- d: one row per distinct (user, e-mail, location, parent id) combination,
--    carrying the greatest timestamp seen for that combination.
d AS (
    SELECT
        user_id,
        dummy_email,
        country_name,
        country_code,
        region_name,
        selected_equivalent_parent_id,
        MAX(max_completed_on_timestamp) AS max_completed_on_timestamp
    FROM data
    GROUP BY
        user_id,
        dummy_email,
        country_name,
        country_code,
        region_name,
        selected_equivalent_parent_id
),
-- t: greatest timestamp per selected_equivalent_parent_id.
t AS (
    SELECT
        selected_equivalent_parent_id,
        MAX(max_completed_on_timestamp) AS max_completed_on_timestamp
    FROM data
    GROUP BY selected_equivalent_parent_id
)
-- Keep only the rows of d that hold the per-parent-id maximum timestamp.
SELECT
    d.user_id,
    d.dummy_email,
    d.country_name,
    d.country_code,
    d.region_name,
    d.selected_equivalent_parent_id,
    d.max_completed_on_timestamp
FROM t
INNER JOIN d
    ON d.selected_equivalent_parent_id = t.selected_equivalent_parent_id
    AND d.max_completed_on_timestamp = t.max_completed_on_timestamp
结果是一样的:
考虑以下简单方法
-- For each selected_equivalent_parent_id, return the whole source row (as a
-- struct value) that ranks first when ordered by max_completed_on_timestamp
-- descending.
-- LIMIT 1 inside ARRAY_AGG keeps only the top-ranked row per group instead of
-- materialising every row of the group just to read element 0.
SELECT AS VALUE
    ARRAY_AGG(t ORDER BY max_completed_on_timestamp DESC LIMIT 1)[OFFSET(0)]
FROM your_table AS t
GROUP BY selected_equivalent_parent_id
如果应用于您问题中的示例数据,输出为
以上是我的表中数据的示例。由于输入设计不佳,使用同一个电子邮件地址的人会有多个 user_id,而且这种情况会在多个电子邮件地址上重复出现。上面的例子只展示了其中一个电子邮件地址的情况。
有没有简单的 SQL 解决方案,可以根据最新的时间戳选出 user_id + selected_equivalent_parent_id?
以上面的示例为例,理想的 SQL 输出应该只返回第 1 行和第 2 行:对于 2 个不同的 selected_equivalent_parent_id,这两行中的 user_id 各自拥有最新的 max_completed_on_timestamp。
感谢任何关于如何最好地做到这一点的指导,谢谢!
不确定我是否应该使用另一个 MAX() 聚合、PARTITION BY,或者某种 LEFT JOIN 来排除所有其他行。
您分享的数据为:
-- Inline sample data: six rows for a single e-mail address, spread over three
-- user_ids and two selected_equivalent_parent_id values. Timestamps are string
-- literals; two rows have no completion timestamp (NULL).
-- Only the first SELECT carries aliases: UNION ALL takes its output column
-- names from the first branch.
WITH data AS (
    SELECT
        '10183244' AS user_id,
        'abc@gmail.com' AS dummy_email,
        'India' AS country_name,
        'IN' AS country_code,
        'APAC' AS region_name,
        '303806' AS selected_equivalent_parent_id,
        '2021-12-29 23:22:12.000+08:00' AS max_completed_on_timestamp
    UNION ALL
    SELECT '10183244', 'abc@gmail.com', 'India', 'IN', 'APAC', '308856', '2021-12-29 23:22:12.000+08:00'
    UNION ALL
    SELECT '3194303', 'abc@gmail.com', 'India', 'IN', 'APAC', '303806', '2021-12-29 19:41:12.000+08:00'
    UNION ALL
    SELECT '2980649', 'abc@gmail.com', 'India', 'IN', 'APAC', '303806', '2021-12-28 23:22:12.000+08:00'
    UNION ALL
    SELECT '2980649', 'abc@gmail.com', 'India', 'IN', 'APAC', '308856', NULL
    UNION ALL
    SELECT '3194303', 'abc@gmail.com', 'India', 'IN', 'APAC', '308856', NULL
)
我做了两个子查询:一个对所有相同的值进行分组,另一个只用于获取每个 selected_equivalent_parent_id 的最大时间戳:
d as(
-- Body of d: one row per distinct (user, e-mail, location, parent id)
-- combination, carrying the greatest timestamp seen for that combination.
SELECT
    user_id,
    dummy_email,
    country_name,
    country_code,
    region_name,
    selected_equivalent_parent_id,
    MAX(max_completed_on_timestamp) AS max_completed_on_timestamp
FROM data
GROUP BY
    user_id,
    dummy_email,
    country_name,
    country_code,
    region_name,
    selected_equivalent_parent_id
),
-- t: greatest timestamp per selected_equivalent_parent_id.
t AS (
    SELECT
        selected_equivalent_parent_id,
        MAX(max_completed_on_timestamp) AS max_completed_on_timestamp
    FROM data
    GROUP BY selected_equivalent_parent_id
)
然后我加入了两个子查询,它们具有相同的完成时间戳和 selected_equivalent_parent_id
-- Keep only the rows of d whose parent id and timestamp both match a row of t.
-- Rows with a NULL timestamp on either side never satisfy the equality and
-- are therefore dropped.
SELECT
    d.user_id,
    d.dummy_email,
    d.country_name,
    d.country_code,
    d.region_name,
    d.selected_equivalent_parent_id,
    d.max_completed_on_timestamp
FROM t
INNER JOIN d
    ON d.selected_equivalent_parent_id = t.selected_equivalent_parent_id
    AND d.max_completed_on_timestamp = t.max_completed_on_timestamp
结果如下:
如果这些数据已经存放在表中,您的查询应该类似于下面这样:
-- Returns, for each selected_equivalent_parent_id, the user_id(s) whose
-- timestamp equals the greatest timestamp recorded for that parent id.
WITH
-- d: one row per distinct (user, e-mail, location, parent id) combination,
--    carrying the greatest timestamp seen for that combination.
d AS (
    SELECT
        user_id,
        dummy_email,
        country_name,
        country_code,
        region_name,
        selected_equivalent_parent_id,
        MAX(max_completed_on_timestamp) AS max_completed_on_timestamp
    FROM `dataset.table`
    GROUP BY
        user_id,
        dummy_email,
        country_name,
        country_code,
        region_name,
        selected_equivalent_parent_id
),
-- t: greatest timestamp per selected_equivalent_parent_id.
t AS (
    SELECT
        selected_equivalent_parent_id,
        MAX(max_completed_on_timestamp) AS max_completed_on_timestamp
    FROM `dataset.table`
    GROUP BY selected_equivalent_parent_id
)
-- An INNER JOIN filter gives the same rows whether it sits in ON or WHERE.
SELECT
    d.user_id,
    d.selected_equivalent_parent_id,
    d.max_completed_on_timestamp
FROM t
INNER JOIN d
    ON d.selected_equivalent_parent_id = t.selected_equivalent_parent_id
    AND d.max_completed_on_timestamp = t.max_completed_on_timestamp
该查询的结果如下:
编辑:
将虚拟电子邮件更改为:
-- Same query as before, run against sample rows whose dummy_email values
-- differ from row to row.
-- Only the first SELECT of the sample data carries aliases: UNION ALL takes
-- its output column names from the first branch.
WITH data AS (
    SELECT
        '10183244' AS user_id,
        '123abc@gmail.com' AS dummy_email,
        'India' AS country_name,
        'IN' AS country_code,
        'APAC' AS region_name,
        '303806' AS selected_equivalent_parent_id,
        '2021-12-29 23:22:12.000+08:00' AS max_completed_on_timestamp
    UNION ALL
    SELECT '10183244', 'abcxa@gmail.com', 'India', 'IN', 'APAC', '308856', '2021-12-29 23:22:12.000+08:00'
    UNION ALL
    SELECT '3194303', 'abcxa@gmail.com', 'India', 'IN', 'APAC', '303806', '2021-12-29 19:41:12.000+08:00'
    UNION ALL
    SELECT '2980649', 'abcasd@gmail.com', 'India', 'IN', 'APAC', '303806', '2021-12-28 23:22:12.000+08:00'
    UNION ALL
    SELECT '2980649', 'sadsabc@gmail.com', 'India', 'IN', 'APAC', '308856', NULL
    UNION ALL
    SELECT '3194303', 'tababc@gmail.com', 'India', 'IN', 'APAC', '308856', NULL
),
-- d: one row per distinct (user, e-mail, location, parent id) combination,
--    carrying the greatest timestamp seen for that combination.
d AS (
    SELECT
        user_id,
        dummy_email,
        country_name,
        country_code,
        region_name,
        selected_equivalent_parent_id,
        MAX(max_completed_on_timestamp) AS max_completed_on_timestamp
    FROM data
    GROUP BY
        user_id,
        dummy_email,
        country_name,
        country_code,
        region_name,
        selected_equivalent_parent_id
),
-- t: greatest timestamp per selected_equivalent_parent_id.
t AS (
    SELECT
        selected_equivalent_parent_id,
        MAX(max_completed_on_timestamp) AS max_completed_on_timestamp
    FROM data
    GROUP BY selected_equivalent_parent_id
)
-- Keep only the rows of d that hold the per-parent-id maximum timestamp.
SELECT
    d.user_id,
    d.dummy_email,
    d.country_name,
    d.country_code,
    d.region_name,
    d.selected_equivalent_parent_id,
    d.max_completed_on_timestamp
FROM t
INNER JOIN d
    ON d.selected_equivalent_parent_id = t.selected_equivalent_parent_id
    AND d.max_completed_on_timestamp = t.max_completed_on_timestamp
结果是一样的:
考虑以下简单方法
-- For each selected_equivalent_parent_id, return the whole source row (as a
-- struct value) that ranks first when ordered by max_completed_on_timestamp
-- descending.
-- LIMIT 1 inside ARRAY_AGG keeps only the top-ranked row per group instead of
-- materialising every row of the group just to read element 0.
SELECT AS VALUE
    ARRAY_AGG(t ORDER BY max_completed_on_timestamp DESC LIMIT 1)[OFFSET(0)]
FROM your_table AS t
GROUP BY selected_equivalent_parent_id
如果应用于您问题中的示例数据,输出为