多列、多表内连接
Multi-column and multi-table inner join
我需要对多张表执行内连接,这些表有两个公共列 org_id 和 time_stamp;
数据以 Avro 格式存放在 S3 中,并通过 Athena 查询。
我尝试了下面的查询:
-- Yearly sums of column1/column2/column3 for org 'org_id_test', restricted to
-- table1 rows in the ten years ending 2019-11-22T23:59:31.
-- NOTE(review): the joins match on (org_id, year) rather than on a row key, so
-- each table1 row is paired with every same-org, same-year row of table2 and
-- table3 BEFORE sum() runs. That fan-out is why the totals come out inflated.
SELECT
    year(from_iso8601_timestamp(em.time_stamp)) AS time_unit,
    sum(em.column1) AS column1,
    sum(spa.column2) AS column2,
    sum(vir.column3) AS column3
FROM "schemaName".table1 AS em
INNER JOIN "schemaName".table2 AS spa
    ON spa.org_id = em.org_id
    AND year(from_iso8601_timestamp(spa.time_stamp)) = year(from_iso8601_timestamp(em.time_stamp))
INNER JOIN "schemaName".table3 AS vir
    ON vir.org_id = spa.org_id
    AND year(from_iso8601_timestamp(vir.time_stamp)) = year(from_iso8601_timestamp(spa.time_stamp))
WHERE
    em.org_id = 'org_id_test'
    -- Only table1's timestamps are range-checked; table2/table3 rows are
    -- limited solely by the year/org join conditions above.
    AND from_iso8601_timestamp(em.time_stamp) >= (cast(from_iso8601_timestamp('2019-11-22T23:59:31') AS timestamp) - interval '10' year)
    AND from_iso8601_timestamp(em.time_stamp) <= cast(from_iso8601_timestamp('2019-11-22T23:59:31') AS timestamp)
GROUP BY
    em.org_id,
    year(from_iso8601_timestamp(em.time_stamp))
ORDER BY time_unit DESC
LIMIT 11
但我得到的结果看起来像是做了交叉连接:
results
time_unit |column1 |column2 |column3
1 2019 |48384 |299040 |712
而如果我在相同的 WHERE 条件下分别对每张表单独聚合,得到的值是:
table1
column1
504
table2
column2
280
table3
column3
5
有人可以帮我弄清楚我做错了什么以及实现它的正确方法吗?
如果我理解得没错,问题在于:每个连接条件都会匹配到多条记录,因此在聚合时同一条记录会被重复计算多次。
解决此问题的典型方法是先在子查询中分别聚合,然后再进行连接。
您要找的大概是这样的查询:
-- Yearly totals per table for one org, aggregated BEFORE joining so that each
-- source row is summed exactly once (joining the raw rows on org/year first
-- multiplies them and inflates the sums).
--
-- The org and time-window filters live inside each per-table aggregate:
--   * every table is restricted to the same window, the ten years ending
--     2019-11-22T23:59:31, instead of whole calendar years 2009-2019, which
--     would also count the rest of 2019 and the start of 2009;
--   * only the requested org is scanned and grouped, not every org.
-- Both sides of each comparison are "timestamp with time zone" values, so no
-- implicit timestamp coercion is involved.
-- NOTE(review): the inner joins drop any year that is missing from one of the
-- three tables; switch to left/full joins if such years must still appear.
with em as (
    -- table1: column1 summed per org and calendar year
    select
        org_id,
        year(from_iso8601_timestamp(time_stamp)) as time_unit,
        sum(column1) as column1
    from "schemaname".table1
    where
        org_id = 'org_id_test'
        and from_iso8601_timestamp(time_stamp) >= from_iso8601_timestamp('2019-11-22T23:59:31') - interval '10' year
        and from_iso8601_timestamp(time_stamp) <= from_iso8601_timestamp('2019-11-22T23:59:31')
    group by
        org_id,
        year(from_iso8601_timestamp(time_stamp))
),
spa as (
    -- table2: column2 summed per org and calendar year
    select
        org_id,
        year(from_iso8601_timestamp(time_stamp)) as time_unit,
        sum(column2) as column2
    from "schemaname".table2
    where
        org_id = 'org_id_test'
        and from_iso8601_timestamp(time_stamp) >= from_iso8601_timestamp('2019-11-22T23:59:31') - interval '10' year
        and from_iso8601_timestamp(time_stamp) <= from_iso8601_timestamp('2019-11-22T23:59:31')
    group by
        org_id,
        year(from_iso8601_timestamp(time_stamp))
),
vir as (
    -- table3: column3 summed per org and calendar year
    select
        org_id,
        year(from_iso8601_timestamp(time_stamp)) as time_unit,
        sum(column3) as column3
    from "schemaname".table3
    where
        org_id = 'org_id_test'
        and from_iso8601_timestamp(time_stamp) >= from_iso8601_timestamp('2019-11-22T23:59:31') - interval '10' year
        and from_iso8601_timestamp(time_stamp) <= from_iso8601_timestamp('2019-11-22T23:59:31')
    group by
        org_id,
        year(from_iso8601_timestamp(time_stamp))
)
select
    em.time_unit,
    em.column1,
    spa.column2,
    vir.column3
from em
inner join spa
    on spa.time_unit = em.time_unit
    and spa.org_id = em.org_id
inner join vir
    on vir.time_unit = em.time_unit
    and vir.org_id = em.org_id
order by em.time_unit desc
limit 11
我需要对多张表执行内连接,这些表有两个公共列 org_id 和 time_stamp;
数据以 Avro 格式存放在 S3 中,并通过 Athena 查询。
我尝试了下面的查询:
-- Yearly sums of column1/column2/column3 for org 'org_id_test', restricted to
-- table1 rows in the ten years ending 2019-11-22T23:59:31.
-- NOTE(review): the joins match on (org_id, year) rather than on a row key, so
-- each table1 row is paired with every same-org, same-year row of table2 and
-- table3 BEFORE sum() runs. That fan-out is why the totals come out inflated.
SELECT
    year(from_iso8601_timestamp(em.time_stamp)) AS time_unit,
    sum(em.column1) AS column1,
    sum(spa.column2) AS column2,
    sum(vir.column3) AS column3
FROM "schemaName".table1 AS em
INNER JOIN "schemaName".table2 AS spa
    ON spa.org_id = em.org_id
    AND year(from_iso8601_timestamp(spa.time_stamp)) = year(from_iso8601_timestamp(em.time_stamp))
INNER JOIN "schemaName".table3 AS vir
    ON vir.org_id = spa.org_id
    AND year(from_iso8601_timestamp(vir.time_stamp)) = year(from_iso8601_timestamp(spa.time_stamp))
WHERE
    em.org_id = 'org_id_test'
    -- Only table1's timestamps are range-checked; table2/table3 rows are
    -- limited solely by the year/org join conditions above.
    AND from_iso8601_timestamp(em.time_stamp) >= (cast(from_iso8601_timestamp('2019-11-22T23:59:31') AS timestamp) - interval '10' year)
    AND from_iso8601_timestamp(em.time_stamp) <= cast(from_iso8601_timestamp('2019-11-22T23:59:31') AS timestamp)
GROUP BY
    em.org_id,
    year(from_iso8601_timestamp(em.time_stamp))
ORDER BY time_unit DESC
LIMIT 11
但我得到的结果看起来像是做了交叉连接:
results
time_unit |column1 |column2 |column3
1 2019 |48384 |299040 |712
而如果我在相同的 WHERE 条件下分别对每张表单独聚合,得到的值是:
table1
column1
504
table2
column2
280
table3
column3
5
有人可以帮我弄清楚我做错了什么以及实现它的正确方法吗?
如果我理解得没错,问题在于:每个连接条件都会匹配到多条记录,因此在聚合时同一条记录会被重复计算多次。
解决此问题的典型方法是先在子查询中分别聚合,然后再进行连接。
您要找的大概是这样的查询:
-- Yearly totals per table for one org, aggregated BEFORE joining so that each
-- source row is summed exactly once (joining the raw rows on org/year first
-- multiplies them and inflates the sums).
--
-- The org and time-window filters live inside each per-table aggregate:
--   * every table is restricted to the same window, the ten years ending
--     2019-11-22T23:59:31, instead of whole calendar years 2009-2019, which
--     would also count the rest of 2019 and the start of 2009;
--   * only the requested org is scanned and grouped, not every org.
-- Both sides of each comparison are "timestamp with time zone" values, so no
-- implicit timestamp coercion is involved.
-- NOTE(review): the inner joins drop any year that is missing from one of the
-- three tables; switch to left/full joins if such years must still appear.
with em as (
    -- table1: column1 summed per org and calendar year
    select
        org_id,
        year(from_iso8601_timestamp(time_stamp)) as time_unit,
        sum(column1) as column1
    from "schemaname".table1
    where
        org_id = 'org_id_test'
        and from_iso8601_timestamp(time_stamp) >= from_iso8601_timestamp('2019-11-22T23:59:31') - interval '10' year
        and from_iso8601_timestamp(time_stamp) <= from_iso8601_timestamp('2019-11-22T23:59:31')
    group by
        org_id,
        year(from_iso8601_timestamp(time_stamp))
),
spa as (
    -- table2: column2 summed per org and calendar year
    select
        org_id,
        year(from_iso8601_timestamp(time_stamp)) as time_unit,
        sum(column2) as column2
    from "schemaname".table2
    where
        org_id = 'org_id_test'
        and from_iso8601_timestamp(time_stamp) >= from_iso8601_timestamp('2019-11-22T23:59:31') - interval '10' year
        and from_iso8601_timestamp(time_stamp) <= from_iso8601_timestamp('2019-11-22T23:59:31')
    group by
        org_id,
        year(from_iso8601_timestamp(time_stamp))
),
vir as (
    -- table3: column3 summed per org and calendar year
    select
        org_id,
        year(from_iso8601_timestamp(time_stamp)) as time_unit,
        sum(column3) as column3
    from "schemaname".table3
    where
        org_id = 'org_id_test'
        and from_iso8601_timestamp(time_stamp) >= from_iso8601_timestamp('2019-11-22T23:59:31') - interval '10' year
        and from_iso8601_timestamp(time_stamp) <= from_iso8601_timestamp('2019-11-22T23:59:31')
    group by
        org_id,
        year(from_iso8601_timestamp(time_stamp))
)
select
    em.time_unit,
    em.column1,
    spa.column2,
    vir.column3
from em
inner join spa
    on spa.time_unit = em.time_unit
    and spa.org_id = em.org_id
inner join vir
    on vir.time_unit = em.time_unit
    and vir.org_id = em.org_id
order by em.time_unit desc
limit 11