Hive SQL 使用 Where 子句的完全外部联接
Hive SQL Full Outer Join with Where Clause
我正在创建一个带有 where 子句的完整外部联接。但是,它只能生成内部连接结果。我怀疑这是由于 where 子句,但我确实需要添加 where 条件。那么我怎样才能创建一个同时满足两个需求(where 条件和完全外连接)的查询呢?这是我的查询。
-- Pairs t1 and t2 rows on the three key columns and keeps the pairs whose
-- dates lie within 5 days of each other.
-- The where clause is evaluated after the join: datediff() is NULL whenever
-- one of the dates is NULL, so every row the full outer join padded with
-- NULLs is filtered out and the result equals an inner join.
select
    t1.key1 as key1_1,
    t1.key2 as key2_1,
    t1.key3 as key3_1,
    t1.date as date_1,
    t1.v1,
    t2.key1 as key1_2,
    t2.key2 as key2_2,
    t2.key3 as key3_2,
    t2.date as date_2,
    t2.v2
from t1
full outer join t2
    on  t1.key1 = t2.key1
    and t1.key2 = t2.key2
    and t1.key3 = t2.key3
where datediff(t1.date, t2.date) between -5 and 5
;
示例数据
t1
key1 key2 key3 date v1
A1 B1 C1 2015-01-01 10
A1 B2 C2 2015-01-01 11
t2
key1 key2 key3 date v2
A1 B1 C1 2015-01-01 20
A1 B1 C1 2015-01-03 30
A1 B1 C1 2015-02-01 40
A1 B1 C1 NULL 50
A1 B1 C2 2015-01-02 60
想要的结果
key1_1 key2_1 key3_1 date_1 v1 key1_2 key2_2 key3_2 date_2 v2
A1 B1 C1 2015-01-01 10 A1 B1 C1 2015-01-01 20
A1 B1 C1 2015-01-01 10 A1 B1 C1 2015-01-03 30
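NULL NULL NULL NULL NULL A1 B1 C1 2015-02-01 40
NULL NULL NULL NULL NULL A1 B1 C1 NULL 50
NULL NULL NULL NULL NULL A1 B1 C2 2015-01-02 60
A1 B2 C2 2015-01-01 11 NULL NULL NULL NULL NULL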
这些都是我现在能想到的场景。如果我发现任何遗漏的场景,我可以添加。我的意思是应该包括以下结果:
- 如果两个 table 满足使用键和日期设置的所有条件,则它会包含在所需结果中,如第 1 行和第 2 行所示。
- 如果这些条件中的任何一个不满足,那么我们将在结果中保留一个 table 的信息,如所需结果中的第 3、4、5 和 6 行所示。
编辑:
根据@Gordon Linoff 的建议,我使用了 union all 来解决这个问题。请在下面的回答 post 中查看我的解决方案。
您可能只需要把这个条件移到 on 子句中:
-- The date-window test is part of the join condition here, so rows without a
-- partner on either side are kept by the full outer join instead of being
-- filtered afterwards.
-- NOTE(review): non-equality predicates in ON need a Hive version with complex
-- join condition support (2.2.0+ per the Hive manual) -- confirm for the target cluster.
from t1
full outer join t2
    on  t1.key1 = t2.key1
    and t1.key2 = t2.key2
    and t1.key3 = t2.key3
    and datediff(t1.date, t2.date) between -5 and 5
编辑:
如果上述方法不起作用,那么也许您可以用 union all 重写该查询:
-- Emulates the filtered full outer join with three disjoint branches.
-- Branch 1: key-matched pairs whose dates are at most 5 days apart.
select . . .
from t1
inner join t2
    on  t1.key1 = t2.key1
    and t1.key2 = t2.key2
    and t1.key3 = t2.key3
where datediff(t1.date, t2.date) between -5 and 5

union all

-- Branch 2: t1 rows that have no such partner in t2.
select . . .
from t1
where not exists (
    select 1
    from t2
    where t1.key1 = t2.key1
      and t1.key2 = t2.key2
      and t1.key3 = t2.key3
      and datediff(t1.date, t2.date) between -5 and 5
)

union all

-- Branch 3: t2 rows that have no such partner in t1.
select . . .
from t2
where not exists (
    select 1
    from t1
    where t1.key1 = t2.key1
      and t1.key2 = t2.key2
      and t1.key3 = t2.key3
      and datediff(t1.date, t2.date) between -5 and 5
);
我也不是 100% 确定 Hive 会接受这些关联子句。
正如您已经意识到的那样,问题在于 where 子句要求 t1.date 和 t2.date 都必须存在(非 NULL)。您只需要避免这一假设,例如:
-- Where-clause predicate: passes a joined row when either date is NULL or when
-- the two dates are at most 5 days apart.
-- NOTE(review): a key-matched pair with both dates present but more than 5 days
-- apart is still removed entirely (it does not survive as a one-sided row), and
-- a NULL-dated row stays paired with its key match -- confirm this is acceptable.
(t1.date is null) or (t2.date is null) or (datediff(t1.date, t2.date) between -5 and 5)
这是我根据@Gordon Linoff 在讨论环节中的建议对我自己的问题的解决方案。
-- t3 / t4: copies of t1 / t2 with an added row id, so that individual source
-- rows can be told apart in later steps. The window has no ordering, so the
-- numbering is arbitrary.
create table t3 as
select
    *,
    row_number() over () as id
from t1
;

create table t4 as
select
    *,
    row_number() over () as id
from t2
;
-- t5: every (t3, t4) pair that agrees on key1/key2/key3 and whose dates are
-- at most 5 days apart, carrying both row ids.
-- The date filter discards any row with a NULL date on either side, so the
-- former full outer join could never contribute an unmatched row; an inner
-- join returns the same rows and states the intent directly.
create table t5 as
select
    l.id   as id_1,
    l.key1 as key1_1,
    l.key2 as key2_1,
    l.key3 as key3_1,
    l.date as date_1,
    l.v1,
    r.id   as id_2,
    r.key1 as key1_2,
    r.key2 as key2_2,
    r.key3 as key3_2,
    r.date as date_2,
    r.v2
from t3 as l
inner join t4 as r
    on  l.key1 = r.key1
    and l.key2 = r.key2
    and l.key3 = r.key3
where datediff(l.date, r.date) between -5 and 5
;
-- Session setting kept from the original script; presumably it was needed for
-- the NOT IN subquery form, and other statements in the session may still
-- rely on it.
set hive.mapred.mode=nonstrict;

-- t6: rows of t3 that take part in no matched pair of t5, shaped like t5.
-- Written as a left anti-join: for a t3 row without a match every t5 column is
-- NULL, so the *_2 columns come out NULL but keep t5's column types. Bare
-- `null as col` literals would give the new table VOID-typed columns, which
-- newer Hive versions reject in CREATE TABLE AS SELECT.
create table t6 as
select
    l.id   as id_1,
    l.key1 as key1_1,
    l.key2 as key2_1,
    l.key3 as key3_1,
    l.date as date_1,
    l.v1,
    m.id_2,
    m.key1_2,
    m.key2_2,
    m.key3_2,
    m.date_2,
    m.v2
from t3 as l
left outer join t5 as m
    on l.id = m.id_1
where m.id_1 is null
;
-- t7: rows of t4 that take part in no matched pair of t5, shaped like t5.
-- Written as a left anti-join: for a t4 row without a match every t5 column is
-- NULL, so the *_1 columns come out NULL but keep t5's column types. Bare
-- `null as col` literals would give the new table VOID-typed columns, which
-- newer Hive versions reject in CREATE TABLE AS SELECT.
create table t7 as
select
    m.id_1,
    m.key1_1,
    m.key2_1,
    m.key3_1,
    m.date_1,
    m.v1,
    r.id   as id_2,
    r.key1 as key1_2,
    r.key2 as key2_2,
    r.key3 as key3_2,
    r.date as date_2,
    r.v2
from t4 as r
left outer join t5 as m
    on r.id = m.id_2
where m.id_2 is null
;
-- t8: final result -- matched pairs (t5), then t1-only rows (t6), then
-- t2-only rows (t7). The three inputs are created with the same twelve
-- columns in the same order; they are listed by name here.
create table t8 as
select id_1, key1_1, key2_1, key3_1, date_1, v1,
       id_2, key1_2, key2_2, key3_2, date_2, v2
from t5
union all
select id_1, key1_1, key2_1, key3_1, date_1, v1,
       id_2, key1_2, key2_2, key3_2, date_2, v2
from t6
union all
select id_1, key1_1, key2_1, key3_1, date_1, v1,
       id_2, key1_2, key2_2, key3_2, date_2, v2
from t7
;
我正在创建一个带有 where 子句的完整外部联接。但是,它只能生成内部连接结果。我怀疑这是由于 where 子句,但我确实需要添加 where 条件。那么我怎样才能创建一个同时满足两个需求(where 条件和完全外连接)的查询呢?这是我的查询。
-- Pairs t1 and t2 rows on the three key columns and keeps the pairs whose
-- dates lie within 5 days of each other.
-- The where clause is evaluated after the join: datediff() is NULL whenever
-- one of the dates is NULL, so every row the full outer join padded with
-- NULLs is filtered out and the result equals an inner join.
select
    t1.key1 as key1_1,
    t1.key2 as key2_1,
    t1.key3 as key3_1,
    t1.date as date_1,
    t1.v1,
    t2.key1 as key1_2,
    t2.key2 as key2_2,
    t2.key3 as key3_2,
    t2.date as date_2,
    t2.v2
from t1
full outer join t2
    on  t1.key1 = t2.key1
    and t1.key2 = t2.key2
    and t1.key3 = t2.key3
where datediff(t1.date, t2.date) between -5 and 5
;
示例数据
t1
key1 key2 key3 date v1
A1 B1 C1 2015-01-01 10
A1 B2 C2 2015-01-01 11
t2
key1 key2 key3 date v2
A1 B1 C1 2015-01-01 20
A1 B1 C1 2015-01-03 30
A1 B1 C1 2015-02-01 40
A1 B1 C1 NULL 50
A1 B1 C2 2015-01-02 60
想要的结果
key1_1 key2_1 key3_1 date_1 v1 key1_2 key2_2 key3_2 date_2 v2
A1 B1 C1 2015-01-01 10 A1 B1 C1 2015-01-01 20
A1 B1 C1 2015-01-01 10 A1 B1 C1 2015-01-03 30
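NULL NULL NULL NULL NULL A1 B1 C1 2015-02-01 40
NULL NULL NULL NULL NULL A1 B1 C1 NULL 50
NULL NULL NULL NULL NULL A1 B1 C2 2015-01-02 60
A1 B2 C2 2015-01-01 11 NULL NULL NULL NULL NULL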
这些都是我现在能想到的场景。如果我发现任何遗漏的场景,我可以添加。我的意思是应该包括以下结果:
- 如果两个 table 满足使用键和日期设置的所有条件,则它会包含在所需结果中,如第 1 行和第 2 行所示。
- 如果这些条件中的任何一个不满足,那么我们将在结果中保留一个 table 的信息,如所需结果中的第 3、4、5 和 6 行所示。
编辑: 根据@Gordon Linoff 的建议,我使用了 union all 来解决这个问题。请在下面的回答 post 中查看我的解决方案。
您可能只需要把这个条件移到 on 子句中:
-- The date-window test is part of the join condition here, so rows without a
-- partner on either side are kept by the full outer join instead of being
-- filtered afterwards.
-- NOTE(review): non-equality predicates in ON need a Hive version with complex
-- join condition support (2.2.0+ per the Hive manual) -- confirm for the target cluster.
from t1
full outer join t2
    on  t1.key1 = t2.key1
    and t1.key2 = t2.key2
    and t1.key3 = t2.key3
    and datediff(t1.date, t2.date) between -5 and 5
编辑:
如果上述方法不起作用,那么也许您可以用 union all 重写该查询:
-- Emulates the filtered full outer join with three disjoint branches.
-- Branch 1: key-matched pairs whose dates are at most 5 days apart.
select . . .
from t1
inner join t2
    on  t1.key1 = t2.key1
    and t1.key2 = t2.key2
    and t1.key3 = t2.key3
where datediff(t1.date, t2.date) between -5 and 5

union all

-- Branch 2: t1 rows that have no such partner in t2.
select . . .
from t1
where not exists (
    select 1
    from t2
    where t1.key1 = t2.key1
      and t1.key2 = t2.key2
      and t1.key3 = t2.key3
      and datediff(t1.date, t2.date) between -5 and 5
)

union all

-- Branch 3: t2 rows that have no such partner in t1.
select . . .
from t2
where not exists (
    select 1
    from t1
    where t1.key1 = t2.key1
      and t1.key2 = t2.key2
      and t1.key3 = t2.key3
      and datediff(t1.date, t2.date) between -5 and 5
);
我也不是 100% 确定 Hive 会接受这些关联子句。
正如您已经意识到的那样,问题在于 where 子句要求 t1.date 和 t2.date 都必须存在(非 NULL)。您只需要避免这一假设,例如:
-- Where-clause predicate: passes a joined row when either date is NULL or when
-- the two dates are at most 5 days apart.
-- NOTE(review): a key-matched pair with both dates present but more than 5 days
-- apart is still removed entirely (it does not survive as a one-sided row), and
-- a NULL-dated row stays paired with its key match -- confirm this is acceptable.
(t1.date is null) or (t2.date is null) or (datediff(t1.date, t2.date) between -5 and 5)
这是我根据@Gordon Linoff 在讨论环节中的建议对我自己的问题的解决方案。
-- t3 / t4: copies of t1 / t2 with an added row id, so that individual source
-- rows can be told apart in later steps. The window has no ordering, so the
-- numbering is arbitrary.
create table t3 as
select
    *,
    row_number() over () as id
from t1
;

create table t4 as
select
    *,
    row_number() over () as id
from t2
;
-- t5: every (t3, t4) pair that agrees on key1/key2/key3 and whose dates are
-- at most 5 days apart, carrying both row ids.
-- The date filter discards any row with a NULL date on either side, so the
-- former full outer join could never contribute an unmatched row; an inner
-- join returns the same rows and states the intent directly.
create table t5 as
select
    l.id   as id_1,
    l.key1 as key1_1,
    l.key2 as key2_1,
    l.key3 as key3_1,
    l.date as date_1,
    l.v1,
    r.id   as id_2,
    r.key1 as key1_2,
    r.key2 as key2_2,
    r.key3 as key3_2,
    r.date as date_2,
    r.v2
from t3 as l
inner join t4 as r
    on  l.key1 = r.key1
    and l.key2 = r.key2
    and l.key3 = r.key3
where datediff(l.date, r.date) between -5 and 5
;
-- Session setting kept from the original script; presumably it was needed for
-- the NOT IN subquery form, and other statements in the session may still
-- rely on it.
set hive.mapred.mode=nonstrict;

-- t6: rows of t3 that take part in no matched pair of t5, shaped like t5.
-- Written as a left anti-join: for a t3 row without a match every t5 column is
-- NULL, so the *_2 columns come out NULL but keep t5's column types. Bare
-- `null as col` literals would give the new table VOID-typed columns, which
-- newer Hive versions reject in CREATE TABLE AS SELECT.
create table t6 as
select
    l.id   as id_1,
    l.key1 as key1_1,
    l.key2 as key2_1,
    l.key3 as key3_1,
    l.date as date_1,
    l.v1,
    m.id_2,
    m.key1_2,
    m.key2_2,
    m.key3_2,
    m.date_2,
    m.v2
from t3 as l
left outer join t5 as m
    on l.id = m.id_1
where m.id_1 is null
;
-- t7: rows of t4 that take part in no matched pair of t5, shaped like t5.
-- Written as a left anti-join: for a t4 row without a match every t5 column is
-- NULL, so the *_1 columns come out NULL but keep t5's column types. Bare
-- `null as col` literals would give the new table VOID-typed columns, which
-- newer Hive versions reject in CREATE TABLE AS SELECT.
create table t7 as
select
    m.id_1,
    m.key1_1,
    m.key2_1,
    m.key3_1,
    m.date_1,
    m.v1,
    r.id   as id_2,
    r.key1 as key1_2,
    r.key2 as key2_2,
    r.key3 as key3_2,
    r.date as date_2,
    r.v2
from t4 as r
left outer join t5 as m
    on r.id = m.id_2
where m.id_2 is null
;
-- t8: final result -- matched pairs (t5), then t1-only rows (t6), then
-- t2-only rows (t7). The three inputs are created with the same twelve
-- columns in the same order; they are listed by name here.
create table t8 as
select id_1, key1_1, key2_1, key3_1, date_1, v1,
       id_2, key1_2, key2_2, key3_2, date_2, v2
from t5
union all
select id_1, key1_1, key2_1, key3_1, date_1, v1,
       id_2, key1_2, key2_2, key3_2, date_2, v2
from t6
union all
select id_1, key1_1, key2_1, key3_1, date_1, v1,
       id_2, key1_2, key2_2, key3_2, date_2, v2
from t7
;