Athena/Presto : 使用左连接取消嵌套 2 个数组
Athena/Presto : Unnest 2 arrays with left join
所以我有 2 个 Json 数组需要取消嵌套,并根据 json 结构中的键进行连接。
理论上很容易,但是没有'left join unnest'功能,一切都变得一团糟。
通过对结果进行分组,我已经达到了我想要的效果;但我也担心它正在执行 2 个交叉连接,在再次过滤掉它们之前有效地生成了数千个多余的行(在实时环境中)。
因此,我的问题实际上是在寻找一种更有效的策略来执行相同的逻辑。我很清楚我的 Presto 经验和知识还处于起步阶段!
感谢您的指导!
工作原理:
基本逻辑:
'left' 数组中的每一项都有一个 $.id 值。
对于 'left' 数组中的部分项,'right' 数组中会有一个 $.a.id 值与之相同的匹配项。
示例:
- 下面的第一个 SQL 及其结果展示了数据设置,但还不是所需的结果。
- 第二组,显示我当前的解决方案。
(1) Cross Join 的原始结果
-- Example (1): pair every element of the "left" array with every element of
-- the "right" array (full cross product) and expose both candidate join keys.
WITH cte AS (
    -- Single in-line sample record carrying both JSON arrays as text.
    SELECT
        123 AS record_id,
        '[ {"id":"01","key1":["val1"]}, {"id":"02","key1":["val2"]}, {"id":"03","key1":["val3"]} ]' AS "left",
        '[ {"a":{"id":"02","key1":["apples"]}, "b":{"lala":"bananas"}},{"a":{"id":"01","key1":["one"]}, "b":{"lala":"oneone"}} ]' AS "right"
)
SELECT
    cte.record_id,
    lhs.el AS "left",
    rhs.el AS "right",
    json_extract(lhs.el, '$.id') AS left_id,
    json_extract(rhs.el, '$.a.id') AS right_id
FROM cte
-- One row per element of the left array.
CROSS JOIN UNNEST(CAST(json_parse(cte."left") AS array(json))) AS lhs (el)
-- One row per element of the right array, for each left element.
CROSS JOIN UNNEST(CAST(json_parse(cte."right") AS array(json))) AS rhs (el)
输出:
record_id
left
right
left_id
right_id
123
{"id":"01","key1":["val1"]}
{"a":{"id":"02","key1":["apples"]},"b":{"lala":"bananas"}}
"01"
"02"
123
{"id":"01","key1":["val1"]}
{"a":{"id":"01","key1":["one"]},"b":{"lala":"oneone"}}
"01"
"01"
123
{"id":"02","key1":["val2"]}
{"a":{"id":"02","key1":["apples"]},"b":{"lala":"bananas"}}
"02"
"02"
123
{"id":"02","key1":["val2"]}
{"a":{"id":"01","key1":["one"]},"b":{"lala":"oneone"}}
"02"
"01"
123
{"id":"03","key1":["val3"]}
{"a":{"id":"02","key1":["apples"]},"b":{"lala":"bananas"}}
"03"
"02"
123
{"id":"03","key1":["val3"]}
{"a":{"id":"01","key1":["one"]},"b":{"lala":"oneone"}}
"03"
"01"
(2) 当前解
-- Example (2): collapse the cross product to one row per left element, keeping
-- the formatted right element whose $.a.id equals the left $.id (NULL if none).
-- Expects the `cte` from example (1) to be in scope.
SELECT
    cte.record_id,
    lhs.el AS "left",
    max(
        CASE
            WHEN json_extract(lhs.el, '$.id') = json_extract(rhs.el, '$.a.id')
                THEN json_format(rhs.el)
        END
    ) AS match
FROM cte
-- One row per element of the left array.
CROSS JOIN UNNEST(CAST(json_parse(cte."left") AS array(json))) AS lhs (el)
-- One row per element of the right array, for each left element.
CROSS JOIN UNNEST(CAST(json_parse(cte."right") AS array(json))) AS rhs (el)
GROUP BY
    cte.record_id,
    lhs.el
record_id
left
match
123
{"id":"01","key1":["val1"]}
{"a":{"id":"01","key1":["one"]},"b":{"lala":"oneone"}}
123
{"id":"02","key1":["val2"]}
{"a":{"id":"02","key1":["apples"]},"b":{"lala":"bananas"}}
123
{"id":"03","key1":["val3"]}
在各自的 CTE 中分别取消数组嵌套,然后对这两个 CTE 做左连接;这样可以消除交叉连接,只是代码会稍长一些:
-- Unnest each JSON array in its own CTE, then LEFT JOIN the two row sets on
-- record_id and the extracted id, so the arrays are never cross-joined with
-- each other.
WITH cte AS (
    -- Single in-line sample record carrying both JSON arrays as text.
    SELECT
        123 AS record_id,
        '[ {"id":"01","key1":["val1"]}, {"id":"02","key1":["val2"]}, {"id":"03","key1":["val3"]} ]' AS "left",
        '[ {"a":{"id":"02","key1":["apples"]}, "b":{"lala":"bananas"}},{"a":{"id":"01","key1":["one"]}, "b":{"lala":"oneone"}} ]' AS "right"
),
-- One row per element of the left array, with its $.id as the join key.
left_items AS (
    SELECT
        cte.record_id,
        lhs.el AS "left",
        json_extract(lhs.el, '$.id') AS left_id
    FROM cte
    CROSS JOIN UNNEST(CAST(json_parse(cte."left") AS array(json))) AS lhs (el)
),
-- One row per element of the right array, with its $.a.id as the join key.
right_items AS (
    SELECT
        cte.record_id,
        rhs.el AS "right",
        json_extract(rhs.el, '$.a.id') AS right_id
    FROM cte
    CROSS JOIN UNNEST(CAST(json_parse(cte."right") AS array(json))) AS rhs (el)
)
SELECT
    li.record_id,
    li."left",
    ri."right",
    li.left_id,
    ri.right_id
FROM left_items AS li
-- Left elements without a matching right element are kept with NULLs.
LEFT JOIN right_items AS ri
    ON li.record_id = ri.record_id
    AND li.left_id = ri.right_id
结果:
record_id
left
right
left_id
right_id
123
{"id":"01","key1":["val1"]}
{"a":{"id":"01","key1":["one"]},"b":{"lala":"oneone"}}
"01"
"01"
123
{"id":"02","key1":["val2"]}
{"a":{"id":"02","key1":["apples"]},"b":{"lala":"bananas"}}
"02"
"02"
123
{"id":"03","key1":["val3"]}
\N
"03"
\N
所以我有 2 个 Json 数组需要取消嵌套,并根据 json 结构中的键进行连接。 理论上很容易,但是没有'left join unnest'功能,一切都变得一团糟。
通过对结果进行分组,我已经达到了我想要的效果;但我也担心它正在执行 2 个交叉连接,在再次过滤掉它们之前有效地生成了数千个多余的行(在实时环境中)。
因此,我的问题实际上是在寻找一种更有效的策略来执行相同的逻辑。我很清楚我的 Presto 经验和知识还处于起步阶段!
感谢您的指导!
工作原理:
基本逻辑: 'left' 数组中的每一项都有一个 $.id 值。 对于 'left' 数组中的部分项,'right' 数组中会有一个 $.a.id 值与之相同的匹配项。
示例:
- 下面的第一个 SQL 及其结果展示了数据设置,但还不是所需的结果。
- 第二组,显示我当前的解决方案。
(1) Cross Join 的原始结果
-- Example (1): pair every element of the "left" array with every element of
-- the "right" array (full cross product) and expose both candidate join keys.
WITH cte AS (
    -- Single in-line sample record carrying both JSON arrays as text.
    SELECT
        123 AS record_id,
        '[ {"id":"01","key1":["val1"]}, {"id":"02","key1":["val2"]}, {"id":"03","key1":["val3"]} ]' AS "left",
        '[ {"a":{"id":"02","key1":["apples"]}, "b":{"lala":"bananas"}},{"a":{"id":"01","key1":["one"]}, "b":{"lala":"oneone"}} ]' AS "right"
)
SELECT
    cte.record_id,
    lhs.el AS "left",
    rhs.el AS "right",
    json_extract(lhs.el, '$.id') AS left_id,
    json_extract(rhs.el, '$.a.id') AS right_id
FROM cte
-- One row per element of the left array.
CROSS JOIN UNNEST(CAST(json_parse(cte."left") AS array(json))) AS lhs (el)
-- One row per element of the right array, for each left element.
CROSS JOIN UNNEST(CAST(json_parse(cte."right") AS array(json))) AS rhs (el)
输出:
record_id | left | right | left_id | right_id |
---|---|---|---|---|
123 | {"id":"01","key1":["val1"]} | {"a":{"id":"02","key1":["apples"]},"b":{"lala":"bananas"}} | "01" | "02" |
123 | {"id":"01","key1":["val1"]} | {"a":{"id":"01","key1":["one"]},"b":{"lala":"oneone"}} | "01" | "01" |
123 | {"id":"02","key1":["val2"]} | {"a":{"id":"02","key1":["apples"]},"b":{"lala":"bananas"}} | "02" | "02" |
123 | {"id":"02","key1":["val2"]} | {"a":{"id":"01","key1":["one"]},"b":{"lala":"oneone"}} | "02" | "01" |
123 | {"id":"03","key1":["val3"]} | {"a":{"id":"02","key1":["apples"]},"b":{"lala":"bananas"}} | "03" | "02" |
123 | {"id":"03","key1":["val3"]} | {"a":{"id":"01","key1":["one"]},"b":{"lala":"oneone"}} | "03" | "01" |
(2) 当前解
-- Example (2): collapse the cross product to one row per left element, keeping
-- the formatted right element whose $.a.id equals the left $.id (NULL if none).
-- Expects the `cte` from example (1) to be in scope.
SELECT
    cte.record_id,
    lhs.el AS "left",
    max(
        CASE
            WHEN json_extract(lhs.el, '$.id') = json_extract(rhs.el, '$.a.id')
                THEN json_format(rhs.el)
        END
    ) AS match
FROM cte
-- One row per element of the left array.
CROSS JOIN UNNEST(CAST(json_parse(cte."left") AS array(json))) AS lhs (el)
-- One row per element of the right array, for each left element.
CROSS JOIN UNNEST(CAST(json_parse(cte."right") AS array(json))) AS rhs (el)
GROUP BY
    cte.record_id,
    lhs.el
record_id | left | match |
---|---|---|
123 | {"id":"01","key1":["val1"]} | {"a":{"id":"01","key1":["one"]},"b":{"lala":"oneone"}} |
123 | {"id":"02","key1":["val2"]} | {"a":{"id":"02","key1":["apples"]},"b":{"lala":"bananas"}} |
123 | {"id":"03","key1":["val3"]} | |
在各自的 CTE 中分别取消数组嵌套,然后对这两个 CTE 做左连接;这样可以消除交叉连接,只是代码会稍长一些:
-- Unnest each JSON array in its own CTE, then LEFT JOIN the two row sets on
-- record_id and the extracted id, so the arrays are never cross-joined with
-- each other.
WITH cte AS (
    -- Single in-line sample record carrying both JSON arrays as text.
    SELECT
        123 AS record_id,
        '[ {"id":"01","key1":["val1"]}, {"id":"02","key1":["val2"]}, {"id":"03","key1":["val3"]} ]' AS "left",
        '[ {"a":{"id":"02","key1":["apples"]}, "b":{"lala":"bananas"}},{"a":{"id":"01","key1":["one"]}, "b":{"lala":"oneone"}} ]' AS "right"
),
-- One row per element of the left array, with its $.id as the join key.
left_items AS (
    SELECT
        cte.record_id,
        lhs.el AS "left",
        json_extract(lhs.el, '$.id') AS left_id
    FROM cte
    CROSS JOIN UNNEST(CAST(json_parse(cte."left") AS array(json))) AS lhs (el)
),
-- One row per element of the right array, with its $.a.id as the join key.
right_items AS (
    SELECT
        cte.record_id,
        rhs.el AS "right",
        json_extract(rhs.el, '$.a.id') AS right_id
    FROM cte
    CROSS JOIN UNNEST(CAST(json_parse(cte."right") AS array(json))) AS rhs (el)
)
SELECT
    li.record_id,
    li."left",
    ri."right",
    li.left_id,
    ri.right_id
FROM left_items AS li
-- Left elements without a matching right element are kept with NULLs.
LEFT JOIN right_items AS ri
    ON li.record_id = ri.record_id
    AND li.left_id = ri.right_id
结果:
record_id | left | right | left_id | right_id |
---|---|---|---|---|
123 | {"id":"01","key1":["val1"]} | {"a":{"id":"01","key1":["one"]},"b":{"lala":"oneone"}} | "01" | "01" |
123 | {"id":"02","key1":["val2"]} | {"a":{"id":"02","key1":["apples"]},"b":{"lala":"bananas"}} | "02" | "02" |
123 | {"id":"03","key1":["val3"]} | \N | "03" | \N |