如何在 Hive 中让 json_tuple 返回 array<struct>
How to return array<struct> from json_tuple in hive
我有一张带有 JSON 列的 Hive 表。它是 ORC 格式,只有一列,里面存放的是 JSON 字符串。
- json_column
{
"type":"REGULAR",
"period":[
"ONCE_PER_FOUR_WEEK",
"ONCE_PER_SIX_WEEK",
"ONCE_PER_ONE_MONTH",
"ONCE_PER_TWO_MONTH",
"ONCE_PER_THREE_MONTH"
],
"count":[
"4",
"8",
"12"
],
"day":[
"SATURDAY",
"SUNDAY"
],
"content":[
{
"count":"2",
"value":5,
"unit":"PERCENT"
},
{
"count":"3",
"value":10,
"unit":"PERCENT"
}
]
}
我想把这一列拆分成五列。
type string,
period array<string>,
count array<string>,
day array<string>,
content array<struct<count :string, value :int, unit :string>>
首先,我用 json_tuple 把这一列拆分成多列:
-- Split the JSON string into five top-level columns with JSON_TUPLE.
-- JSON_TUPLE hands every extracted field back as a STRING, so period, count,
-- day and content still contain raw JSON array text after this step.
SELECT b.type    AS type,
       b.period  AS period,
       b.count   AS count,
       b.day     AS day,       -- must be one of the aliases declared in the AS list below
       b.content AS content
FROM sample_table a
LATERAL VIEW JSON_TUPLE(a.content, 'type', 'period', 'count', 'day', 'content') b
    AS type, period, count, day, content
我需要将 content 列转换为结构体数组,但 json_tuple 返回的是字符串值:
[{"count":"2","value":5,"unit":"PERCENT"},{"count":"3","value":10,"unit":"PERCENT"}]
如何将它从 string 转换为 array<struct<count :string, value :int, unit :string>>?
有什么想法吗?
遗憾的是,JSON_TUPLE 和 GET_JSON_OBJECT 只返回字符串。要在不使用自定义 UDF 的情况下转换 JSON 字符串,可以先解析字符串,再拆分(split)、展开(explode),最后重新组装出结构体和数组。
演示:
-- Demo: turn one JSON string into typed Hive columns without a custom UDF.
-- JSON_TUPLE only yields STRING values, so the arrays are rebuilt by hand:
--   * string arrays: strip the leading [" and the trailing "], then split on ","
--   * struct array : strip the outer [ ], split between "}" and "{", parse each
--                    object with JSON_TUPLE and gather the structs back together
-- Backslashes in the regex literals are doubled because Hive unescapes string
-- literals before the regex engine sees them; a single '\[' would arrive as '['.
WITH sample_table AS (
SELECT '{
"type":"REGULAR",
"period":[
"ONCE_PER_FOUR_WEEK",
"ONCE_PER_SIX_WEEK",
"ONCE_PER_ONE_MONTH",
"ONCE_PER_TWO_MONTH",
"ONCE_PER_THREE_MONTH"
],
"count":[
"4",
"8",
"12"
],
"day":[
"SATURDAY",
"SUNDAY"
],
"content":[
{
"count":"2",
"value":5,
"unit":"PERCENT"
},
{
"count":"3",
"value":10,
"unit":"PERCENT"
}
]
}' AS content
)
SELECT b.type AS type,
       -- array<string>: the trailing "] is anchored with $ so it cannot match inside a value
       split(regexp_replace(b.period, '^\\["|"\\]$', ''), '","') AS period,
       split(regexp_replace(b.count,  '^\\["|"\\]$', ''), '","') AS count,
       split(regexp_replace(b.day,    '^\\["|"\\]$', ''), '","') AS day,
       -- array<struct>: one named_struct per exploded JSON object
       -- NOTE(review): collect_list after GROUP BY may not keep the original
       -- element order - confirm if ordering matters to consumers.
       collect_list(
           named_struct('count', x.count,
                        'value', CAST(x.value AS INT),
                        'unit',  x.unit)
       ) AS content
FROM sample_table a
LATERAL VIEW JSON_TUPLE(a.content, 'type', 'period', 'count', 'day', 'content') b
    AS type, period, count, day, content
LATERAL VIEW explode(
    split(regexp_replace(b.content, '^\\[|\\]$', ''),  -- drop the outer [ ]
          '(?<=\\}),(?=\\{)')                           -- split only on a comma between } and {
) e AS str_struct
LATERAL VIEW JSON_TUPLE(e.str_struct, 'count', 'value', 'unit') x
    AS count, value, unit
-- Rows sharing the same type/period/count/day text collapse into one group.
GROUP BY b.type,
         b.period,
         b.count,
         b.day
结果:
type period count day content
REGULAR ["ONCE_PER_FOUR_WEEK","ONCE_PER_SIX_WEEK","ONCE_PER_ONE_MONTH","ONCE_PER_TWO_MONTH","ONCE_PER_THREE_MONTH"] ["4","8","12"] ["SATURDAY","SUNDAY"] [{"count":"2","value":5,"unit":"PERCENT"},{"count":"3","value":10,"unit":"PERCENT"}]
我有一张带有 JSON 列的 Hive 表。它是 ORC 格式,只有一列,里面存放的是 JSON 字符串。
- json_column
{
"type":"REGULAR",
"period":[
"ONCE_PER_FOUR_WEEK",
"ONCE_PER_SIX_WEEK",
"ONCE_PER_ONE_MONTH",
"ONCE_PER_TWO_MONTH",
"ONCE_PER_THREE_MONTH"
],
"count":[
"4",
"8",
"12"
],
"day":[
"SATURDAY",
"SUNDAY"
],
"content":[
{
"count":"2",
"value":5,
"unit":"PERCENT"
},
{
"count":"3",
"value":10,
"unit":"PERCENT"
}
]
}
我想把这一列拆分成五列。
type string,
period array<string>,
count array<string>,
day array<string>,
content array<struct<count :string, value :int, unit :string>>
首先,我用 json_tuple 把这一列拆分成多列:
-- Split the JSON string into five top-level columns with JSON_TUPLE.
-- JSON_TUPLE hands every extracted field back as a STRING, so period, count,
-- day and content still contain raw JSON array text after this step.
SELECT b.type    AS type,
       b.period  AS period,
       b.count   AS count,
       b.day     AS day,       -- must be one of the aliases declared in the AS list below
       b.content AS content
FROM sample_table a
LATERAL VIEW JSON_TUPLE(a.content, 'type', 'period', 'count', 'day', 'content') b
    AS type, period, count, day, content
我需要将 content 列转换为结构体数组,但 json_tuple 返回的是字符串值:
[{"count":"2","value":5,"unit":"PERCENT"},{"count":"3","value":10,"unit":"PERCENT"}]
如何将它从 string 转换为 array<struct<count :string, value :int, unit :string>>?
有什么想法吗?
遗憾的是,JSON_TUPLE 和 GET_JSON_OBJECT 只返回字符串。要在不使用自定义 UDF 的情况下转换 JSON 字符串,可以先解析字符串,再拆分(split)、展开(explode),最后重新组装出结构体和数组。
演示:
-- Demo: turn one JSON string into typed Hive columns without a custom UDF.
-- JSON_TUPLE only yields STRING values, so the arrays are rebuilt by hand:
--   * string arrays: strip the leading [" and the trailing "], then split on ","
--   * struct array : strip the outer [ ], split between "}" and "{", parse each
--                    object with JSON_TUPLE and gather the structs back together
-- Backslashes in the regex literals are doubled because Hive unescapes string
-- literals before the regex engine sees them; a single '\[' would arrive as '['.
WITH sample_table AS (
SELECT '{
"type":"REGULAR",
"period":[
"ONCE_PER_FOUR_WEEK",
"ONCE_PER_SIX_WEEK",
"ONCE_PER_ONE_MONTH",
"ONCE_PER_TWO_MONTH",
"ONCE_PER_THREE_MONTH"
],
"count":[
"4",
"8",
"12"
],
"day":[
"SATURDAY",
"SUNDAY"
],
"content":[
{
"count":"2",
"value":5,
"unit":"PERCENT"
},
{
"count":"3",
"value":10,
"unit":"PERCENT"
}
]
}' AS content
)
SELECT b.type AS type,
       -- array<string>: the trailing "] is anchored with $ so it cannot match inside a value
       split(regexp_replace(b.period, '^\\["|"\\]$', ''), '","') AS period,
       split(regexp_replace(b.count,  '^\\["|"\\]$', ''), '","') AS count,
       split(regexp_replace(b.day,    '^\\["|"\\]$', ''), '","') AS day,
       -- array<struct>: one named_struct per exploded JSON object
       -- NOTE(review): collect_list after GROUP BY may not keep the original
       -- element order - confirm if ordering matters to consumers.
       collect_list(
           named_struct('count', x.count,
                        'value', CAST(x.value AS INT),
                        'unit',  x.unit)
       ) AS content
FROM sample_table a
LATERAL VIEW JSON_TUPLE(a.content, 'type', 'period', 'count', 'day', 'content') b
    AS type, period, count, day, content
LATERAL VIEW explode(
    split(regexp_replace(b.content, '^\\[|\\]$', ''),  -- drop the outer [ ]
          '(?<=\\}),(?=\\{)')                           -- split only on a comma between } and {
) e AS str_struct
LATERAL VIEW JSON_TUPLE(e.str_struct, 'count', 'value', 'unit') x
    AS count, value, unit
-- Rows sharing the same type/period/count/day text collapse into one group.
GROUP BY b.type,
         b.period,
         b.count,
         b.day
结果:
type period count day content
REGULAR ["ONCE_PER_FOUR_WEEK","ONCE_PER_SIX_WEEK","ONCE_PER_ONE_MONTH","ONCE_PER_TWO_MONTH","ONCE_PER_THREE_MONTH"] ["4","8","12"] ["SATURDAY","SUNDAY"] [{"count":"2","value":5,"unit":"PERCENT"},{"count":"3","value":10,"unit":"PERCENT"}]