Hive:按属性将值聚合到 JSON 或 MAP 字段中
Hive: Aggregate values by attribute into a JSON or MAP field
我有一个 table 看起来像这样:
| user | attribute | value |
|--------|-------------|---------|
| 1 | A | 10 |
| 1 | A | 20 |
| 1 | B | 5 |
| 2 | B | 10 |
| 2 | B | 15 |
| 2 | C | 100 |
| 2 | C | 200 |
我想将此 table 按 user
分组,并将 value
字段的总和收集到 JSON 或以属性为键的 MAP,例如:
| user | sum_values_by_attribute |
|------|--------------------------|
| 1 | {"A": 30, "B": 15} |
| 2 | {"B": 25, "C": 300} |
有没有办法在 Hive 中做到这一点?
我发现了相关问题,但没有一个考虑对值求和的情况。
可以先按属性求和,user_id再用collect list。
请让我知道以下输出是否正常。
SQL下面-
-- Aggregate per-(user, attribute) sums into a "key:value" string array per user.
-- NOTE: `user` is a reserved word in Hive, hence the backticks.
select `user`,
       collect_list(concat(att, ':', cast(val as string))) as sum_values_by_attribute
from
(   -- inner aggregation: one row per (user, attribute) with the summed value
    -- fix: original said "group by u,att" — column `u` does not exist, and Hive
    -- does not allow grouping by a select-list alias; group by the real columns
    select `user`, `attribute` as att, sum(`value`) as val
    from tmp2
    group by `user`, `attribute`
) t  -- fix: alias renamed from tmp2 to avoid shadowing the source table name
group by `user`;
测试查询 -
-- Test fixture: recreate tmp2 with sample rows.
-- DROP first so the script is re-runnable.
drop table if exists tmp2;
create table tmp2 (`user` int, `attribute` string, `value` int);
-- One multi-row INSERT ... VALUES (Hive 0.14+) instead of seven separate
-- INSERT ... SELECT statements: a single job and a single output file.
insert into tmp2 values
    (1, 'A', 40),
    (1, 'A', 20),
    (1, 'B', 5),
    (2, 'C', 20),
    (1, 'B', 10),
    (2, 'B', 10),
    (2, 'C', 10);
-- Same query as above, run against the fixture data.
select `user`,
       collect_list(concat(att, ':', cast(val as string))) as sum_values_by_attribute
from
(   -- fix: "group by u,att" — `u` does not exist and `att` is a select alias,
    -- which Hive cannot group by; use the real column names
    select `user`, `attribute` as att, sum(`value`) as val
    from tmp2
    group by `user`, `attribute`
) t
group by `user`;
JSON 对应于 map<string, int>
的字符串可以仅使用 Hive 本机函数构建:先按用户、属性聚合求和,再把每个键值对拼接成 "key": value 形式并用 collect_set 聚合成数组,然后用 concat_ws 以逗号连接数组元素,最后在两端加上花括号。
演示:
-- Build a JSON-formatted map<string, int> string per user with native functions only.
with initial_data as (
    -- inline sample rows via stack(): 7 rows of (user, attribute, value)
    select stack(7,
        1, 'A', 40,
        1, 'A', 20,
        1, 'B', 5,
        2, 'B', 10,
        2, 'B', 15,
        2, 'C', 100,
        2, 'C', 200) as (`user`, attribute, value)
)
select `user`,
       concat('{',
              concat_ws(',', collect_set(concat('"', attribute, '": ', sum_value))),
              '}') as sum_values_by_attribute
from
(   -- one row per (user, attribute) carrying the summed value
    select `user`, attribute, sum(value) as sum_value
    from initial_data
    group by `user`, attribute
) agg
group by `user`;
结果(JSON 字符串):
user sum_values_by_attribute
1 {"A": 60,"B": 5}
2 {"B": 25,"C": 300}
注意:如果你在 Spark 上运行,可以再用 cast(... as map<string, int>) 把该字符串转换为真正的映射类型;Hive 不支持对复杂类型进行 cast。
另外 map<string, string>
可以仅使用本机函数轻松完成:构造相同的键值对数组,但不加双引号(如 A:10),使用 concat_ws
连接到逗号分隔的字符串并使用 str_to_map
函数转换为映射(跳过相同的 WITH CTE):
-- Produce a map<string, string> per user via str_to_map (native functions only).
select `user`,
       str_to_map(
           concat_ws(',', collect_set(concat(attribute, ':', sum_value)))
       ) as sum_values_by_attribute
from
(   -- one row per (user, attribute) carrying the summed value
    select `user`, attribute, sum(value) as sum_value
    from initial_data
    group by `user`, attribute
) agg
group by `user`;
结果 ( map ):
user sum_values_by_attribute
1 {"A":"60","B":"5"}
2 {"B":"25","C":"300"}
如果您需要 map<string, int>
,不幸的是,它无法仅用 Hive 本机函数完成,因为 str_to_map 返回的是 map<string, string>,而不是 map<string, int>。
.您可以尝试 brickhouse 收集功能:
-- Brickhouse collect UDAF returns a real map<string, int>, which native Hive cannot.
-- fix: ADD JAR does not expand '~' and takes an unquoted path — use an absolute path.
-- See https://github.com/klout/brickhouse for build/install instructions.
add jar /full/path/to/brickhouse-0.6.0.jar;
create temporary function collect as 'brickhouse.udf.collect.CollectUDAF';
select `user`,
       collect(attribute, sum_value) as sum_values_by_attribute
from
(   -- one row per (user, attribute) carrying the summed value
    select `user`, attribute, sum(value) as sum_value
    from initial_data
    group by `user`, attribute
) agg
group by `user`;
我有一个 table 看起来像这样:
| user | attribute | value |
|--------|-------------|---------|
| 1 | A | 10 |
| 1 | A | 20 |
| 1 | B | 5 |
| 2 | B | 10 |
| 2 | B | 15 |
| 2 | C | 100 |
| 2 | C | 200 |
我想将此 table 按 user
分组,并将 value
字段的总和收集到 JSON 或以属性为键的 MAP,例如:
| user | sum_values_by_attribute |
|------|--------------------------|
| 1 | {"A": 30, "B": 15} |
| 2 | {"B": 25, "C": 300} |
有没有办法在 Hive 中做到这一点?
我发现了相关问题,但没有一个考虑对值求和的情况。
可以先按属性求和,user_id再用collect list。
请让我知道以下输出是否正常。
SQL下面-
-- Aggregate per-(user, attribute) sums into a "key:value" string array per user.
select `user`,
       collect_list(concat(att, ':', cast(val as string))) as sum_values_by_attribute
from
(   -- fix: "group by u,att" — `u` does not exist and `att` is a select alias,
    -- which Hive cannot group by; group by the real columns instead
    select `user`, `attribute` as att, sum(`value`) as val
    from tmp2
    group by `user`, `attribute`
) t  -- fix: alias renamed from tmp2 to avoid shadowing the source table name
group by `user`;
测试查询 -
-- Test fixture: recreate tmp2 with sample rows (DROP first so it is re-runnable).
drop table if exists tmp2;
create table tmp2 (`user` int, `attribute` string, `value` int);
-- One multi-row INSERT ... VALUES (Hive 0.14+) instead of seven separate
-- INSERT ... SELECT statements: a single job and a single output file.
insert into tmp2 values
    (1, 'A', 40),
    (1, 'A', 20),
    (1, 'B', 5),
    (2, 'C', 20),
    (1, 'B', 10),
    (2, 'B', 10),
    (2, 'C', 10);
-- Same query as above, run against the fixture data.
select `user`,
       collect_list(concat(att, ':', cast(val as string))) as sum_values_by_attribute
from
(   -- fix: "group by u,att" — `u` does not exist and `att` is a select alias,
    -- which Hive cannot group by; use the real column names
    select `user`, `attribute` as att, sum(`value`) as val
    from tmp2
    group by `user`, `attribute`
) t
group by `user`;
JSON 对应于 map<string, int>
的字符串可以仅使用 Hive 本机函数构建:先按用户、属性聚合求和,再把每个键值对拼接成 "key": value 形式并用 collect_set 聚合成数组,然后用 concat_ws 以逗号连接数组元素,最后在两端加上花括号。
演示:
-- Build a JSON-formatted map<string, int> string per user with native functions only.
with initial_data as (
    -- inline sample rows via stack(): 7 rows of (user, attribute, value)
    select stack(7,
        1, 'A', 40,
        1, 'A', 20,
        1, 'B', 5,
        2, 'B', 10,
        2, 'B', 15,
        2, 'C', 100,
        2, 'C', 200) as (`user`, attribute, value)
)
select `user`,
       concat('{',
              concat_ws(',', collect_set(concat('"', attribute, '": ', sum_value))),
              '}') as sum_values_by_attribute
from
(   -- one row per (user, attribute) carrying the summed value
    select `user`, attribute, sum(value) as sum_value
    from initial_data
    group by `user`, attribute
) agg
group by `user`;
结果(JSON 字符串):
user sum_values_by_attribute
1 {"A": 60,"B": 5}
2 {"B": 25,"C": 300}
注意:如果你在 Spark 上运行,可以再用 cast(... as map<string, int>) 把该字符串转换为真正的映射类型;Hive 不支持对复杂类型进行 cast。
另外 map<string, string>
可以仅使用本机函数轻松完成:构造相同的键值对数组,但不加双引号(如 A:10),使用 concat_ws
连接到逗号分隔的字符串并使用 str_to_map
函数转换为映射(跳过相同的 WITH CTE):
-- Produce a map<string, string> per user via str_to_map (native functions only).
select `user`,
       str_to_map(
           concat_ws(',', collect_set(concat(attribute, ':', sum_value)))
       ) as sum_values_by_attribute
from
(   -- one row per (user, attribute) carrying the summed value
    select `user`, attribute, sum(value) as sum_value
    from initial_data
    group by `user`, attribute
) agg
group by `user`;
结果 ( map ):
user sum_values_by_attribute
1 {"A":"60","B":"5"}
2 {"B":"25","C":"300"}
如果您需要 map<string, int>,不幸的是,它无法仅用 Hive 本机函数完成,因为 str_to_map 返回的是 map<string, string>,而不是 map<string, int>。
.您可以尝试 brickhouse 收集功能:
-- Brickhouse collect UDAF returns a real map<string, int>, which native Hive cannot.
-- fix: ADD JAR does not expand '~' and takes an unquoted path — use an absolute path.
-- See https://github.com/klout/brickhouse for build/install instructions.
add jar /full/path/to/brickhouse-0.6.0.jar;
create temporary function collect as 'brickhouse.udf.collect.CollectUDAF';
select `user`,
       collect(attribute, sum_value) as sum_values_by_attribute
from
(   -- one row per (user, attribute) carrying the summed value
    select `user`, attribute, sum(value) as sum_value
    from initial_data
    group by `user`, attribute
) agg
group by `user`;