如何在 Hive 中对数据分组,并将分组结果收集到一个 map 中?
How can I get group data and collect the result into a map in Hive?
我有一张这样的表:
id | job | school |
1 | programmer | school1 |
2 | programmer | school1 |
3 | programmer | school2 |
4 | pm | school3 |
5 | pm | school2 |
6 | pm | school3 |
我想做以下事情:
- 按工作分组
- 获取学校列表并统计,像这样[(school1, 2), (school2, 1)]
- 学校列表按计数从大到小排序,因此结果不能是 [(school2, 1), (school1, 2)]
例子的结果是:
programmer | [(school1, 2), (school2, 1)]
pm | [(school3, 2), (school2, 1)]
在 Hive 的 collect_set 中不能直接使用 Map(因为 collect_set 只允许原始数据类型)。
下面这 2 个查询都能得到您想要的结果(两者基本相同,区别只在于一个使用了 map,另一个没有):
-- External text-backed table holding one row per person: id, job title and school.
-- Fields are comma-separated and rows are newline-separated.
-- NOTE(review): Hive normally expects LOCATION to name a directory, while this path looks
-- like a single file -- confirm the HDFS layout before running.
CREATE EXTERNAL TABLE job_test (
    id     STRING,
    job    STRING,
    school STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/user/test/job.txt';
-- Per job, gather a set of school:count strings, routing each pair through a one-entry map.
-- Innermost query: number of rows per (job, school), cast to string for concat_ws.
-- Middle query: wraps each (school, cnt) pair in a map keyed by school.
-- Outer query: joins the map key and value with a colon and collects the strings per job.
-- NOTE(review): nothing here orders the collected elements, so the sorted-by-count
-- requirement is presumably not guaranteed by this query -- confirm.
SELECT
    school_maps.job,
    COLLECT_SET(
        CONCAT_WS(
            ':',
            MAP_KEYS(school_maps.school_map),
            MAP_VALUES(school_maps.school_map)
        )
    ) AS school_cnt
FROM (
    SELECT
        school_counts.job,
        MAP(school_counts.school, school_counts.cnt) AS school_map
    FROM (
        SELECT
            job,
            school,
            CAST(COUNT(1) AS STRING) AS cnt
        FROM job_test
        GROUP BY
            job,
            school
    ) school_counts
) school_maps
GROUP BY school_maps.job;
-- Per job, gather a set of school:count strings without the intermediate map.
-- Inner query: number of rows per (job, school), cast to string for concat_ws.
-- Outer query: joins school and count with a colon and collects the strings per job.
-- NOTE(review): nothing here orders the collected elements, so the sorted-by-count
-- requirement is presumably not guaranteed by this query -- confirm.
SELECT
    school_counts.job,
    COLLECT_SET(
        CONCAT_WS(':', school_counts.school, school_counts.cnt)
    ) AS school_cnt
FROM (
    SELECT
        job,
        school,
        CAST(COUNT(1) AS STRING) AS cnt
    FROM job_test
    GROUP BY
        job,
        school
) school_counts
GROUP BY school_counts.job;
希望对您有所帮助:)
只需添加 Brickhouse jar 并创建一个 collect() 函数:
-- Make the Brickhouse UDF jar (given as a path relative to the working directory) available to this Hive session.
add jar ./brickhouse-0.7.1.jar;
-- Register the Brickhouse CollectUDAF class as a temporary function named collect.
create temporary function collect as 'brickhouse.udf.collect.CollectUDAF';
-- Per job, collect (school -> row count) pairs into a map using the Brickhouse collect() UDAF.
-- Innermost query: number of rows per (job, school).
-- Middle query: orders the counted rows by job, then by count descending.
-- Outer query: folds each job's (school, c) pairs into one map.
-- Reads job_test instead of the placeholder name table, which is a reserved keyword.
-- NOTE(review): whether the outer aggregation keeps the subquery ordering depends on the
-- engine and on the UDAF implementation -- confirm the map really comes out sorted by count.
select job
     , collect(school, c) as school_count_map
from (
    select job
         , school
         , c
    from (
        select job
             , school
             , count(*) as c
        from job_test
        group by job, school
    ) x
    order by job, c desc
) y
group by job;
我有一张这样的表:
id | job | school |
1 | programmer | school1 |
2 | programmer | school1 |
3 | programmer | school2 |
4 | pm | school3 |
5 | pm | school2 |
6 | pm | school3 |
我想做以下事情:
- 按工作分组
- 获取学校列表并统计,像这样[(school1, 2), (school2, 1)]
- 学校列表按计数从大到小排序,因此结果不能是 [(school2, 1), (school1, 2)]
例子的结果是:
programmer | [(school1, 2), (school2, 1)]
pm | [(school3, 2), (school2, 1)]
在 Hive 的 collect_set 中不能直接使用 Map(因为 collect_set 只允许原始数据类型)。
下面这 2 个查询都能得到您想要的结果(两者基本相同,区别只在于一个使用了 map,另一个没有):
-- External text-backed table holding one row per person: id, job title and school.
-- Fields are comma-separated and rows are newline-separated.
-- NOTE(review): Hive normally expects LOCATION to name a directory, while this path looks
-- like a single file -- confirm the HDFS layout before running.
CREATE EXTERNAL TABLE job_test (
    id     STRING,
    job    STRING,
    school STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/user/test/job.txt';
-- Per job, gather a set of school:count strings, routing each pair through a one-entry map.
-- Innermost query: number of rows per (job, school), cast to string for concat_ws.
-- Middle query: wraps each (school, cnt) pair in a map keyed by school.
-- Outer query: joins the map key and value with a colon and collects the strings per job.
-- NOTE(review): nothing here orders the collected elements, so the sorted-by-count
-- requirement is presumably not guaranteed by this query -- confirm.
SELECT
    school_maps.job,
    COLLECT_SET(
        CONCAT_WS(
            ':',
            MAP_KEYS(school_maps.school_map),
            MAP_VALUES(school_maps.school_map)
        )
    ) AS school_cnt
FROM (
    SELECT
        school_counts.job,
        MAP(school_counts.school, school_counts.cnt) AS school_map
    FROM (
        SELECT
            job,
            school,
            CAST(COUNT(1) AS STRING) AS cnt
        FROM job_test
        GROUP BY
            job,
            school
    ) school_counts
) school_maps
GROUP BY school_maps.job;
-- Per job, gather a set of school:count strings without the intermediate map.
-- Inner query: number of rows per (job, school), cast to string for concat_ws.
-- Outer query: joins school and count with a colon and collects the strings per job.
-- NOTE(review): nothing here orders the collected elements, so the sorted-by-count
-- requirement is presumably not guaranteed by this query -- confirm.
SELECT
    school_counts.job,
    COLLECT_SET(
        CONCAT_WS(':', school_counts.school, school_counts.cnt)
    ) AS school_cnt
FROM (
    SELECT
        job,
        school,
        CAST(COUNT(1) AS STRING) AS cnt
    FROM job_test
    GROUP BY
        job,
        school
) school_counts
GROUP BY school_counts.job;
希望对您有所帮助:)
只需添加 Brickhouse jar 并创建一个 collect() 函数:
-- Make the Brickhouse UDF jar (given as a path relative to the working directory) available to this Hive session.
add jar ./brickhouse-0.7.1.jar;
-- Register the Brickhouse CollectUDAF class as a temporary function named collect.
create temporary function collect as 'brickhouse.udf.collect.CollectUDAF';
-- Per job, collect (school -> row count) pairs into a map using the Brickhouse collect() UDAF.
-- Innermost query: number of rows per (job, school).
-- Middle query: orders the counted rows by job, then by count descending.
-- Outer query: folds each job's (school, c) pairs into one map.
-- Reads job_test instead of the placeholder name table, which is a reserved keyword.
-- NOTE(review): whether the outer aggregation keeps the subquery ordering depends on the
-- engine and on the UDAF implementation -- confirm the map really comes out sorted by count.
select job
     , collect(school, c) as school_count_map
from (
    select job
         , school
         , c
    from (
        select job
             , school
             , count(*) as c
        from job_test
        group by job, school
    ) x
    order by job, c desc
) y
group by job;