如何在 pig 或 hive 中使用 array_agg() 聚合函数
How to use array_agg() aggregate function in pig or hive
我有以下数据:
================================================================
session_id screen_name screen_launch_time
================================================================
990004916946605-1404157897784 screen1 1404157898275
990004916946605-1404157897784 screen2 1404157898337
990004947764274-1435162269418 screen1 1435162274044
990004947764274-1435162269418 screen3 1435162274081
我想使用 array_agg
函数获取以下格式的数据:
=========================================================
session_id screen_flow count
=========================================================
990004916946605-1404157897784 screen1->screen2 1
990004947764274-1435162269418 screen1->screen3 1
有没有人尝试编写 UDAF
或 python
脚本来实现 array_agg
函数中使用的逻辑?
请分享您的想法。
只需按 session_id
分组,连接 screen_name
,然后计算每组的记录数。如果你不想构建 brickhouse jar,你可以使用 collect_list()
而不是 collect()
(但我不推荐它)。
查询:
-- Register the Brickhouse jar and expose its collect() UDAF (an array_agg
-- equivalent for Hive).
add jar /path/to/jars/brickhouse-0.7.1.jar;
create temporary function collect as "brickhouse.udf.collect.CollectUDAF";

-- Build one '->'-joined screen_flow string per session, then count rows per
-- (session_id, screen_flow).
--
-- NOTE: collect() / collect_list() give no ordering guarantee, so the rows
-- are first sorted by screen_launch_time within each session (distribute by
-- sends all rows of a session to one reducer; sort by orders them there)
-- before being collected. This makes screen_flow deterministic.
select session_id, screen_flow
, count(*) count
from (
select session_id
, concat_ws('->', collect(screen_name)) screen_flow
from (
select session_id, screen_name, screen_launch_time
from db.table
distribute by session_id
sort by session_id, screen_launch_time
) sorted
group by session_id ) x
group by session_id, screen_flow
输出:
990004916946605-1404157897784 screen1->screen2 1
990004947764274-1435162269418 screen1->screen3 1
输入:-
990004916946605-1404157897784,screen1,1404157898275
990004916946605-1404157897784,screen2,1404157898337
990004947764274-1435162269418,screen1,1435162274044
990004947764274-1435162269418,screen3,1435162274081
以下是 Pig 版本的解答:
-- Load (session_id, screen_name, screen_launch_time) triples from CSV.
-- (Fixes the misspelled field alias 'screnn_launch_time'.)
records = LOAD '/user/user/inputfiles/session_id.txt' USING PigStorage(',') AS (session_id:chararray,screen_name:chararray,screen_launch_time:chararray);

-- Group all rows belonging to the same session.
rec_grped = GROUP records BY session_id;

-- For each session: order the screens by launch time so the flow string is
-- deterministic (a bare nested FOREACH gives no ordering guarantee), project
-- the screen names, then join them with '-->'. BagToString's default
-- separator is '_', which REPLACE rewrites to '-->'.
rec_each = FOREACH rec_grped
{
    rec_ordered = ORDER records BY screen_launch_time;
    rec_inner_each = FOREACH rec_ordered GENERATE screen_name;
    GENERATE group as session_id, REPLACE(BagToString(rec_inner_each),'_','-->') as screen_flow, 1 as cnt;
};

dump rec_each;
输出:-
990004916946605-1404157897784 screen1-->screen2 1
990004947764274-1435162269418 screen1-->screen3 1
我有以下数据:
================================================================
session_id screen_name screen_launch_time
================================================================
990004916946605-1404157897784 screen1 1404157898275
990004916946605-1404157897784 screen2 1404157898337
990004947764274-1435162269418 screen1 1435162274044
990004947764274-1435162269418 screen3 1435162274081
我想使用 array_agg
函数获取以下格式的数据:
=========================================================
session_id screen_flow count
=========================================================
990004916946605-1404157897784 screen1->screen2 1
990004947764274-1435162269418 screen1->screen3 1
有没有人尝试编写 UDAF
或 python
脚本来实现 array_agg
函数中使用的逻辑?
请分享您的想法。
只需按 session_id
分组,连接 screen_name
,然后计算每组的记录数。如果你不想构建 brickhouse jar,你可以使用 collect_list()
而不是 collect()
(但我不推荐它)。
查询:
-- Register the Brickhouse jar and create the collect() UDAF (an array_agg
-- equivalent for Hive) used in the query below.
add jar /path/to/jars/brickhouse-0.7.1.jar;
create temporary function collect as "brickhouse.udf.collect.CollectUDAF";
-- Inner query: one '->'-joined screen_flow string per session_id.
-- NOTE(review): collect() gives no ordering guarantee, so the screen order
-- inside screen_flow may not follow screen_launch_time — confirm whether
-- order matters before relying on this output.
-- Outer query: counts rows per (session_id, screen_flow); since the inner
-- query emits exactly one row per session_id, count is always 1 here.
select session_id, screen_flow
, count(*) count
from (
select session_id
, concat_ws('->', collect(screen_name)) screen_flow
from db.table
group by session_id ) x
group by session_id, screen_flow
输出:
990004916946605-1404157897784 screen1->screen2 1
990004947764274-1435162269418 screen1->screen3 1
输入:-
990004916946605-1404157897784,screen1,1404157898275
990004916946605-1404157897784,screen2,1404157898337
990004947764274-1435162269418,screen1,1435162274044
990004947764274-1435162269418,screen3,1435162274081
以下是 Pig 版本的解答:
-- Load CSV rows of (session_id, screen_name, launch time).
-- NOTE(review): the field alias 'screnn_launch_time' is misspelled
-- ('screen'); harmless here since the field is never referenced again.
records = LOAD '/user/user/inputfiles/session_id.txt' USING PigStorage(',') AS (session_id:chararray,screen_name:chararray,screnn_launch_time:chararray);
-- Group all rows of the same session into one bag.
rec_grped = GROUP records BY session_id;
-- Per session: project the screen names and join them with '-->'
-- (BagToString's default '_' separator is rewritten by REPLACE), emitting
-- a constant count of 1 per session.
-- NOTE(review): the bag is not ordered by launch time, so the screen order
-- in screen_flow is not guaranteed — add an ORDER ... BY inside the nested
-- FOREACH if order matters.
rec_each = FOREACH rec_grped
{
rec_inner_each = FOREACH records GENERATE screen_name;
GENERATE group as session_id, REPLACE(BagToString(rec_inner_each),'_','-->') as screen_flow, 1 as cnt;
};
dump rec_each;
输出:-
990004916946605-1404157897784 screen1-->screen2 1
990004947764274-1435162269418 screen1-->screen3 1