Select JSON 数组中元素的前 3 名
Select top 3 by element in JSON arrays
源数据为
user_id video_interest
1 [{"category":"a","score":1},{"category":"b","score":2},{"category":"c","score":3},{"category":"d","score":4}]
2 [{"category":"e","score":1},{"category":"f","score":2},{"category":"g","score":-3}]
输出为
user_id video_interest_top3
1 [{"category":"d","score":4},{"category":"c","score":3},{"category":"b","score":2}]
2 [{"category":"f","score":2},{"category":"e","score":1}]
我需要先过滤出 score>0 的元素,然后按 score 降序为每个 user_id 选出前 3 个 video_interest。
先拆分 JSON 数组并提取每个元素的分数,然后计算每个用户的最高分(仅在需要按分数降序排列最终输出时才用得到),并用 row_number 按分数筛选出前 3 名;最后重新收集数组,必要时再拼接成 STRING。请参阅代码中的注释。由于最初不清楚究竟要对什么排序(数组内部还是最终输出),我对两者都做了排序;如果不需要,请删除按 max_score 的排序。
演示:
-- Top 3 positive-score interests per user, rebuilt as a JSON array string.
-- Notes:
--   * get_json_object returns STRING, so the score is cast to DOUBLE once and the
--     numeric value is used for filtering, ranking and sorting; comparing the raw
--     strings would rank '9' above '10'.
--   * Regex backslashes are doubled because the SQL string-literal parser consumes
--     one level of escaping before the pattern reaches the regex engine.
--   * max_score is numeric, so it prints as 4.0 / 2.0 for the demo rows.
with mytable as (
select stack(2,
1,'[{"category":"a","score":1},{"category":"b","score":2},{"category":"c","score":3},{"category":"d","score":4}]',
2,'[{"category":"e","score":1},{"category":"f","score":2},{"category":"g","score":-3}]'
) as (user_id,video_interest)
),
elements as ( --one row per JSON array element, with its score as a number
select s.user_id,
       vi.category_score,
       cast(get_json_object(vi.category_score, '$.score') as double) as score
from
(--prepare for exploding array
select user_id,
       regexp_replace(regexp_replace(video_interest,'\\[|\\]',''), --remove []
                      '\\},\\{', '},,,{') as video_interest --replace , between array elements with ,,, to split
from mytable
)s
--split and explode; users without elements never pass the score filter below
lateral view explode(split(s.video_interest,',,,')) vi as category_score
),
ranked as ( --filter positive scores, then rank numerically within each user
select user_id,
       category_score,
       score,
       row_number() over(partition by user_id order by score desc) as rn,
       max(score) over (partition by user_id) as max_score
from elements
where score > 0 --applied before the window functions are evaluated
)
select --collect array and convert to JSON string
user_id, max_score, concat('[',concat_ws(',',collect_list(category_score)),']') as video_interest
from
(
select user_id, category_score, max_score, score
from ranked
where rn<=3 --filter top 3
--Sort collection, remove if not necessary.
--NOTE(review): collect_list has no documented ordering guarantee; this relies on
--the rows reaching the aggregation in sort-by order - confirm on your engine.
distribute by user_id sort by score desc
)s
group by user_id, max_score
order by max_score desc --Sorting users by max_score desc, remove if not necessary
结果:
user_id max_score video_interest
1 4 [{"category":"d","score":4},{"category":"c","score":3},{"category":"b","score":2}]
2 2 [{"category":"f","score":2},{"category":"e","score":1}]
首先,我展开 video_interest,并把每个元素的 category 和 score 提取为单独的字段。
其次,我使用 row_number() 按 user_id 分区、按 score 降序编号,使每一行都带有它在组内的序号,然后只保留序号 <= 3 的行。
最后,我使用 collect_list() 将它们收集为一个列表,因为它们在使用 row_number 时是有序的(注意:Hive 并不保证 collect_list 保留该顺序,如需严格有序请显式排序)。
-- Top 3 positive-score interests per user, collected as a list of the exploded elements.
-- Reads table myData; the explode and the pos.category / pos.score accesses imply
-- video_interest is an array of structs with category and score fields.
with exploded as ( -- one row per array element
    select user_id,
           pos.category,
           pos.score,
           pos
    from myData
    lateral view explode(video_interest) e as pos
),
ranked as ( -- keep positive scores, number them per user by score descending
    select user_id,
           category,
           score,
           pos,
           row_number() over (partition by user_id order by score desc) as rNum
    from exploded
    where score > 0
)
select user_id,
       collect_list(pos) as first_video_interest_top3
from ranked
where rNum <= 3
group by user_id
源数据为
user_id video_interest
1 [{"category":"a","score":1},{"category":"b","score":2},{"category":"c","score":3},{"category":"d","score":4}]
2 [{"category":"e","score":1},{"category":"f","score":2},{"category":"g","score":-3}]
输出为
user_id video_interest_top3
1 [{"category":"d","score":4},{"category":"c","score":3},{"category":"b","score":2}]
2 [{"category":"f","score":2},{"category":"e","score":1}]
我需要先过滤出 score>0 的元素,然后按 score 降序为每个 user_id 选出前 3 个 video_interest。
先拆分 JSON 数组并提取每个元素的分数,然后计算每个用户的最高分(仅在需要按分数降序排列最终输出时才用得到),并用 row_number 按分数筛选出前 3 名;最后重新收集数组,必要时再拼接成 STRING。请参阅代码中的注释。由于最初不清楚究竟要对什么排序(数组内部还是最终输出),我对两者都做了排序;如果不需要,请删除按 max_score 的排序。
演示:
-- Top 3 positive-score interests per user, rebuilt as a JSON array string.
-- Notes:
--   * get_json_object returns STRING, so the score is cast to DOUBLE once and the
--     numeric value is used for filtering, ranking and sorting; comparing the raw
--     strings would rank '9' above '10'.
--   * Regex backslashes are doubled because the SQL string-literal parser consumes
--     one level of escaping before the pattern reaches the regex engine.
--   * max_score is numeric, so it prints as 4.0 / 2.0 for the demo rows.
with mytable as (
select stack(2,
1,'[{"category":"a","score":1},{"category":"b","score":2},{"category":"c","score":3},{"category":"d","score":4}]',
2,'[{"category":"e","score":1},{"category":"f","score":2},{"category":"g","score":-3}]'
) as (user_id,video_interest)
),
elements as ( --one row per JSON array element, with its score as a number
select s.user_id,
       vi.category_score,
       cast(get_json_object(vi.category_score, '$.score') as double) as score
from
(--prepare for exploding array
select user_id,
       regexp_replace(regexp_replace(video_interest,'\\[|\\]',''), --remove []
                      '\\},\\{', '},,,{') as video_interest --replace , between array elements with ,,, to split
from mytable
)s
--split and explode; users without elements never pass the score filter below
lateral view explode(split(s.video_interest,',,,')) vi as category_score
),
ranked as ( --filter positive scores, then rank numerically within each user
select user_id,
       category_score,
       score,
       row_number() over(partition by user_id order by score desc) as rn,
       max(score) over (partition by user_id) as max_score
from elements
where score > 0 --applied before the window functions are evaluated
)
select --collect array and convert to JSON string
user_id, max_score, concat('[',concat_ws(',',collect_list(category_score)),']') as video_interest
from
(
select user_id, category_score, max_score, score
from ranked
where rn<=3 --filter top 3
--Sort collection, remove if not necessary.
--NOTE(review): collect_list has no documented ordering guarantee; this relies on
--the rows reaching the aggregation in sort-by order - confirm on your engine.
distribute by user_id sort by score desc
)s
group by user_id, max_score
order by max_score desc --Sorting users by max_score desc, remove if not necessary
结果:
user_id max_score video_interest
1 4 [{"category":"d","score":4},{"category":"c","score":3},{"category":"b","score":2}]
2 2 [{"category":"f","score":2},{"category":"e","score":1}]
首先,我展开 video_interest,并把每个元素的 category 和 score 提取为单独的字段。其次,我使用 row_number() 按 user_id 分区、按 score 降序编号,使每一行都带有它在组内的序号,然后只保留序号 <= 3 的行。最后,我使用 collect_list() 将它们收集为一个列表,因为它们在使用 row_number
时是有序的
-- Top 3 positive-score interests per user, collected as a list of the exploded elements.
-- Reads table myData; the explode and the pos.category / pos.score accesses imply
-- video_interest is an array of structs with category and score fields.
with exploded as ( -- one row per array element
    select user_id,
           pos.category,
           pos.score,
           pos
    from myData
    lateral view explode(video_interest) e as pos
),
ranked as ( -- keep positive scores, number them per user by score descending
    select user_id,
           category,
           score,
           pos,
           row_number() over (partition by user_id order by score desc) as rNum
    from exploded
    where score > 0
)
select user_id,
       collect_list(pos) as first_video_interest_top3
from ranked
where rNum <= 3
group by user_id