Hive:是否有更好的方法对列进行百分位排名?
Hive: Is there a better way to percentile rank a column?
目前,为了对 Hive 中的某一列做百分位排名,我使用的是类似下面的方法。我想按每个项目所处的百分位数对该列中的项目进行排名,理想情况下为每个项目分配一个 0 到 1 之间的值。下面的代码分配的是 0 到 9 的值:char_percentile_rank 为 0 表示该项目位于后 10%,为 9 表示该项目位于前 10%。有更好的方法吗?
-- Rank each item by the decile that its summed characteristic falls into.
-- Output columns: item, characteristic, char_percentile_rank
-- (0 = at or below the 10th-percentile threshold, 9 = above the 90th).
select item
     , characteristic
     , case when characteristic_value <= char_perc[0] then 0
            when characteristic_value <= char_perc[1] then 1
            when characteristic_value <= char_perc[2] then 2
            when characteristic_value <= char_perc[3] then 3
            when characteristic_value <= char_perc[4] then 4
            when characteristic_value <= char_perc[5] then 5
            when characteristic_value <= char_perc[6] then 6
            when characteristic_value <= char_perc[7] then 7
            when characteristic_value <= char_perc[8] then 8
            else 9
       end as char_percentile_rank
from (
    -- Unpack each "characteristic|item" entry at the FIRST '|'.
    -- The numeric part comes first because a formatted number never contains
    -- '|', so the split stays correct for negative values, exponent notation
    -- (e.g. 1.0E-7) and item ids that contain '-' or '|'.
    select substr(item_id, instr(item_id, '|') + 1) as item
         , substr(item_id, 1, instr(item_id, '|') - 1) as characteristic
           -- explicit numeric copy so the comparisons above do not rely on
           -- implicit string-to-double coercion
         , cast(substr(item_id, 1, instr(item_id, '|') - 1) as double) as characteristic_value
         , char_perc
    from (
        -- One row: every (characteristic, item) pair packed into an array,
        -- next to the nine decile thresholds. Packing the rows avoids a
        -- self-join against the aggregated thresholds.
        -- concat_ws is given explicit string casts rather than a raw numeric.
        select collect_set(concat_ws('|', cast(characteristic as string), cast(item as string))) as item_set
             , percentile(cast(characteristic as bigint), array(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) as char_perc
        from (
            -- One row per item with its total characteristic.
            select item
                 , sum(characteristic) as characteristic
            from table
            group by item
        ) t1
    ) t2
    lateral view explode(item_set) explodetable as item_id
) t3
注意:我必须使用 collect_set 来避免自连接,因为百分位数函数是聚合函数,会隐式执行 group by。
我发现百分位数函数非常慢(至少在这种用法中)。也许手动计算百分位数会更好?
尝试去掉其中一个派生表:把 PERCENTILE 作为窗口函数(over ())使用,这样就不再需要 collect_set 和 explode。
-- Rank each item by the decile that its summed characteristic falls into.
-- percentile() is used as a window aggregate with an empty over (), so every
-- row carries the same nine thresholds and no collect_set / explode round
-- trip is required.
select item
     , characteristic
     , case when characteristic <= char_perc[0] then 0
            when characteristic <= char_perc[1] then 1
            when characteristic <= char_perc[2] then 2
            when characteristic <= char_perc[3] then 3
            when characteristic <= char_perc[4] then 4
            when characteristic <= char_perc[5] then 5
            when characteristic <= char_perc[6] then 6
            when characteristic <= char_perc[7] then 7
            when characteristic <= char_perc[8] then 8
            else 9
       end as char_percentile_rank
from (
    -- The original select list had a doubled comma after "characteristic",
    -- which is a syntax error; each column is now separated by one comma.
    select item
         , characteristic
         , percentile(cast(characteristic as bigint), array(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) over () as char_perc
    from (
        -- One row per item with its total characteristic.
        select item
             , sum(characteristic) as characteristic
        from table
        group by item
    ) t1
) t2
目前,为了对 Hive 中的某一列做百分位排名,我使用的是类似下面的方法。我想按每个项目所处的百分位数对该列中的项目进行排名,理想情况下为每个项目分配一个 0 到 1 之间的值。下面的代码分配的是 0 到 9 的值:char_percentile_rank 为 0 表示该项目位于后 10%,为 9 表示该项目位于前 10%。有更好的方法吗?
-- Rank each item by the decile that its summed characteristic falls into.
-- Output columns: item, characteristic, char_percentile_rank
-- (0 = at or below the 10th-percentile threshold, 9 = above the 90th).
select item
     , characteristic
     , case when characteristic_value <= char_perc[0] then 0
            when characteristic_value <= char_perc[1] then 1
            when characteristic_value <= char_perc[2] then 2
            when characteristic_value <= char_perc[3] then 3
            when characteristic_value <= char_perc[4] then 4
            when characteristic_value <= char_perc[5] then 5
            when characteristic_value <= char_perc[6] then 6
            when characteristic_value <= char_perc[7] then 7
            when characteristic_value <= char_perc[8] then 8
            else 9
       end as char_percentile_rank
from (
    -- Unpack each "characteristic|item" entry at the FIRST '|'.
    -- The numeric part comes first because a formatted number never contains
    -- '|', so the split stays correct for negative values, exponent notation
    -- (e.g. 1.0E-7) and item ids that contain '-' or '|'.
    select substr(item_id, instr(item_id, '|') + 1) as item
         , substr(item_id, 1, instr(item_id, '|') - 1) as characteristic
           -- explicit numeric copy so the comparisons above do not rely on
           -- implicit string-to-double coercion
         , cast(substr(item_id, 1, instr(item_id, '|') - 1) as double) as characteristic_value
         , char_perc
    from (
        -- One row: every (characteristic, item) pair packed into an array,
        -- next to the nine decile thresholds. Packing the rows avoids a
        -- self-join against the aggregated thresholds.
        -- concat_ws is given explicit string casts rather than a raw numeric.
        select collect_set(concat_ws('|', cast(characteristic as string), cast(item as string))) as item_set
             , percentile(cast(characteristic as bigint), array(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) as char_perc
        from (
            -- One row per item with its total characteristic.
            select item
                 , sum(characteristic) as characteristic
            from table
            group by item
        ) t1
    ) t2
    lateral view explode(item_set) explodetable as item_id
) t3
注意:我必须使用 collect_set 来避免自连接,因为百分位数函数是聚合函数,会隐式执行 group by。
我发现百分位数函数非常慢(至少在这种用法中)。也许手动计算百分位数会更好?
尝试去掉其中一个派生表:把 PERCENTILE 作为窗口函数(over ())使用,这样就不再需要 collect_set 和 explode。
-- Rank each item by the decile that its summed characteristic falls into.
-- percentile() is used as a window aggregate with an empty over (), so every
-- row carries the same nine thresholds and no collect_set / explode round
-- trip is required.
select item
     , characteristic
     , case when characteristic <= char_perc[0] then 0
            when characteristic <= char_perc[1] then 1
            when characteristic <= char_perc[2] then 2
            when characteristic <= char_perc[3] then 3
            when characteristic <= char_perc[4] then 4
            when characteristic <= char_perc[5] then 5
            when characteristic <= char_perc[6] then 6
            when characteristic <= char_perc[7] then 7
            when characteristic <= char_perc[8] then 8
            else 9
       end as char_percentile_rank
from (
    -- The original select list had a doubled comma after "characteristic",
    -- which is a syntax error; each column is now separated by one comma.
    select item
         , characteristic
         , percentile(cast(characteristic as bigint), array(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) over () as char_perc
    from (
        -- One row per item with its total characteristic.
        select item
             , sum(characteristic) as characteristic
        from table
        group by item
    ) t1
) t2