SQL:选出某一列中占比最高的值及其在其他列中对应的属性
SQL: Selecting top proportion of column and corresponding attributes in other columns
假设我有这样的数据
group, city, user,
nyc_frisbee, nyc, joe,
nyc_frisbee, nyc, ben,
nyc_frisbee, sf, tim,
sf_cooking, sf, tim,
sf_cooking, atl, jon,
我的目标是
group, top_city, prob
nyc_frisbee, nyc, .66
sf_cooking, sf, .5
sf_cooking, atl, .5
边缘情况:在上面这个目标 table 中,nyc_frisbee 只出现一次,因为它只有一个排名第一的城市;而 sf_cooking 出现了两次,因为 sf 和 atl 并列第一。
如何在 SQL 中完成此操作?
我目前拥有的:
-- Attempt from the question: for each group, report the most frequent city
-- and the ratio of the top frequency to the group's total frequency.
-- NOTE(review): "group" is a reserved word in most SQL dialects and likely needs quoting — confirm for the target engine.
-- NOTE(review): top/total divides two counts; an engine with integer division would truncate the result — confirm.
SELECT group, city, top/total AS prob
FROM (
-- Attaches the per-group maximum and per-group sum of freq to every (group, city) row.
SELECT
group,
city,
freq,
MAX(freq) OVER(PARTITION BY group) AS top,
SUM(freq) OVER(PARTITION BY group) AS total
FROM
-- Counts the rows with a non-NULL city for each (group, city) pair.
(SELECT group, city, COUNT(city) AS freq
FROM mytable
GROUP BY group, city) inner_query
) outer_query
-- Keeps only the rows whose freq equals the maximum freq of their group.
-- NOTE(review): the derived table is aliased outer_query, but this predicate references outer — looks like an alias mismatch; confirm.
WHERE outer.freq = outer.top
这不符合我的预期...
group, top_city, prob
nyc_frisbee, nyc, .66
nyc_frisbee, sf, .66
sf_cooking, sf, .5
sf_cooking, atl, .5
所以我为每个具有相同概率值的唯一 group/city 组合得到一行。
怎么样:
-- Top city (or cities, when tied) of each group together with its share of
-- the group's rows.
-- "group" is a reserved word, so it is written as a quoted identifier here
-- (ANSI double quotes; MySQL without ANSI_QUOTES needs backticks instead).
WITH x AS (
    -- One row per (group, city): the fraction of the group's rows that city has.
    -- The CAST keeps the division from truncating where int / int is integer division.
    SELECT
        "group",
        city,
        CAST(COUNT(*) AS FLOAT) / SUM(COUNT(*)) OVER (PARTITION BY "group") AS prob
    FROM mytable
    GROUP BY "group", city
)
SELECT
    x."group",
    x.city,
    x.prob
FROM x
INNER JOIN (
    -- Highest share reached by any city within each group.
    SELECT
        "group",
        MAX(prob) AS maxprob
    FROM x
    GROUP BY "group"
) AS y
    -- Matching on the maximum keeps every tied city, not just one of them.
    ON x."group" = y."group"
    AND x.prob = y.maxprob
我们将 group/cities 归结为
group, city, prob,
nyc_frisbee, nyc, 0.66,
nyc_frisbee, sf, 0.33,
sf_cooking, sf, 0.5,
sf_cooking, atl, 0.5,
并将其别名为 x,然后把它与自身按 group 聚合后的结果(每个 group 的 max(prob))做 join。
因为 sf_cooking 的最大值是 0.5,join 会匹配两次,从而保留并列的两行;
而 nyc_frisbee 的最大值是 0.66,只匹配一次,0.33 的那一行不会被包含。
看看下面的方法是否适合你。先计算概率,然后在每个 group 内按概率进行排名,最后取排名第一的去重行。
-- Top city (or cities, when tied) of each group together with its share of
-- the group's rows, using window functions only.
-- "group" is a reserved word, so it is written as a quoted identifier here
-- (ANSI double quotes; MySQL without ANSI_QUOTES needs backticks instead).
with x as (
    -- Every source row, annotated with its city's share of the group's rows.
    -- Multiplying by 1.0 keeps the division from truncating to an integer.
    select
        "group",
        city,
        count(*) over (partition by "group", city) * 1.0
            / count(*) over (partition by "group") as prob
    from t
),
r as (
    -- Rank the shares within each group; tied top cities all receive rn = 1.
    select
        "group",
        city,
        prob,
        dense_rank() over (partition by "group" order by prob desc) as rn
    from x
)
-- x still has one row per source row, so distinct collapses the repeats
-- of the same (group, city) pair.
select distinct
    "group",
    city,
    prob
from r
where rn = 1
假设我有这样的数据
group, city, user,
nyc_frisbee, nyc, joe,
nyc_frisbee, nyc, ben,
nyc_frisbee, sf, tim,
sf_cooking, sf, tim,
sf_cooking, atl, jon,
我的目标是
group, top_city, prob
nyc_frisbee, nyc, .66
sf_cooking, sf, .5
sf_cooking, atl, .5
边缘情况:在上面这个目标 table 中,nyc_frisbee 只出现一次,因为它只有一个排名第一的城市;而 sf_cooking 出现了两次,因为 sf 和 atl 并列第一。
如何在 SQL 中完成此操作?
我目前拥有的:
-- Attempt from the question: for each group, report the most frequent city
-- and the ratio of the top frequency to the group's total frequency.
-- NOTE(review): "group" is a reserved word in most SQL dialects and likely needs quoting — confirm for the target engine.
-- NOTE(review): top/total divides two counts; an engine with integer division would truncate the result — confirm.
SELECT group, city, top/total AS prob
FROM (
-- Attaches the per-group maximum and per-group sum of freq to every (group, city) row.
SELECT
group,
city,
freq,
MAX(freq) OVER(PARTITION BY group) AS top,
SUM(freq) OVER(PARTITION BY group) AS total
FROM
-- Counts the rows with a non-NULL city for each (group, city) pair.
(SELECT group, city, COUNT(city) AS freq
FROM mytable
GROUP BY group, city) inner_query
) outer_query
-- Keeps only the rows whose freq equals the maximum freq of their group.
-- NOTE(review): the derived table is aliased outer_query, but this predicate references outer — looks like an alias mismatch; confirm.
WHERE outer.freq = outer.top
这不符合我的预期...
group, top_city, prob
nyc_frisbee, nyc, .66
nyc_frisbee, sf, .66
sf_cooking, sf, .5
sf_cooking, atl, .5
所以我为每个具有相同概率值的唯一 group/city 组合得到一行。
怎么样:
-- Top city (or cities, when tied) of each group together with its share of
-- the group's rows.
-- "group" is a reserved word, so it is written as a quoted identifier here
-- (ANSI double quotes; MySQL without ANSI_QUOTES needs backticks instead).
WITH x AS (
    -- One row per (group, city): the fraction of the group's rows that city has.
    -- The CAST keeps the division from truncating where int / int is integer division.
    SELECT
        "group",
        city,
        CAST(COUNT(*) AS FLOAT) / SUM(COUNT(*)) OVER (PARTITION BY "group") AS prob
    FROM mytable
    GROUP BY "group", city
)
SELECT
    x."group",
    x.city,
    x.prob
FROM x
INNER JOIN (
    -- Highest share reached by any city within each group.
    SELECT
        "group",
        MAX(prob) AS maxprob
    FROM x
    GROUP BY "group"
) AS y
    -- Matching on the maximum keeps every tied city, not just one of them.
    ON x."group" = y."group"
    AND x.prob = y.maxprob
我们将 group/cities 归结为
group, city, prob,
nyc_frisbee, nyc, 0.66,
nyc_frisbee, sf, 0.33,
sf_cooking, sf, 0.5,
sf_cooking, atl, 0.5,
并将其别名为 x,然后把它与自身按 group 聚合后的结果(每个 group 的 max(prob))做 join。
因为 sf_cooking 的最大值是 0.5,join 会匹配两次,从而保留并列的两行;
而 nyc_frisbee 的最大值是 0.66,只匹配一次,0.33 的那一行不会被包含。
看看下面的方法是否适合你。先计算概率,然后在每个 group 内按概率进行排名,最后取排名第一的去重行。
-- Top city (or cities, when tied) of each group together with its share of
-- the group's rows, using window functions only.
-- "group" is a reserved word, so it is written as a quoted identifier here
-- (ANSI double quotes; MySQL without ANSI_QUOTES needs backticks instead).
with x as (
    -- Every source row, annotated with its city's share of the group's rows.
    -- Multiplying by 1.0 keeps the division from truncating to an integer.
    select
        "group",
        city,
        count(*) over (partition by "group", city) * 1.0
            / count(*) over (partition by "group") as prob
    from t
),
r as (
    -- Rank the shares within each group; tied top cities all receive rn = 1.
    select
        "group",
        city,
        prob,
        dense_rank() over (partition by "group" order by prob desc) as rn
    from x
)
-- x still has one row per source row, so distinct collapses the repeats
-- of the same (group, city) pair.
select distinct
    "group",
    city,
    prob
from r
where rn = 1