Postgres 按年龄组确定前 10 个域(排名 + 分组依据)
Postgres determine top 10 domains by age group (rank + group by)
给定包含电子邮件地址的 user_table
,我们需要按年龄段划分的 'top 10' 域列表。
所以对于每个组,我应该获得前 10 名的排名。 (即 50 行)。
到目前为止我有什么(我正在使用 Postgres)。
这个查询看起来很接近,但我认为并列排名把结果打乱了。我没有得到 50 行,而是只得到 12 行:基本上是排名 1-10,外加 2 个并列,而且全部属于同一个年龄组。如果把条件放宽到 r<=30,我会得到更多结果(71 行),其中包含不同的年龄组,但每组超过 10 行(每组 10-15 行)。
-- Query as posted in the question (kept unchanged): returns the
-- (agegroup, domain) pairs whose dense_rank by row count is 10 or lower.
with users as (
-- per user: age in whole years derived from dob, and the text after '@' in email
select a.*,
extract(year from age(dob)) age,
substr(email, position('@' in email)+1, 1000) domain
from user_table a
),
useragegroup as (
-- label each user with an age bucket; any age outside 0-65, and a NULL age,
-- falls through to the else branch and is labelled '66-up'
select a.*,
case when age between 0 and 18 then '0-18'
when age between 19 and 29 then '19-29'
when age between 30 and 49 then '30-49'
when age between 50 and 65 then '50-65'
else '66-up'
end agegroup
from users a
),
rank as (
-- NOTE: the window has no PARTITION BY, so ranks are assigned across all
-- (agegroup, domain) rows together rather than per age group, and
-- dense_rank gives rows with equal counts the same rank value
select agegroup, domain,
dense_rank() over (order by count(*) desc) r
from useragegroup a
group by agegroup, domain
)
select a.*
from rank a
where r<=10;
为了生成一些测试数据,我使用了下面的语句:
(针对每个年龄组,把日期改动 10 年后再执行一次)
-- Seed user_table with 500 synthetic users that all share one date of birth.
-- Each row gets a 3-character first/last name and an address of the form
-- user_<i>@<domain>.com, where <domain> is drawn from 15 fixed names.
-- To populate other age groups, rerun with a different dob literal.
insert into user_table (
    first, last, email, dob
)
select
    left(md5(i::text), 3),
    left(md5(random()::text), 3),
    'user_' || i || '@' || (
        -- floor() instead of a rounding cast, so each of the 15 keys (0..14)
        -- is equally likely; random() is in [0, 1), so the result never reaches 15
        case floor(random() * 15)::int
            when 0 then 'gmail'
            when 1 then 'hotmail'
            when 2 then 'apple'
            when 3 then 'icloud'
            when 4 then 'aol'
            when 5 then 'usa'
            when 6 then 'govt'
            when 7 then 'Whosebug'
            when 8 then 'random'
            when 9 then 'domain'
            when 10 then 'subby'
            when 11 then 'youtube'
            when 12 then 'google'
            when 13 then 'triple'
            when 14 then 'pixar'
        end
    ) || '.com' as email,
    -- typed literal, aliased after the column it fills
    date '2005-01-01' as dob
from generate_series(1, 500) s(i);
我认为这是因为你使用了 dense_rank:并列的行会得到重复的排名,所以返回的总行数会增加,如下表所示。
总行数:13 行
| agegroup | domain | r |
| -------- | ------------------ | -- |
| 66-up | youtube.com | 1 |
| 66-up | triple.com | 2 | <-- duplicate
| 66-up | google.com | 2 | <-- duplicate
| 66-up | random.com | 3 |
| 66-up | usa.com | 4 |
| 66-up | aol.com | 5 | <-- duplicate
| 66-up | subby.com | 5 | <-- duplicate
| 66-up | hotmail.com | 5 | <-- duplicate
| 66-up | whosebug.com | 6 |
| 66-up | apple.com | 7 |
| 66-up | domain.com | 8 |
| 66-up | icloud.com | 9 |
| 66-up | govt.com | 10 |
您的查询有两个问题:
1. 应该使用 row_number 而不是 dense_rank。dense_rank 会给并列的行分配相同的排名,因此当存在重复的 r 时,r <= 10 返回的行数会超过 10。
2. 窗口函数中必须使用 partition by agegroup,因为排名需要在每个年龄组内单独计算。
-- Top 10 email domains per age group, ranked by number of users.
-- Output columns: agegroup, domain, r (1 = most users within the age group).
with users as (
    -- per user: age in whole years and the email domain (text after '@')
    select
        a.*,
        extract(year from age(dob)) as age,
        substr(email, position('@' in email) + 1) as domain
    from user_table a
),
useragegroup as (
    -- bucket users by age; ages outside 0-65 and NULL ages fall into '66-up'
    select
        a.*,
        case
            when age between 0 and 18 then '0-18'
            when age between 19 and 29 then '19-29'
            when age between 30 and 49 then '30-49'
            when age between 50 and 65 then '50-65'
            else '66-up'
        end as agegroup
    from users a
),
rank as (
    -- number the domains inside each age group; domain is the tie-breaker so
    -- that domains with equal counts are always numbered in the same order
    select
        agegroup,
        domain,
        row_number() over (
            partition by agegroup
            order by count(*) desc, domain
        ) as r
    from useragegroup a
    group by agegroup, domain
)
select a.*
from rank a
where r <= 10
order by agegroup, r;
您的查询本身可能没什么问题,看上去没有特别明显的错误。但您的预期确实有问题:您期望结果恰好是 50 行,我认为这种情况非常少见。关键在于 rank 和 dense_rank 都不会生成唯一值:如果被排名的值在多行中相同,这些行都会得到相同的排名。两者的区别在于 rank 会在并列之后跳过相应的名次,而 dense_rank 不会。例如,如果前两行的值相同,而第三行的值不同,则结果如下:
+------------+-------------+------+------------+
| Row_number | Count_Value | Rank | Dense_Rank |
+------------+-------------+------+------------+
| 1 | 12 | 1 | 1 |
| 2 | 12 | 1 | 1 |
| 3 | 14 | 3 | 2 |
+------------+-------------+------+------------+
请查看这个使用“您的”数据制作的演示(原帖此处为链接)。它包括 rank (rnk) 和 dense_rank (drnk) 两列。向下浏览 rnk 和/或 drnk 列,找到您感兴趣的 age_group,然后查看 row_num:它就是该 age_group 返回的行数。请注意,某些 age_group 的 drnk 列不会达到 10;这些年龄组会返回全部 15 行——前提是随机选取的域名让每个域名都至少生成了一行,这种情况虽然很可能出现,但并不能保证。
顺便附上我的查询。我为 age_groups 创建了一张表,它也包含在演示中。
-- Compare rank(), dense_rank() and row_number() for email domains inside each
-- age group, ordered by the number of users per domain (highest first).
-- Output columns: domain, ag_name, dom_cnt, rnk, drnk, row_num.
with agdom as (
    -- email domain (text after '@') and age-group name for every user;
    -- assumes age_groups.ag_range is an int4range -- confirm against its definition
    select
        substr(ut.email, position('@' in ut.email) + 1) as domain,
        ag.ag_name
    from age_groups ag
    inner join user_table ut
        on (extract(year from age(ut.dob)))::int4 <@ ag.ag_range
),
domain_counts as (
    -- number of users for each (age group, domain) pair
    select
        domain,
        ag_name,
        count(*) as dom_cnt
    from agdom
    group by ag_name, domain
),
dom_rank as (
    -- the three numbering schemes side by side, all restarted per age group
    select
        domain,
        ag_name,
        dom_cnt,
        rank() over (partition by ag_name order by dom_cnt desc) as rnk,
        dense_rank() over (partition by ag_name order by dom_cnt desc) as drnk,
        row_number() over (partition by ag_name order by dom_cnt desc) as row_num
    from domain_counts
)
select
    domain,
    ag_name,
    dom_cnt,
    rnk,
    drnk,
    row_num
from dom_rank
-- no top-10 filter on purpose: every row is shown so the three columns can be
-- compared; filter on rnk, drnk or row_num <= 10 to keep only the top entries
order by ag_name, row_num;
给定包含电子邮件地址的 user_table
,我们需要按年龄段划分的 'top 10' 域列表。
所以对于每个组,我应该获得前 10 名的排名。 (即 50 行)。
下面是我目前写出的查询(我使用的是 Postgres)。这个查询看起来很接近,但我认为并列排名把结果打乱了。我没有得到 50 行,而是只得到 12 行:基本上是排名 1-10,外加 2 个并列,而且全部属于同一个年龄组。如果把条件放宽到 r<=30,我会得到更多结果(71 行),其中包含不同的年龄组,但每组超过 10 行(每组 10-15 行)。
-- Query as posted in the question (kept unchanged): returns the
-- (agegroup, domain) pairs whose dense_rank by row count is 10 or lower.
with users as (
-- per user: age in whole years derived from dob, and the text after '@' in email
select a.*,
extract(year from age(dob)) age,
substr(email, position('@' in email)+1, 1000) domain
from user_table a
),
useragegroup as (
-- label each user with an age bucket; any age outside 0-65, and a NULL age,
-- falls through to the else branch and is labelled '66-up'
select a.*,
case when age between 0 and 18 then '0-18'
when age between 19 and 29 then '19-29'
when age between 30 and 49 then '30-49'
when age between 50 and 65 then '50-65'
else '66-up'
end agegroup
from users a
),
rank as (
-- NOTE: the window has no PARTITION BY, so ranks are assigned across all
-- (agegroup, domain) rows together rather than per age group, and
-- dense_rank gives rows with equal counts the same rank value
select agegroup, domain,
dense_rank() over (order by count(*) desc) r
from useragegroup a
group by agegroup, domain
)
select a.*
from rank a
where r<=10;
为了生成一些测试数据,我使用了下面的语句:(针对每个年龄组,把日期改动 10 年后再执行一次)
-- Seed user_table with 500 synthetic users that all share one date of birth.
-- Each row gets a 3-character first/last name and an address of the form
-- user_<i>@<domain>.com, where <domain> is drawn from 15 fixed names.
-- To populate other age groups, rerun with a different dob literal.
insert into user_table (
    first, last, email, dob
)
select
    left(md5(i::text), 3),
    left(md5(random()::text), 3),
    'user_' || i || '@' || (
        -- floor() instead of a rounding cast, so each of the 15 keys (0..14)
        -- is equally likely; random() is in [0, 1), so the result never reaches 15
        case floor(random() * 15)::int
            when 0 then 'gmail'
            when 1 then 'hotmail'
            when 2 then 'apple'
            when 3 then 'icloud'
            when 4 then 'aol'
            when 5 then 'usa'
            when 6 then 'govt'
            when 7 then 'Whosebug'
            when 8 then 'random'
            when 9 then 'domain'
            when 10 then 'subby'
            when 11 then 'youtube'
            when 12 then 'google'
            when 13 then 'triple'
            when 14 then 'pixar'
        end
    ) || '.com' as email,
    -- typed literal, aliased after the column it fills
    date '2005-01-01' as dob
from generate_series(1, 500) s(i);
我认为这是因为你使用了 dense_rank:并列的行会得到重复的排名,所以返回的总行数会增加,如下表所示。
总行数:13 行
| agegroup | domain | r |
| -------- | ------------------ | -- |
| 66-up | youtube.com | 1 |
| 66-up | triple.com | 2 | <-- duplicate
| 66-up | google.com | 2 | <-- duplicate
| 66-up | random.com | 3 |
| 66-up | usa.com | 4 |
| 66-up | aol.com | 5 | <-- duplicate
| 66-up | subby.com | 5 | <-- duplicate
| 66-up | hotmail.com | 5 | <-- duplicate
| 66-up | whosebug.com | 6 |
| 66-up | apple.com | 7 |
| 66-up | domain.com | 8 |
| 66-up | icloud.com | 9 |
| 66-up | govt.com | 10 |
您的查询有两个问题:
1. 应该使用 row_number 而不是 dense_rank。dense_rank 会给并列的行分配相同的排名,因此当存在重复的 r 时,r <= 10 返回的行数会超过 10。
2. 窗口函数中必须使用 partition by agegroup,因为排名需要在每个年龄组内单独计算。
-- Top 10 email domains per age group, ranked by number of users.
-- Output columns: agegroup, domain, r (1 = most users within the age group).
with users as (
    -- per user: age in whole years and the email domain (text after '@')
    select
        a.*,
        extract(year from age(dob)) as age,
        substr(email, position('@' in email) + 1) as domain
    from user_table a
),
useragegroup as (
    -- bucket users by age; ages outside 0-65 and NULL ages fall into '66-up'
    select
        a.*,
        case
            when age between 0 and 18 then '0-18'
            when age between 19 and 29 then '19-29'
            when age between 30 and 49 then '30-49'
            when age between 50 and 65 then '50-65'
            else '66-up'
        end as agegroup
    from users a
),
rank as (
    -- number the domains inside each age group; domain is the tie-breaker so
    -- that domains with equal counts are always numbered in the same order
    select
        agegroup,
        domain,
        row_number() over (
            partition by agegroup
            order by count(*) desc, domain
        ) as r
    from useragegroup a
    group by agegroup, domain
)
select a.*
from rank a
where r <= 10
order by agegroup, r;
您的查询本身可能没什么问题,看上去没有特别明显的错误。但您的预期确实有问题:您期望结果恰好是 50 行,我认为这种情况非常少见。关键在于 rank 和 dense_rank 都不会生成唯一值:如果被排名的值在多行中相同,这些行都会得到相同的排名。两者的区别在于 rank 会在并列之后跳过相应的名次,而 dense_rank 不会。例如,如果前两行的值相同,而第三行的值不同,则结果如下:
+------------+-------------+------+------------+
| Row_number | Count_Value | Rank | Dense_Rank |
+------------+-------------+------+------------+
| 1 | 12 | 1 | 1 |
| 2 | 12 | 1 | 1 |
| 3 | 14 | 3 | 2 |
+------------+-------------+------+------------+
请查看这个使用“您的”数据制作的演示(原帖此处为链接)。它包括 rank (rnk) 和 dense_rank (drnk) 两列。向下浏览 rnk 和/或 drnk 列,找到您感兴趣的 age_group,然后查看 row_num:它就是该 age_group 返回的行数。请注意,某些 age_group 的 drnk 列不会达到 10;这些年龄组会返回全部 15 行——前提是随机选取的域名让每个域名都至少生成了一行,这种情况虽然很可能出现,但并不能保证。
顺便附上我的查询。我为 age_groups 创建了一张表,它也包含在演示中。
-- Compare rank(), dense_rank() and row_number() for email domains inside each
-- age group, ordered by the number of users per domain (highest first).
-- Output columns: domain, ag_name, dom_cnt, rnk, drnk, row_num.
with agdom as (
    -- email domain (text after '@') and age-group name for every user;
    -- assumes age_groups.ag_range is an int4range -- confirm against its definition
    select
        substr(ut.email, position('@' in ut.email) + 1) as domain,
        ag.ag_name
    from age_groups ag
    inner join user_table ut
        on (extract(year from age(ut.dob)))::int4 <@ ag.ag_range
),
domain_counts as (
    -- number of users for each (age group, domain) pair
    select
        domain,
        ag_name,
        count(*) as dom_cnt
    from agdom
    group by ag_name, domain
),
dom_rank as (
    -- the three numbering schemes side by side, all restarted per age group
    select
        domain,
        ag_name,
        dom_cnt,
        rank() over (partition by ag_name order by dom_cnt desc) as rnk,
        dense_rank() over (partition by ag_name order by dom_cnt desc) as drnk,
        row_number() over (partition by ag_name order by dom_cnt desc) as row_num
    from domain_counts
)
select
    domain,
    ag_name,
    dom_cnt,
    rnk,
    drnk,
    row_num
from dom_rank
-- no top-10 filter on purpose: every row is shown so the three columns can be
-- compared; filter on rnk, drnk or row_num <= 10 to keep only the top entries
order by ag_name, row_num;