在 Impala SQL 中按年龄分桶并按列分组时,替代 union 和 where 语句的方法
Alternative to union and where statement to bucket age and group by a column SQL Impala
给定一个如下所示的表:
+----+-----------+------------------+
| id | code | age |
+----+-----------+------------------+
| 1 | 315.32000 | 2.18430371791803 |
| 1 | 315.32000 | 3.18430371791803 |
| 1 | 800.00000 | 2.18430371791803 |
| 2 | 315.32000 | 5.64822705794013 |
| 3 | 800.00000 | 5.68655778752176 |
| 3 | 120.12000 | 5.70572315231258 |
| 4 | 315.32000 | 5.72488851710339 |
| 4 | 315.32000 | 5.74405388189421 |
| 5 | 120.12000 | 5.7604813374292 |
| 6 | 315.32000 | 5.77993740687426 |
| .. | ... | ... |
+----+-----------+------------------+
我正在使用:
-- Asker's current approach: one branch per age range, glued together with UNION.
-- Each branch counts rows per code inside its range, keeps the single highest
-- count, and tags it with a literal age_range label.
-- NOTE(review): each branch carries its own ORDER BY ... LIMIT 1 directly before
-- UNION without parentheses; many engines require parenthesized branches for
-- this to parse as intended -- confirm for Impala.
-- NOTE(review): "table" looks like a placeholder name -- confirm the real table name.
-- Branch 1: rows with 0 <= age < 10.
SELECT code, COUNT(*) AS code_frequency,'0-10' AS age_range
FROM table
WHERE age >= 0 AND age < 10
GROUP BY code
ORDER BY code_frequency DESC LIMIT 1
UNION
-- Branch 2: rows with 10 <= age < 20.
SELECT code, COUNT(*) AS code_frequency,'10-20' AS age_range
FROM table
WHERE age >= 10 AND age < 20
GROUP BY code
ORDER BY code_frequency DESC LIMIT 1
UNION
-- Further age-range branches are elided in the original post.
...
ORDER BY age_range
我用略有不同的年龄范围和逻辑多次重复此操作,以输出一个 table,显示每个年龄组的最频繁代码及其频率。
输出:
+-----------+-----------+-----------+
| code | frequency | age_range |
+-----------+-----------+-----------+
| 315.32000 | 99832 | 0-10 |
| 800.00000 | 45223 | 10-20 |
| ... | ... | ... |
+-----------+-----------+-----------+
是否有更有效的方法来实现相同的输出,而无需重复相同的代码块和使用 union?
干杯
您可以使用带有 case 表达式的聚合来定义组:
-- Count rows per (code, age bucket), listed in bucket order.
with bucketed as (
    -- Label every row with its age bucket; an age that matches no branch
    -- gets a NULL label because the CASE has no ELSE.
    select
        code,
        case
            when age >= 0 and age < 10 then '0-10'
            when age >= 10 and age < 20 then '10-20'
        end as age_range
    from mytable
)
select
    code,
    count(*) as frequency,
    age_range
from bucketed
group by code, age_range
order by age_range
如果您想要每个年龄段最频繁的代码,那么您可以在上述查询的基础上使用 row_number():
-- Most frequent code per age bucket.
-- Innermost query: count rows per (code, age bucket). Ages that match no WHEN
-- branch fall into a NULL bucket because the CASE has no ELSE.
-- Middle query: rank the codes inside each bucket by descending count.
-- Outer query: keep only the top-ranked code of each bucket.
select code, frequency, age_range
from (
    select
        c.code,
        c.frequency,
        c.age_range,
        -- code is a tie-breaker so that equal counts resolve deterministically
        row_number() over(partition by c.age_range order by c.frequency desc, c.code) as rn
    from (
        select
            code,
            count(*) as frequency,
            case
                when age >= 0 and age < 10 then '0-10'
                when age >= 10 and age < 20 then '10-20'
            end as age_range
        from mytable
        group by code, age_range
    ) c
) r
where rn = 1
或者更好:
-- Most frequent code per age bucket, with aggregation and ranking done in the
-- same query level. Ages that match no WHEN branch fall into a NULL bucket.
select code, frequency, age_range
from (
    select
        code,
        count(*) as frequency,
        case
            when age >= 0 and age < 10 then '0-10'
            when age >= 10 and age < 20 then '10-20'
        end as age_range,
        -- Partition by the bucket expression itself. An aggregate such as
        -- min(age) is evaluated per (code, bucket) group, so it would not
        -- place all codes of one bucket in the same partition.
        -- code is a tie-breaker so that equal counts resolve deterministically.
        row_number() over(
            partition by
                case
                    when age >= 0 and age < 10 then '0-10'
                    when age >= 10 and age < 20 then '10-20'
                end
            order by count(*) desc, code
        ) as rn
    from mytable
    group by code, age_range
) t
where rn = 1
给定一个如下所示的表:
+----+-----------+------------------+
| id | code | age |
+----+-----------+------------------+
| 1 | 315.32000 | 2.18430371791803 |
| 1 | 315.32000 | 3.18430371791803 |
| 1 | 800.00000 | 2.18430371791803 |
| 2 | 315.32000 | 5.64822705794013 |
| 3 | 800.00000 | 5.68655778752176 |
| 3 | 120.12000 | 5.70572315231258 |
| 4 | 315.32000 | 5.72488851710339 |
| 4 | 315.32000 | 5.74405388189421 |
| 5 | 120.12000 | 5.7604813374292 |
| 6 | 315.32000 | 5.77993740687426 |
| .. | ... | ... |
+----+-----------+------------------+
我正在使用:
-- Asker's current approach: one branch per age range, glued together with UNION.
-- Each branch counts rows per code inside its range, keeps the single highest
-- count, and tags it with a literal age_range label.
-- NOTE(review): each branch carries its own ORDER BY ... LIMIT 1 directly before
-- UNION without parentheses; many engines require parenthesized branches for
-- this to parse as intended -- confirm for Impala.
-- NOTE(review): "table" looks like a placeholder name -- confirm the real table name.
-- Branch 1: rows with 0 <= age < 10.
SELECT code, COUNT(*) AS code_frequency,'0-10' AS age_range
FROM table
WHERE age >= 0 AND age < 10
GROUP BY code
ORDER BY code_frequency DESC LIMIT 1
UNION
-- Branch 2: rows with 10 <= age < 20.
SELECT code, COUNT(*) AS code_frequency,'10-20' AS age_range
FROM table
WHERE age >= 10 AND age < 20
GROUP BY code
ORDER BY code_frequency DESC LIMIT 1
UNION
-- Further age-range branches are elided in the original post.
...
ORDER BY age_range
我用略有不同的年龄范围和逻辑多次重复此操作,以输出一个 table,显示每个年龄组的最频繁代码及其频率。
输出:
+-----------+-----------+-----------+
| code | frequency | age_range |
+-----------+-----------+-----------+
| 315.32000 | 99832 | 0-10 |
| 800.00000 | 45223 | 10-20 |
| ... | ... | ... |
+-----------+-----------+-----------+
是否有更有效的方法来实现相同的输出,而无需重复相同的代码块和使用 union?
干杯
您可以使用带有 case 表达式的聚合来定义组:
-- Count rows per (code, age bucket), listed in bucket order.
with bucketed as (
    -- Label every row with its age bucket; an age that matches no branch
    -- gets a NULL label because the CASE has no ELSE.
    select
        code,
        case
            when age >= 0 and age < 10 then '0-10'
            when age >= 10 and age < 20 then '10-20'
        end as age_range
    from mytable
)
select
    code,
    count(*) as frequency,
    age_range
from bucketed
group by code, age_range
order by age_range
如果您想要每个年龄段最频繁的代码,那么您可以在上述查询的基础上使用 row_number():
-- Most frequent code per age bucket.
-- Innermost query: count rows per (code, age bucket). Ages that match no WHEN
-- branch fall into a NULL bucket because the CASE has no ELSE.
-- Middle query: rank the codes inside each bucket by descending count.
-- Outer query: keep only the top-ranked code of each bucket.
select code, frequency, age_range
from (
    select
        c.code,
        c.frequency,
        c.age_range,
        -- code is a tie-breaker so that equal counts resolve deterministically
        row_number() over(partition by c.age_range order by c.frequency desc, c.code) as rn
    from (
        select
            code,
            count(*) as frequency,
            case
                when age >= 0 and age < 10 then '0-10'
                when age >= 10 and age < 20 then '10-20'
            end as age_range
        from mytable
        group by code, age_range
    ) c
) r
where rn = 1
或者更好:
-- Most frequent code per age bucket, with aggregation and ranking done in the
-- same query level. Ages that match no WHEN branch fall into a NULL bucket.
select code, frequency, age_range
from (
    select
        code,
        count(*) as frequency,
        case
            when age >= 0 and age < 10 then '0-10'
            when age >= 10 and age < 20 then '10-20'
        end as age_range,
        -- Partition by the bucket expression itself. An aggregate such as
        -- min(age) is evaluated per (code, bucket) group, so it would not
        -- place all codes of one bucket in the same partition.
        -- code is a tie-breaker so that equal counts resolve deterministically.
        row_number() over(
            partition by
                case
                    when age >= 0 and age < 10 then '0-10'
                    when age >= 10 and age < 20 then '10-20'
                end
            order by count(*) desc, code
        ) as rn
    from mytable
    group by code, age_range
) t
where rn = 1