如何在每一列中找到最常见的值
How to find the most common value in each column
我有一个 table 看起来像这样:
category name1 name2 name3 name4 name5
1 John Sam John Katy Cat
1 John Ivan Bob Andrew Tom
1 Sam Ivan George Bob Tom
2 Jack Siri Elsa Noah Anna
2 Jack Bob Tomas Noah Tom
我需要做的是在每个类别的每个列中找到最常见的值。也就是说,我需要以下结果:
category name1 name2 name3 name4 name5
1 John Ivan John Katy Tom
2 Jack Siri Elsa Noah Anna
如果有多个值具有相同的频率,则可以选择其中的任何一个。
到目前为止,我只用下面这个脚本对单独一列做到了这一点:
-- Asker's attempt: counts each (category, name1) pair and keeps only the
-- single most frequent pair across the whole table (not one per category).
-- COUNT(name1) counts non-NULL name1 values only.
SELECT TOP (1)
    category,
    name1,
    COUNT(name1) AS freq
FROM data
GROUP BY
    category,
    name1
ORDER BY
    COUNT(name1) DESC
但是,在 SQL Server 中如何对多个列执行同样的操作?
首先创建一个 CTE,使用 COUNT() 窗口函数返回每个名称在其类别中出现的次数,然后使用 FIRST_VALUE() 窗口函数为每一列取出出现次数最多的名称:
-- Step 1: tag every row with how often each of its five names occurs
-- inside that row's category.
WITH name_counts AS (
    SELECT
        category,
        name1,
        name2,
        name3,
        name4,
        name5,
        COUNT(*) OVER (PARTITION BY category, name1) AS count1,
        COUNT(*) OVER (PARTITION BY category, name2) AS count2,
        COUNT(*) OVER (PARTITION BY category, name3) AS count3,
        COUNT(*) OVER (PARTITION BY category, name4) AS count4,
        COUNT(*) OVER (PARTITION BY category, name5) AS count5
    FROM tablename
)
-- Step 2: per category and per column, take the name sitting first when the
-- rows are ordered by that column's count, highest first. Names with equal
-- counts are not ordered any further, so a tie is resolved arbitrarily.
-- DISTINCT collapses the repeated per-row results to one row per category.
SELECT DISTINCT
    category,
    FIRST_VALUE(name1) OVER (PARTITION BY category ORDER BY count1 DESC) AS name1,
    FIRST_VALUE(name2) OVER (PARTITION BY category ORDER BY count2 DESC) AS name2,
    FIRST_VALUE(name3) OVER (PARTITION BY category ORDER BY count3 DESC) AS name3,
    FIRST_VALUE(name4) OVER (PARTITION BY category ORDER BY count4 DESC) AS name4,
    FIRST_VALUE(name5) OVER (PARTITION BY category ORDER BY count5 DESC) AS name5
FROM name_counts
参见demo。
一个选项,有很多重复,但适合您当前的结构...
(不过在处理出现频率相同的名称时,它仍然具有同样的不确定性/任意性行为)
-- Most common name1 / name2 per category, computed directly on the wide table.
WITH
    -- Frequency of each row's name1 and name2 within the row's category.
    name_freq AS (
        SELECT
            category,
            name1,
            name2,
            COUNT(*) OVER (PARTITION BY category, name1) AS name1_freq,
            COUNT(*) OVER (PARTITION BY category, name2) AS name2_freq
        FROM yourTable
    ),
    -- Independent 1..n numbering per column, highest frequency first.
    -- Equal frequencies are numbered in arbitrary order.
    name_rank AS (
        SELECT
            category,
            name1,
            name2,
            ROW_NUMBER() OVER (PARTITION BY category ORDER BY name1_freq DESC) AS name1_rank,
            ROW_NUMBER() OVER (PARTITION BY category ORDER BY name2_freq DESC) AS name2_rank
        FROM name_freq
    )
-- Within a category exactly one row carries rank 1 for each column, so MAX()
-- over the CASE just picks that single non-NULL name while collapsing the
-- rows to one per category.
SELECT
    category,
    MAX(CASE WHEN name1_rank = 1 THEN name1 END) AS name1_most_common,
    MAX(CASE WHEN name2_rank = 1 THEN name2 END) AS name2_most_common
FROM name_rank
GROUP BY category
如您所见,其中有很多重复。这正是 SQL 以规范化数据结构为基础的原因。重复之多,以至于先把结构规范化、再反规范化回来,也可能是一个可行的选择......
-- Normalise the wide table to (category, col, name) rows, rank the names per
-- (category, col) by frequency, then pivot the winners back into columns.
WITH
    -- One row per source cell; col records which name column it came from.
    long_form AS (
        SELECT category, 1 AS col, name1 AS name FROM yourTable
        UNION ALL
        SELECT category, 2 AS col, name2 AS name FROM yourTable
    ),
    -- How many times each name occurs in each (category, col).
    name_freq AS (
        SELECT
            category,
            col,
            name,
            COUNT(*) AS freq
        FROM long_form
        GROUP BY category, col, name
    ),
    -- Number the names of each (category, col), most frequent first.
    -- Equal frequencies are numbered in arbitrary order.
    name_rank AS (
        SELECT
            category,
            col,
            name,
            ROW_NUMBER() OVER (PARTITION BY category, col ORDER BY freq DESC) AS rn
        FROM name_freq
    )
-- After the rn = 1 filter each (category, col) has a single row, so MAX()
-- simply moves that name into its output column.
SELECT
    category,
    MAX(CASE WHEN col = 1 THEN name END) AS name1_most_common,
    MAX(CASE WHEN col = 2 THEN name END) AS name2_most_common
FROM name_rank
WHERE rn = 1
GROUP BY category
如果您不介意行中的结果,您可以逆透视,这样会更简单:
-- Most common name per (category, source column), returned one row per pair.
-- `which` is the index of the source column (1 = name1 ... 5 = name5).
-- Names with equal frequency are numbered arbitrarily by row_number(), so
-- any one of them may be returned.
select category, which, name
from (select t.category, v.which, v.name, count(*) as cnt,
             row_number() over (partition by t.category, v.which order by count(*) desc) as seqnum
      from t cross apply
           -- Unpivot the five name columns into (which, name) rows.
           -- The fifth pair must read name5 (it previously repeated name4).
           (values (1, t.name1), (2, t.name2), (3, t.name3), (4, t.name4), (5, t.name5)
           ) v(which, name)
      group by t.category, v.which, v.name
     ) cwn
where seqnum = 1;
如果你希望结果重新按列呈现,可以再做一次透视:
-- Most common name per category for each of the five name columns, returned
-- in the original wide layout (one row per category).
with cwn as (
      -- Unpivot to (category, which, name), count each name, and number the
      -- names of every (category, which) from most to least frequent.
      -- Names with equal frequency are numbered arbitrarily.
      select t.category, v.which, v.name, count(*) as cnt,
             row_number() over (partition by t.category, v.which order by count(*) desc) as seqnum
      from t cross apply
           -- The fifth pair must read name5 (it previously repeated name4).
           (values (1, t.name1), (2, t.name2), (3, t.name3), (4, t.name4), (5, t.name5)
           ) v(which, name)
      group by t.category, v.which, v.name
     )
-- Only the top name of each (category, which) survives the seqnum filter, so
-- max() just moves that single value into its output column.
select category,
       max(case when which = 1 then name end) as name1,
       max(case when which = 2 then name end) as name2,
       max(case when which = 3 then name end) as name3,
       max(case when which = 4 then name end) as name4,
       max(case when which = 5 then name end) as name5
from cwn
where seqnum = 1
group by category
我有一个 table 看起来像这样:
category name1 name2 name3 name4 name5
1 John Sam John Katy Cat
1 John Ivan Bob Andrew Tom
1 Sam Ivan George Bob Tom
2 Jack Siri Elsa Noah Anna
2 Jack Bob Tomas Noah Tom
我需要做的是在每个类别的每个列中找到最常见的值。也就是说,我需要以下结果:
category name1 name2 name3 name4 name5
1 John Ivan John Katy Tom
2 Jack Siri Elsa Noah Anna
如果有多个值具有相同的频率,则可以选择其中的任何一个。
到目前为止,我只用下面这个脚本对单独一列做到了这一点:
-- Asker's attempt: counts each (category, name1) pair and keeps only the
-- single most frequent pair across the whole table (not one per category).
-- COUNT(name1) counts non-NULL name1 values only.
SELECT TOP (1)
    category,
    name1,
    COUNT(name1) AS freq
FROM data
GROUP BY
    category,
    name1
ORDER BY
    COUNT(name1) DESC
但是,在 SQL Server 中如何对多个列执行同样的操作?
首先创建一个 CTE,使用 COUNT() 窗口函数返回每个名称在其类别中出现的次数,然后使用 FIRST_VALUE() 窗口函数为每一列取出出现次数最多的名称:
-- Step 1: tag every row with how often each of its five names occurs
-- inside that row's category.
WITH name_counts AS (
    SELECT
        category,
        name1,
        name2,
        name3,
        name4,
        name5,
        COUNT(*) OVER (PARTITION BY category, name1) AS count1,
        COUNT(*) OVER (PARTITION BY category, name2) AS count2,
        COUNT(*) OVER (PARTITION BY category, name3) AS count3,
        COUNT(*) OVER (PARTITION BY category, name4) AS count4,
        COUNT(*) OVER (PARTITION BY category, name5) AS count5
    FROM tablename
)
-- Step 2: per category and per column, take the name sitting first when the
-- rows are ordered by that column's count, highest first. Names with equal
-- counts are not ordered any further, so a tie is resolved arbitrarily.
-- DISTINCT collapses the repeated per-row results to one row per category.
SELECT DISTINCT
    category,
    FIRST_VALUE(name1) OVER (PARTITION BY category ORDER BY count1 DESC) AS name1,
    FIRST_VALUE(name2) OVER (PARTITION BY category ORDER BY count2 DESC) AS name2,
    FIRST_VALUE(name3) OVER (PARTITION BY category ORDER BY count3 DESC) AS name3,
    FIRST_VALUE(name4) OVER (PARTITION BY category ORDER BY count4 DESC) AS name4,
    FIRST_VALUE(name5) OVER (PARTITION BY category ORDER BY count5 DESC) AS name5
FROM name_counts
参见demo。
一个选项,有很多重复,但适合您当前的结构...
(不过在处理出现频率相同的名称时,它仍然具有同样的不确定性/任意性行为)
-- Most common name1 / name2 per category, computed directly on the wide table.
WITH
    -- Frequency of each row's name1 and name2 within the row's category.
    name_freq AS (
        SELECT
            category,
            name1,
            name2,
            COUNT(*) OVER (PARTITION BY category, name1) AS name1_freq,
            COUNT(*) OVER (PARTITION BY category, name2) AS name2_freq
        FROM yourTable
    ),
    -- Independent 1..n numbering per column, highest frequency first.
    -- Equal frequencies are numbered in arbitrary order.
    name_rank AS (
        SELECT
            category,
            name1,
            name2,
            ROW_NUMBER() OVER (PARTITION BY category ORDER BY name1_freq DESC) AS name1_rank,
            ROW_NUMBER() OVER (PARTITION BY category ORDER BY name2_freq DESC) AS name2_rank
        FROM name_freq
    )
-- Within a category exactly one row carries rank 1 for each column, so MAX()
-- over the CASE just picks that single non-NULL name while collapsing the
-- rows to one per category.
SELECT
    category,
    MAX(CASE WHEN name1_rank = 1 THEN name1 END) AS name1_most_common,
    MAX(CASE WHEN name2_rank = 1 THEN name2 END) AS name2_most_common
FROM name_rank
GROUP BY category
如您所见,其中有很多重复。这正是 SQL 以规范化数据结构为基础的原因。重复之多,以至于先把结构规范化、再反规范化回来,也可能是一个可行的选择......
-- Normalise the wide table to (category, col, name) rows, rank the names per
-- (category, col) by frequency, then pivot the winners back into columns.
WITH
    -- One row per source cell; col records which name column it came from.
    long_form AS (
        SELECT category, 1 AS col, name1 AS name FROM yourTable
        UNION ALL
        SELECT category, 2 AS col, name2 AS name FROM yourTable
    ),
    -- How many times each name occurs in each (category, col).
    name_freq AS (
        SELECT
            category,
            col,
            name,
            COUNT(*) AS freq
        FROM long_form
        GROUP BY category, col, name
    ),
    -- Number the names of each (category, col), most frequent first.
    -- Equal frequencies are numbered in arbitrary order.
    name_rank AS (
        SELECT
            category,
            col,
            name,
            ROW_NUMBER() OVER (PARTITION BY category, col ORDER BY freq DESC) AS rn
        FROM name_freq
    )
-- After the rn = 1 filter each (category, col) has a single row, so MAX()
-- simply moves that name into its output column.
SELECT
    category,
    MAX(CASE WHEN col = 1 THEN name END) AS name1_most_common,
    MAX(CASE WHEN col = 2 THEN name END) AS name2_most_common
FROM name_rank
WHERE rn = 1
GROUP BY category
如果您不介意行中的结果,您可以逆透视,这样会更简单:
-- Most common name per (category, source column), returned one row per pair.
-- `which` is the index of the source column (1 = name1 ... 5 = name5).
-- Names with equal frequency are numbered arbitrarily by row_number(), so
-- any one of them may be returned.
select category, which, name
from (select t.category, v.which, v.name, count(*) as cnt,
             row_number() over (partition by t.category, v.which order by count(*) desc) as seqnum
      from t cross apply
           -- Unpivot the five name columns into (which, name) rows.
           -- The fifth pair must read name5 (it previously repeated name4).
           (values (1, t.name1), (2, t.name2), (3, t.name3), (4, t.name4), (5, t.name5)
           ) v(which, name)
      group by t.category, v.which, v.name
     ) cwn
where seqnum = 1;
如果你希望结果重新按列呈现,可以再做一次透视:
-- Most common name per category for each of the five name columns, returned
-- in the original wide layout (one row per category).
with cwn as (
      -- Unpivot to (category, which, name), count each name, and number the
      -- names of every (category, which) from most to least frequent.
      -- Names with equal frequency are numbered arbitrarily.
      select t.category, v.which, v.name, count(*) as cnt,
             row_number() over (partition by t.category, v.which order by count(*) desc) as seqnum
      from t cross apply
           -- The fifth pair must read name5 (it previously repeated name4).
           (values (1, t.name1), (2, t.name2), (3, t.name3), (4, t.name4), (5, t.name5)
           ) v(which, name)
      group by t.category, v.which, v.name
     )
-- Only the top name of each (category, which) survives the seqnum filter, so
-- max() just moves that single value into its output column.
select category,
       max(case when which = 1 then name end) as name1,
       max(case when which = 2 then name end) as name2,
       max(case when which = 3 then name end) as name3,
       max(case when which = 4 then name end) as name4,
       max(case when which = 5 then name end) as name5
from cwn
where seqnum = 1
group by category