为多个列取最常出现的值(众数)
Taking the most commonly occurring (modal) value for several columns
我正在为同一个人清理随着时间的推移记录不佳且不一致的社会人口统计信息的记录。我想为每个人取最常出现的值(众数)。
一种方法是按 id 进行分区,然后计算每个值出现的次数,保留每个 id 的最高计数:
/*
 * Per-id mode of ethnic_group, written to dbo.[table].
 * Output columns: id, ethnic_mode (most frequent value), ct (its count).
 */

-- "table" is a reserved keyword, so the name must be bracketed.
-- Drop only when the table exists so the script also works on a first run.
IF OBJECT_ID(N'dbo.[table]', N'U') IS NOT NULL
    DROP TABLE dbo.[table];

SELECT
    ranked.[id],
    ranked.[ethnic_group] AS [ethnic_mode],
    ranked.ct
INTO dbo.[table]
FROM (
    SELECT
        [id],
        [ethnic_group],
        -- COUNT(column) ignores NULLs, so a missing value never outranks a recorded one.
        COUNT([ethnic_group]) AS ct,
        -- The secondary sort key makes the winner deterministic when counts tie.
        ROW_NUMBER() OVER (
            PARTITION BY [id]
            ORDER BY COUNT([ethnic_group]) DESC, [ethnic_group]
        ) AS rn
    FROM dbo.mytable
    GROUP BY [id], [ethnic_group]
) AS ranked
-- rn = 1 already yields exactly one row per id, so DISTINCT is unnecessary;
-- an ORDER BY here would not define the stored row order, so it is omitted.
WHERE ranked.rn = 1;
但我想对几个变量(种族群体、收入群体等)执行此操作。
如何在一条语句中 select 出多个变量的众数,并插入到同一个 table 中(而不是为每个变量创建单独的 table)?
下面的 table 举例说明了我想做的事情:
/*
 * Illustration data: several records per person (id), together with the
 * desired per-person mode for each attribute (ethnic_mode, income_mode).
 */

-- Drop only when the table exists so the script also works on a first run.
IF OBJECT_ID(N'mytable', N'U') IS NOT NULL
    DROP TABLE mytable;

CREATE TABLE mytable (
    -- id repeats across records, so it cannot be a PRIMARY KEY.
    id           VARCHAR(2)  NOT NULL,
    ethnic_group VARCHAR(12) NOT NULL,
    ethnic_mode  VARCHAR(11) NOT NULL,
    income       VARCHAR(6)  NOT NULL,
    income_mode  VARCHAR(11) NOT NULL
);

-- The CSV header row ('id', 'ethnic_group', ...) is not data and is not loaded.
INSERT INTO mytable (id, ethnic_group, ethnic_mode, income, income_mode)
VALUES
    ('1', 'white', 'white', 'middle', 'middle'),
    ('1', 'white', 'white', 'middle', 'middle'),
    ('1', 'mixed', 'white', 'high',   'middle'),
    ('2', 'asian', 'asian', 'middle', 'middle'),
    ('2', 'mixed', 'asian', 'middle', 'middle'),
    ('2', 'asian', 'asian', 'middle', 'middle');
我会使用子查询在 1 个插入语句中完成此操作。
这是一个基于您插图中 table 结构的示例:
/*
 * Collapse repeated records into one row per id, taking the most frequent
 * non-NULL value (the mode) of each attribute in a single INSERT statement.
 */

/* Source: raw records; an id may repeat and any attribute may be NULL. */
DECLARE @source_table TABLE (
    id           VARCHAR(2)  NOT NULL,
    ethnic_group VARCHAR(12) NULL,
    ethnic_mode  VARCHAR(11) NULL,
    income       VARCHAR(6)  NULL,
    income_mode  VARCHAR(11) NULL
);

/* Destination: one row per id, enforced by the primary key. */
DECLARE @destination_table TABLE (
    id           VARCHAR(2)  NOT NULL PRIMARY KEY,
    ethnic_group VARCHAR(12) NULL,
    income       VARCHAR(6)  NULL
);

/* Sample data; id 3 has more NULLs than recorded values. */
INSERT INTO @source_table (id, ethnic_group, ethnic_mode, income, income_mode)
VALUES
    ('1', 'white', 'white', 'middle', 'middle'),
    ('1', 'white', 'white', 'middle', 'middle'),
    ('1', 'mixed', 'white', 'high',   'middle'),
    ('2', 'asian', 'asian', 'middle', 'middle'),
    ('2', 'mixed', 'asian', 'middle', 'middle'),
    ('2', 'asian', 'asian', 'middle', 'middle'),
    ('3', 'asian', NULL,    NULL,     NULL),
    ('3', NULL,    NULL,    'middle', NULL),
    ('3', NULL,    NULL,    NULL,     NULL);

/*
 * One correlated subquery per attribute picks that attribute's mode.
 * NULLs are filtered out so a missing value cannot outvote a recorded one;
 * if every value for an id is NULL, the subquery returns no row and the
 * column is NULL. The second sort key makes ties deterministic.
 */
INSERT INTO @destination_table (id, ethnic_group, income)
SELECT
    st.id,
    (
        SELECT TOP (1) sub_st.ethnic_group
        FROM @source_table AS sub_st
        WHERE sub_st.id = st.id
          AND sub_st.ethnic_group IS NOT NULL
        GROUP BY sub_st.ethnic_group
        ORDER BY COUNT(*) DESC, sub_st.ethnic_group
    ) AS ethnic_group,
    (
        SELECT TOP (1) sub_st.income
        FROM @source_table AS sub_st
        WHERE sub_st.id = st.id
          AND sub_st.income IS NOT NULL
        GROUP BY sub_st.income
        ORDER BY COUNT(*) DESC, sub_st.income
    ) AS income
FROM @source_table AS st
GROUP BY st.id;

/* View the destination: one row per id. */
SELECT
    id,
    ethnic_group,
    income
FROM @destination_table
ORDER BY id;
我正在为同一个人清理随着时间的推移记录不佳且不一致的社会人口统计信息的记录。我想为每个人取最常出现的值(众数)。
一种方法是按 id 进行分区,然后计算每个值出现的次数,保留每个 id 的最高计数:
/*
 * Per-id mode of ethnic_group, written to dbo.[table].
 * Output columns: id, ethnic_mode (most frequent value), ct (its count).
 */

-- "table" is a reserved keyword, so the name must be bracketed.
-- Drop only when the table exists so the script also works on a first run.
IF OBJECT_ID(N'dbo.[table]', N'U') IS NOT NULL
    DROP TABLE dbo.[table];

SELECT
    ranked.[id],
    ranked.[ethnic_group] AS [ethnic_mode],
    ranked.ct
INTO dbo.[table]
FROM (
    SELECT
        [id],
        [ethnic_group],
        -- COUNT(column) ignores NULLs, so a missing value never outranks a recorded one.
        COUNT([ethnic_group]) AS ct,
        -- The secondary sort key makes the winner deterministic when counts tie.
        ROW_NUMBER() OVER (
            PARTITION BY [id]
            ORDER BY COUNT([ethnic_group]) DESC, [ethnic_group]
        ) AS rn
    FROM dbo.mytable
    GROUP BY [id], [ethnic_group]
) AS ranked
-- rn = 1 already yields exactly one row per id, so DISTINCT is unnecessary;
-- an ORDER BY here would not define the stored row order, so it is omitted.
WHERE ranked.rn = 1;
但我想对几个变量(种族群体、收入群体等)执行此操作。
如何在一条语句中 select 出多个变量的众数,并插入到同一个 table 中(而不是为每个变量创建单独的 table)?
下面的 table 举例说明了我想做的事情:
/*
 * Illustration data: several records per person (id), together with the
 * desired per-person mode for each attribute (ethnic_mode, income_mode).
 */

-- Drop only when the table exists so the script also works on a first run.
IF OBJECT_ID(N'mytable', N'U') IS NOT NULL
    DROP TABLE mytable;

CREATE TABLE mytable (
    -- id repeats across records, so it cannot be a PRIMARY KEY.
    id           VARCHAR(2)  NOT NULL,
    ethnic_group VARCHAR(12) NOT NULL,
    ethnic_mode  VARCHAR(11) NOT NULL,
    income       VARCHAR(6)  NOT NULL,
    income_mode  VARCHAR(11) NOT NULL
);

-- The CSV header row ('id', 'ethnic_group', ...) is not data and is not loaded.
INSERT INTO mytable (id, ethnic_group, ethnic_mode, income, income_mode)
VALUES
    ('1', 'white', 'white', 'middle', 'middle'),
    ('1', 'white', 'white', 'middle', 'middle'),
    ('1', 'mixed', 'white', 'high',   'middle'),
    ('2', 'asian', 'asian', 'middle', 'middle'),
    ('2', 'mixed', 'asian', 'middle', 'middle'),
    ('2', 'asian', 'asian', 'middle', 'middle');
我会使用子查询在 1 个插入语句中完成此操作。
这是一个基于您插图中 table 结构的示例:
/*
 * Collapse repeated records into one row per id, taking the most frequent
 * non-NULL value (the mode) of each attribute in a single INSERT statement.
 */

/* Source: raw records; an id may repeat and any attribute may be NULL. */
DECLARE @source_table TABLE (
    id           VARCHAR(2)  NOT NULL,
    ethnic_group VARCHAR(12) NULL,
    ethnic_mode  VARCHAR(11) NULL,
    income       VARCHAR(6)  NULL,
    income_mode  VARCHAR(11) NULL
);

/* Destination: one row per id, enforced by the primary key. */
DECLARE @destination_table TABLE (
    id           VARCHAR(2)  NOT NULL PRIMARY KEY,
    ethnic_group VARCHAR(12) NULL,
    income       VARCHAR(6)  NULL
);

/* Sample data; id 3 has more NULLs than recorded values. */
INSERT INTO @source_table (id, ethnic_group, ethnic_mode, income, income_mode)
VALUES
    ('1', 'white', 'white', 'middle', 'middle'),
    ('1', 'white', 'white', 'middle', 'middle'),
    ('1', 'mixed', 'white', 'high',   'middle'),
    ('2', 'asian', 'asian', 'middle', 'middle'),
    ('2', 'mixed', 'asian', 'middle', 'middle'),
    ('2', 'asian', 'asian', 'middle', 'middle'),
    ('3', 'asian', NULL,    NULL,     NULL),
    ('3', NULL,    NULL,    'middle', NULL),
    ('3', NULL,    NULL,    NULL,     NULL);

/*
 * One correlated subquery per attribute picks that attribute's mode.
 * NULLs are filtered out so a missing value cannot outvote a recorded one;
 * if every value for an id is NULL, the subquery returns no row and the
 * column is NULL. The second sort key makes ties deterministic.
 */
INSERT INTO @destination_table (id, ethnic_group, income)
SELECT
    st.id,
    (
        SELECT TOP (1) sub_st.ethnic_group
        FROM @source_table AS sub_st
        WHERE sub_st.id = st.id
          AND sub_st.ethnic_group IS NOT NULL
        GROUP BY sub_st.ethnic_group
        ORDER BY COUNT(*) DESC, sub_st.ethnic_group
    ) AS ethnic_group,
    (
        SELECT TOP (1) sub_st.income
        FROM @source_table AS sub_st
        WHERE sub_st.id = st.id
          AND sub_st.income IS NOT NULL
        GROUP BY sub_st.income
        ORDER BY COUNT(*) DESC, sub_st.income
    ) AS income
FROM @source_table AS st
GROUP BY st.id;

/* View the destination: one row per id. */
SELECT
    id,
    ethnic_group,
    income
FROM @destination_table
ORDER BY id;