删除空值较少的重复项
Remove duplicates with less null values
我有一个员工表(employeemaster),其中包含大约 25 列。现在表中有很多重复项,我想尝试删除其中一部分。
首先,我想通过查找在名字、姓氏、员工编号、公司编号和状态中具有相同值的多条记录来查找重复项。
-- List every (firstname, lastname, employeenumber, companynumber, statusflag)
-- combination that occurs on more than one row of employeemaster.
SELECT
    em.firstname,
    em.lastname,
    em.employeenumber,
    em.companynumber,
    em.statusflag
FROM employeemaster AS em
GROUP BY
    em.firstname,
    em.lastname,
    em.employeenumber,
    em.companynumber,
    em.statusflag
HAVING COUNT(*) > 1;
这样可以找出重复项,但我的目标是在每组重复记录中找到并保留最好的一条记录,并删除其余记录。“最好的一条记录”(best single record)是指在所有其他列中 NULL 值最少的那条记录。我该怎么做?
我正在使用 Microsoft SQL Server 2012 MGMT Studio。
示例:
红色:删除
绿色:保持
注意:table 中的列比此 table 显示的要多得多。
试试这个。
-- For each duplicate group (same firstname, lastname, employeenumber,
-- companynumber, statusflag) return the row ranked first when ordering by the
-- number of NULLs in the non-key columns, i.e. the most complete record.
--
-- A single pass over employeemaster is enough: joining the table back to
-- itself on the key columns only multiplies the rows of every group and,
-- because NULL = NULL is not true, drops groups whose key columns hold NULL.
;WITH ranked AS (
    SELECT
        em.firstname,
        em.lastname,
        em.employeenumber,
        em.companynumber,
        em.statusflag,
        ROW_NUMBER() OVER (
            PARTITION BY
                em.firstname,
                em.lastname,
                em.employeenumber,
                em.companynumber,
                em.statusflag
            -- NULL count of the row: fewer NULLs sorts first.
            -- Add one CASE term per remaining nullable column.
            -- Rows with the same NULL count are ordered arbitrarily.
            ORDER BY
                CASE WHEN em.username IS NULL THEN 1 ELSE 0 END
              + CASE WHEN em.branch   IS NULL THEN 1 ELSE 0 END
        ) AS rn
    FROM employeemaster AS em
)
SELECT
    firstname,
    lastname,
    employeenumber,
    companynumber,
    statusflag,
    rn
FROM ranked
WHERE rn = 1;
您可以使用 sys.columns 表获取列的列表并构建动态查询。此查询会为每条记录返回一个 KeepThese 值;在每组重复记录中,KeepThese = 1 的记录就是按给定条件应保留的记录。
-- Test fixture (SQL Server): a cut-down EmployeeMaster with an identity
-- surrogate column, the five duplicate-key columns and two nullable data columns.
CREATE TABLE EmployeeMaster (
    Record         INT IDENTITY(1, 1),
    FirstName      VARCHAR(50),
    LastName       VARCHAR(50),
    EmployeeNumber INT,
    CompanyNumber  INT,
    StatusFlag     INT,
    UserName       VARCHAR(50),
    Branch         VARCHAR(50)
);
-- Seed rows: three duplicates of Jake Jones with 0, 1 and 2 NULL data columns,
-- plus one non-duplicated Jane Jones row.
INSERT INTO EmployeeMaster
    (FirstName, LastName, EmployeeNumber, CompanyNumber, StatusFlag, UserName, Branch)
VALUES
    ('Jake', 'Jones', 1234, 1, 1, 'JJONES', 'PHX'),
    ('Jake', 'Jones', 1234, 1, 1, NULL, 'PHX'),
    ('Jake', 'Jones', 1234, 1, 1, NULL, NULL),
    ('Jane', 'Jones', 5678, 1, 1, 'JJONES2', NULL);
-- Rank every row inside its duplicate group by how many of its columns are
-- non-NULL. The column list is read from sys.columns at run time, so the
-- generated query covers every column of EmployeeMaster without listing them.
-- KeepThese = 1 marks the most complete row of each group; rows with the same
-- non-NULL count are ordered arbitrarily.
DECLARE @sql NVARCHAR(MAX);  -- NVARCHAR because sys.columns.name is Unicode (sysname)

SELECT @sql = N'
select e.*,
       row_number() over (
           partition by
               e.FirstName,
               e.LastName,
               e.EmployeeNumber,
               e.CompanyNumber,
               e.StatusFlag
           order by n.NonNullCnt desc
       ) as KeepThese
from EmployeeMaster e
cross apply (select count(v.col_value) as NonNullCnt from (select ' +
    REPLACE(
        (
            -- One "cast(e.[col] as varchar(50)) as col_value" branch per column.
            -- QUOTENAME keeps names with spaces, brackets or reserved words valid.
            SELECT N'cast(e.' + QUOTENAME(c.name) + N' as varchar(50)) as col_value union all select '
            FROM sys.columns AS c
            WHERE c.object_id = t.object_id
            -- TYPE + .value() returns the text without XML entity escaping.
            FOR XML PATH(''), TYPE
        ).value('.', 'nvarchar(max)') + N'#',
        -- The list ends in a dangling " union all select "; the # sentinel
        -- lets REPLACE strip exactly that trailing fragment.
        N' union all select #',
        N''
    ) + N') v) n'
FROM sys.tables AS t
WHERE t.name = N'EmployeeMaster';

EXEC (@sql);
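我用 MySQL 做了测试,思路是利用字符串拼接遇到 NULL 的特性来找出最佳记录:只要参与拼接的任一列为 NULL,拼接结果就是 NULL,LENGTH 也返回 NULL(而不是 0);只有当所有列都不是 NULL 时,才会得到一个长度。注意:在 MySQL 默认的 SQL 模式下 || 是逻辑 OR,需要启用 PIPES_AS_CONCAT 或改用 CONCAT() 才能进行字符串拼接。也许这个方法还不够完美。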
-- Test fixture (MySQL): same shape as the SQL Server sample table, with an
-- AUTO_INCREMENT surrogate key instead of IDENTITY.
CREATE TABLE EmployeeMaster (
    Record         INT AUTO_INCREMENT,
    FirstName      VARCHAR(50),
    LastName       VARCHAR(50),
    EmployeeNumber INT,
    CompanyNumber  INT,
    StatusFlag     INT,
    UserName       VARCHAR(50),
    Branch         VARCHAR(50),
    PRIMARY KEY (Record)
);
-- Seed rows: three duplicates of Jake Jones with 0, 1 and 2 NULL data columns,
-- plus one non-duplicated Jane Jones row.
INSERT INTO EmployeeMaster (
    FirstName,
    LastName,
    EmployeeNumber,
    CompanyNumber,
    StatusFlag,
    UserName,
    Branch
)
VALUES
    ('Jake', 'Jones', 1234, 1, 1, 'JJONES', 'PHX'),
    ('Jake', 'Jones', 1234, 1, 1, NULL, 'PHX'),
    ('Jake', 'Jones', 1234, 1, 1, NULL, NULL),
    ('Jane', 'Jones', 5678, 1, 1, 'JJONES2', NULL);
我的查询思路是这样的
-- MySQL: for every duplicate group, return the row(s) whose concatenated data
-- columns reach the group's best score.
--
-- Score = LENGTH(CONCAT(username, branch)), with NULL mapped to 0.
--   * CONCAT() is used instead of ||, which is logical OR in MySQL unless the
--     PIPES_AS_CONCAT SQL mode is enabled.
--   * CONCAT() returns NULL as soon as one argument is NULL, so only rows with
--     every data column filled score above 0. COALESCE makes that score
--     comparable with "=", which a NULL score never is.
--   * Limitation: rows sharing the best score are all returned, so a group
--     with no fully populated row returns all of its rows.
SELECT e.*
FROM employeemaster AS e
INNER JOIN (
    SELECT
        firstname,
        lastname,
        employeenumber,
        companynumber,
        statusflag,
        MAX(COALESCE(LENGTH(CONCAT(username, branch)), 0)) AS data_quality
    FROM employeemaster
    GROUP BY
        firstname,
        lastname,
        employeenumber,
        companynumber,
        statusflag
    HAVING COUNT(*) > 1
) AS g
    -- Match each row to its own group only; <=> is NULL-safe equality, which
    -- mirrors how GROUP BY puts NULL keys into one group.
    ON  e.firstname      <=> g.firstname
    AND e.lastname       <=> g.lastname
    AND e.employeenumber <=> g.employeenumber
    AND e.companynumber  <=> g.companynumber
    AND e.statusflag     <=> g.statusflag
    AND COALESCE(LENGTH(CONCAT(e.username, e.branch)), 0) = g.data_quality;
我有一个员工表(employeemaster),其中包含大约 25 列。现在表中有很多重复项,我想尝试删除其中一部分。
首先,我想通过查找在名字、姓氏、员工编号、公司编号和状态中具有相同值的多条记录来查找重复项。
-- List every (firstname, lastname, employeenumber, companynumber, statusflag)
-- combination that occurs on more than one row of employeemaster.
SELECT
    em.firstname,
    em.lastname,
    em.employeenumber,
    em.companynumber,
    em.statusflag
FROM employeemaster AS em
GROUP BY
    em.firstname,
    em.lastname,
    em.employeenumber,
    em.companynumber,
    em.statusflag
HAVING COUNT(*) > 1;
这样可以找出重复项,但我的目标是在每组重复记录中找到并保留最好的一条记录,并删除其余记录。“最好的一条记录”(best single record)是指在所有其他列中 NULL 值最少的那条记录。我该怎么做?
我正在使用 Microsoft SQL Server 2012 MGMT Studio。
示例:
红色:删除 绿色:保持
注意:table 中的列比此 table 显示的要多得多。
试试这个。
-- For each duplicate group (same firstname, lastname, employeenumber,
-- companynumber, statusflag) return the row ranked first when ordering by the
-- number of NULLs in the non-key columns, i.e. the most complete record.
--
-- A single pass over employeemaster is enough: joining the table back to
-- itself on the key columns only multiplies the rows of every group and,
-- because NULL = NULL is not true, drops groups whose key columns hold NULL.
;WITH ranked AS (
    SELECT
        em.firstname,
        em.lastname,
        em.employeenumber,
        em.companynumber,
        em.statusflag,
        ROW_NUMBER() OVER (
            PARTITION BY
                em.firstname,
                em.lastname,
                em.employeenumber,
                em.companynumber,
                em.statusflag
            -- NULL count of the row: fewer NULLs sorts first.
            -- Add one CASE term per remaining nullable column.
            -- Rows with the same NULL count are ordered arbitrarily.
            ORDER BY
                CASE WHEN em.username IS NULL THEN 1 ELSE 0 END
              + CASE WHEN em.branch   IS NULL THEN 1 ELSE 0 END
        ) AS rn
    FROM employeemaster AS em
)
SELECT
    firstname,
    lastname,
    employeenumber,
    companynumber,
    statusflag,
    rn
FROM ranked
WHERE rn = 1;
您可以使用 sys.columns 表获取列的列表并构建动态查询。此查询会为每条记录返回一个 KeepThese 值;在每组重复记录中,KeepThese = 1 的记录就是按给定条件应保留的记录。
-- Test fixture (SQL Server): a cut-down EmployeeMaster with an identity
-- surrogate column, the five duplicate-key columns and two nullable data columns.
CREATE TABLE EmployeeMaster (
    Record         INT IDENTITY(1, 1),
    FirstName      VARCHAR(50),
    LastName       VARCHAR(50),
    EmployeeNumber INT,
    CompanyNumber  INT,
    StatusFlag     INT,
    UserName       VARCHAR(50),
    Branch         VARCHAR(50)
);
-- Seed rows: three duplicates of Jake Jones with 0, 1 and 2 NULL data columns,
-- plus one non-duplicated Jane Jones row.
INSERT INTO EmployeeMaster
    (FirstName, LastName, EmployeeNumber, CompanyNumber, StatusFlag, UserName, Branch)
VALUES
    ('Jake', 'Jones', 1234, 1, 1, 'JJONES', 'PHX'),
    ('Jake', 'Jones', 1234, 1, 1, NULL, 'PHX'),
    ('Jake', 'Jones', 1234, 1, 1, NULL, NULL),
    ('Jane', 'Jones', 5678, 1, 1, 'JJONES2', NULL);
-- Rank every row inside its duplicate group by how many of its columns are
-- non-NULL. The column list is read from sys.columns at run time, so the
-- generated query covers every column of EmployeeMaster without listing them.
-- KeepThese = 1 marks the most complete row of each group; rows with the same
-- non-NULL count are ordered arbitrarily.
DECLARE @sql NVARCHAR(MAX);  -- NVARCHAR because sys.columns.name is Unicode (sysname)

SELECT @sql = N'
select e.*,
       row_number() over (
           partition by
               e.FirstName,
               e.LastName,
               e.EmployeeNumber,
               e.CompanyNumber,
               e.StatusFlag
           order by n.NonNullCnt desc
       ) as KeepThese
from EmployeeMaster e
cross apply (select count(v.col_value) as NonNullCnt from (select ' +
    REPLACE(
        (
            -- One "cast(e.[col] as varchar(50)) as col_value" branch per column.
            -- QUOTENAME keeps names with spaces, brackets or reserved words valid.
            SELECT N'cast(e.' + QUOTENAME(c.name) + N' as varchar(50)) as col_value union all select '
            FROM sys.columns AS c
            WHERE c.object_id = t.object_id
            -- TYPE + .value() returns the text without XML entity escaping.
            FOR XML PATH(''), TYPE
        ).value('.', 'nvarchar(max)') + N'#',
        -- The list ends in a dangling " union all select "; the # sentinel
        -- lets REPLACE strip exactly that trailing fragment.
        N' union all select #',
        N''
    ) + N') v) n'
FROM sys.tables AS t
WHERE t.name = N'EmployeeMaster';

EXEC (@sql);
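我用 MySQL 做了测试,思路是利用字符串拼接遇到 NULL 的特性来找出最佳记录:只要参与拼接的任一列为 NULL,拼接结果就是 NULL,LENGTH 也返回 NULL(而不是 0);只有当所有列都不是 NULL 时,才会得到一个长度。注意:在 MySQL 默认的 SQL 模式下 || 是逻辑 OR,需要启用 PIPES_AS_CONCAT 或改用 CONCAT() 才能进行字符串拼接。也许这个方法还不够完美。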
-- Test fixture (MySQL): same shape as the SQL Server sample table, with an
-- AUTO_INCREMENT surrogate key instead of IDENTITY.
CREATE TABLE EmployeeMaster (
    Record         INT AUTO_INCREMENT,
    FirstName      VARCHAR(50),
    LastName       VARCHAR(50),
    EmployeeNumber INT,
    CompanyNumber  INT,
    StatusFlag     INT,
    UserName       VARCHAR(50),
    Branch         VARCHAR(50),
    PRIMARY KEY (Record)
);
-- Seed rows: three duplicates of Jake Jones with 0, 1 and 2 NULL data columns,
-- plus one non-duplicated Jane Jones row.
INSERT INTO EmployeeMaster (
    FirstName,
    LastName,
    EmployeeNumber,
    CompanyNumber,
    StatusFlag,
    UserName,
    Branch
)
VALUES
    ('Jake', 'Jones', 1234, 1, 1, 'JJONES', 'PHX'),
    ('Jake', 'Jones', 1234, 1, 1, NULL, 'PHX'),
    ('Jake', 'Jones', 1234, 1, 1, NULL, NULL),
    ('Jane', 'Jones', 5678, 1, 1, 'JJONES2', NULL);
我的查询思路是这样的
-- MySQL: for every duplicate group, return the row(s) whose concatenated data
-- columns reach the group's best score.
--
-- Score = LENGTH(CONCAT(username, branch)), with NULL mapped to 0.
--   * CONCAT() is used instead of ||, which is logical OR in MySQL unless the
--     PIPES_AS_CONCAT SQL mode is enabled.
--   * CONCAT() returns NULL as soon as one argument is NULL, so only rows with
--     every data column filled score above 0. COALESCE makes that score
--     comparable with "=", which a NULL score never is.
--   * Limitation: rows sharing the best score are all returned, so a group
--     with no fully populated row returns all of its rows.
SELECT e.*
FROM employeemaster AS e
INNER JOIN (
    SELECT
        firstname,
        lastname,
        employeenumber,
        companynumber,
        statusflag,
        MAX(COALESCE(LENGTH(CONCAT(username, branch)), 0)) AS data_quality
    FROM employeemaster
    GROUP BY
        firstname,
        lastname,
        employeenumber,
        companynumber,
        statusflag
    HAVING COUNT(*) > 1
) AS g
    -- Match each row to its own group only; <=> is NULL-safe equality, which
    -- mirrors how GROUP BY puts NULL keys into one group.
    ON  e.firstname      <=> g.firstname
    AND e.lastname       <=> g.lastname
    AND e.employeenumber <=> g.employeenumber
    AND e.companynumber  <=> g.companynumber
    AND e.statusflag     <=> g.statusflag
    AND COALESCE(LENGTH(CONCAT(e.username, e.branch)), 0) = g.data_quality;