在 SQL Server 中选择重复记录的子集
Select subset of duplicate records in SQL Server
我需要在 SQL Server 2016 中选出重复记录的一个子集。下面是数据集和所用的代码。我只需要选出以红色突出显示的重复项。基本上,我只需要那些 LName、Fname、DateOfBirth、StreetAddress 值相互匹配并且 Source 为 NULL 的重复记录。同时,我只需要其中在上述字段上另有一条 Source 值为 "Company XYZ" 的匹配记录的那些重复项。
-- Rebuild the #Dataset temp table that the duplicate-detection queries read.
-- DROP TABLE IF EXISTS (available from SQL Server 2016) replaces the
-- OBJECT_ID('tempdb..#Dataset') existence check.
DROP TABLE IF EXISTS #Dataset;
GO

-- One row per person record; every column except ID is nullable.
-- NOTE(review): DateOfBirth is varchar(50) and holds numeric strings such as
-- '37171' -- presumably spreadsheet serial dates; confirm before changing the
-- column to a DATE type.
CREATE TABLE #Dataset
(
    ID            int         NOT NULL,
    LName         varchar(50) NULL,
    Fname         varchar(50) NULL,
    DateOfBirth   varchar(50) NULL,
    StreetAddress varchar(50) NULL,
    Source        varchar(50) NULL
);

-- Sample rows. ID is supplied as an integer literal so no implicit
-- varchar-to-int conversion is needed. The StreetAddress literals keep their
-- leading space exactly as provided.
-- IDs 1 and 2 differ in StreetAddress (1223 vs 1233), so they do not share a
-- (LName, Fname, DateOfBirth, StreetAddress) combination.
INSERT INTO #Dataset (ID, LName, Fname, DateOfBirth, StreetAddress, Source)
VALUES
    (1,  'John',    'Ganske',  '37171', ' 1223 Sunrise St',    'Company XYZ'),
    (2,  'John',    'Ganske',  '37171', ' 1233 Sunrise St',    'Company XYZ'),
    (4,  'Brent',   'Paine',   '20723', ' 5443 Fox Dr',        NULL),
    (3,  'Brent',   'Paine',   '20723', ' 5443 Fox Dr',        'Company XYZ'),
    (5,  'Adam',    'Smith',   '22805', ' 1254 Lake Ridge Ct', NULL),
    (6,  'Adam',    'Smith',   '22805', ' 1254 Lake Ridge Ct', NULL),
    (7,  'Adam',    'Smith',   '22805', ' 1254 Lake Ridge Ct', 'Company XYZ'),
    (8,  'Timothy', 'Johnson', '36165', ' 1278 Lee H-W',       NULL),
    (9,  'Timothy', 'Johnson', '36165', ' 1278 Lee H-W',       NULL),
    (10, 'Judy',    'Wilson',  '32579', ' 5678 Dotties Dr',    'Company XYZ'),
    (12, 'Peter',   'Pan',     '37507', NULL,                  NULL),
    (11, 'Peter',   'Pan',     '37507', NULL,                  'Company XYZ');
-- List every row that belongs to a duplicate group, i.e. a set of two or more
-- rows sharing the same LName, Fname, DateOfBirth and StreetAddress.
--
-- The group size comes from a windowed COUNT(*) rather than an equality
-- self-join against a GROUP BY subquery. All four grouping columns are
-- nullable, and `a = b` is never true when either side is NULL (default
-- ANSI_NULLS ON), so an equality join silently loses duplicate groups that
-- contain a NULL -- IDs 11 and 12 (NULL StreetAddress) in the sample data.
-- PARTITION BY, like GROUP BY, keeps NULLs together in one group.
--
-- No Source filter is applied: rows are returned whatever their Source value.
SELECT
    grouped.ID,
    grouped.LName,
    grouped.Fname,
    grouped.DateOfBirth,
    grouped.StreetAddress,
    grouped.Source
FROM (
    SELECT
        ID,
        LName,
        Fname,
        DateOfBirth,
        StreetAddress,
        Source,
        COUNT(*) OVER (
            PARTITION BY LName, Fname, DateOfBirth, StreetAddress
        ) AS group_size
    FROM #Dataset
) AS grouped
WHERE grouped.group_size > 1
ORDER BY grouped.ID;
我的输出如下所示:
您可以使用 ROW_NUMBER:
-- Return every row of each (LName, Fname, DateOfBirth, StreetAddress) group
-- except the one numbered first. Rows are numbered per group by descending
-- ID, so rn = 1 is the highest ID in the group and rn > 1 is everything else.
-- The output is the six #Dataset columns followed by rn, ordered by ID.
SELECT
    ranked.ID,
    ranked.LName,
    ranked.Fname,
    ranked.DateOfBirth,
    ranked.StreetAddress,
    ranked.Source,
    ranked.rn
FROM (
    SELECT
        src.ID,
        src.LName,
        src.Fname,
        src.DateOfBirth,
        src.StreetAddress,
        src.Source,
        ROW_NUMBER() OVER (
            PARTITION BY src.LName, src.Fname, src.DateOfBirth, src.StreetAddress
            ORDER BY src.ID DESC
        ) AS rn
    FROM #Dataset AS src
) AS ranked
WHERE ranked.rn > 1
ORDER BY ranked.ID;
编辑:
-- Return the NULL-Source rows whose (LName, Fname, DateOfBirth, StreetAddress)
-- group also contains at least one row with Source = 'Company XYZ'.
--
--   rn  : position of the row inside its group. 'Company XYZ' rows are
--         numbered first (then by descending ID), so whenever a group has a
--         'Company XYZ' row, rn = 1 is one of them.
--   cnt : number of 'Company XYZ' rows in the group.
--
-- Numbering by ID alone would make the result depend on which row happens to
-- have the highest ID: a NULL-Source row with the highest ID in its group
-- (IDs 4 and 12 in the sample data) would receive rn = 1 and be removed by
-- the rn > 1 filter.
WITH cte AS (
    SELECT
        ID,
        LName,
        Fname,
        DateOfBirth,
        StreetAddress,
        Source,
        ROW_NUMBER() OVER (
            PARTITION BY LName, Fname, DateOfBirth, StreetAddress
            ORDER BY CASE WHEN Source = 'Company XYZ' THEN 0 ELSE 1 END,
                     ID DESC
        ) AS rn,
        SUM(CASE WHEN Source = 'Company XYZ' THEN 1 ELSE 0 END) OVER (
            PARTITION BY LName, Fname, DateOfBirth, StreetAddress
        ) AS cnt
    FROM #Dataset
)
SELECT
    ID,
    LName,
    Fname,
    DateOfBirth,
    StreetAddress,
    Source,
    rn,
    cnt
FROM cte
WHERE rn > 1
  AND cnt > 0
  AND Source IS NULL
ORDER BY ID;
编辑 2:
-- Return the NULL-Source rows of every (LName, Fname, DateOfBirth,
-- StreetAddress) group that has more than one row, at least one NULL-Source
-- row and at least one 'Company XYZ' row.
--
--   c1 : number of rows in the group whose Source is NULL
--   c2 : number of rows in the group whose Source is 'Company XYZ'
--   c3 : total number of rows in the group
WITH group_stats AS (
    SELECT
        d.ID,
        d.LName,
        d.Fname,
        d.DateOfBirth,
        d.StreetAddress,
        d.Source,
        SUM(CASE WHEN d.Source IS NULL THEN 1 ELSE 0 END) OVER (
            PARTITION BY d.LName, d.Fname, d.DateOfBirth, d.StreetAddress
        ) AS c1,
        SUM(CASE WHEN d.Source = 'Company XYZ' THEN 1 ELSE 0 END) OVER (
            PARTITION BY d.LName, d.Fname, d.DateOfBirth, d.StreetAddress
        ) AS c2,
        COUNT(*) OVER (
            PARTITION BY d.LName, d.Fname, d.DateOfBirth, d.StreetAddress
        ) AS c3
    FROM #Dataset AS d
)
SELECT
    gs.ID,
    gs.LName,
    gs.Fname,
    gs.DateOfBirth,
    gs.StreetAddress,
    gs.Source,
    gs.c1,
    gs.c2,
    gs.c3
FROM group_stats AS gs
WHERE gs.Source IS NULL
  AND gs.c3 > 1
  AND gs.c2 > 0
  AND gs.c1 > 0
ORDER BY gs.ID;
我需要在 SQL Server 2016 中选出重复记录的一个子集。下面是数据集和所用的代码。我只需要选出以红色突出显示的重复项。基本上,我只需要那些 LName、Fname、DateOfBirth、StreetAddress 值相互匹配并且 Source 为 NULL 的重复记录。同时,我只需要其中在上述字段上另有一条 Source 值为 "Company XYZ" 的匹配记录的那些重复项。
IF OBJECT_ID('tempdb..#Dataset') IS NOT NULL DROP TABLE #Dataset
GO

-- Create and populate the #Dataset temp table that the duplicate-detection
-- queries read. Every column except ID is nullable.
-- NOTE(review): DateOfBirth is varchar(50) and holds numeric strings such as
-- '37171' -- presumably spreadsheet serial dates; confirm before changing the
-- column to a DATE type.
CREATE TABLE #Dataset
(
    ID            int         NOT NULL,
    LName         varchar(50) NULL,
    Fname         varchar(50) NULL,
    DateOfBirth   varchar(50) NULL,
    StreetAddress varchar(50) NULL,
    Source        varchar(50) NULL
);

-- Sample rows. ID is supplied as an integer literal so no implicit
-- varchar-to-int conversion is needed. The StreetAddress literals keep their
-- leading space exactly as provided.
-- IDs 1 and 2 differ in StreetAddress (1223 vs 1233), so they do not share a
-- (LName, Fname, DateOfBirth, StreetAddress) combination.
INSERT INTO #Dataset (ID, LName, Fname, DateOfBirth, StreetAddress, Source)
VALUES
    (1,  'John',    'Ganske',  '37171', ' 1223 Sunrise St',    'Company XYZ'),
    (2,  'John',    'Ganske',  '37171', ' 1233 Sunrise St',    'Company XYZ'),
    (4,  'Brent',   'Paine',   '20723', ' 5443 Fox Dr',        NULL),
    (3,  'Brent',   'Paine',   '20723', ' 5443 Fox Dr',        'Company XYZ'),
    (5,  'Adam',    'Smith',   '22805', ' 1254 Lake Ridge Ct', NULL),
    (6,  'Adam',    'Smith',   '22805', ' 1254 Lake Ridge Ct', NULL),
    (7,  'Adam',    'Smith',   '22805', ' 1254 Lake Ridge Ct', 'Company XYZ'),
    (8,  'Timothy', 'Johnson', '36165', ' 1278 Lee H-W',       NULL),
    (9,  'Timothy', 'Johnson', '36165', ' 1278 Lee H-W',       NULL),
    (10, 'Judy',    'Wilson',  '32579', ' 5678 Dotties Dr',    'Company XYZ'),
    (12, 'Peter',   'Pan',     '37507', NULL,                  NULL),
    (11, 'Peter',   'Pan',     '37507', NULL,                  'Company XYZ');
-- List every row that belongs to a duplicate group, i.e. a set of two or more
-- rows sharing the same LName, Fname, DateOfBirth and StreetAddress.
--
-- The group size comes from a windowed COUNT(*) rather than an equality
-- self-join against a GROUP BY subquery. All four grouping columns are
-- nullable, and `a = b` is never true when either side is NULL (default
-- ANSI_NULLS ON), so an equality join silently loses duplicate groups that
-- contain a NULL -- IDs 11 and 12 (NULL StreetAddress) in the sample data.
-- PARTITION BY, like GROUP BY, keeps NULLs together in one group.
--
-- No Source filter is applied: rows are returned whatever their Source value.
SELECT
    grouped.ID,
    grouped.LName,
    grouped.Fname,
    grouped.DateOfBirth,
    grouped.StreetAddress,
    grouped.Source
FROM (
    SELECT
        ID,
        LName,
        Fname,
        DateOfBirth,
        StreetAddress,
        Source,
        COUNT(*) OVER (
            PARTITION BY LName, Fname, DateOfBirth, StreetAddress
        ) AS group_size
    FROM #Dataset
) AS grouped
WHERE grouped.group_size > 1
ORDER BY grouped.ID;
我的输出如下所示:
您可以使用 ROW_NUMBER:
-- Number the rows of each (LName, Fname, DateOfBirth, StreetAddress) group by
-- descending ID and keep everything after the first row of each group, i.e.
-- every row except the one with the highest ID in its group.
-- The output is the six #Dataset columns followed by rn, ordered by ID.
WITH numbered_rows AS (
    SELECT
        t.ID,
        t.LName,
        t.Fname,
        t.DateOfBirth,
        t.StreetAddress,
        t.Source,
        ROW_NUMBER() OVER (
            PARTITION BY t.LName, t.Fname, t.DateOfBirth, t.StreetAddress
            ORDER BY t.ID DESC
        ) AS rn
    FROM #Dataset AS t
)
SELECT
    nr.ID,
    nr.LName,
    nr.Fname,
    nr.DateOfBirth,
    nr.StreetAddress,
    nr.Source,
    nr.rn
FROM numbered_rows AS nr
WHERE nr.rn > 1
ORDER BY nr.ID;
编辑:
-- Return the NULL-Source rows whose (LName, Fname, DateOfBirth, StreetAddress)
-- group also contains at least one row with Source = 'Company XYZ'.
--
--   rn  : position of the row inside its group. 'Company XYZ' rows are
--         numbered first (then by descending ID), so whenever a group has a
--         'Company XYZ' row, rn = 1 is one of them.
--   cnt : number of 'Company XYZ' rows in the group.
--
-- Numbering by ID alone would make the result depend on which row happens to
-- have the highest ID: a NULL-Source row with the highest ID in its group
-- (IDs 4 and 12 in the sample data) would receive rn = 1 and be removed by
-- the rn > 1 filter.
WITH cte AS (
    SELECT
        ID,
        LName,
        Fname,
        DateOfBirth,
        StreetAddress,
        Source,
        ROW_NUMBER() OVER (
            PARTITION BY LName, Fname, DateOfBirth, StreetAddress
            ORDER BY CASE WHEN Source = 'Company XYZ' THEN 0 ELSE 1 END,
                     ID DESC
        ) AS rn,
        SUM(CASE WHEN Source = 'Company XYZ' THEN 1 ELSE 0 END) OVER (
            PARTITION BY LName, Fname, DateOfBirth, StreetAddress
        ) AS cnt
    FROM #Dataset
)
SELECT
    ID,
    LName,
    Fname,
    DateOfBirth,
    StreetAddress,
    Source,
    rn,
    cnt
FROM cte
WHERE rn > 1
  AND cnt > 0
  AND Source IS NULL
ORDER BY ID;
编辑 2:
-- Return the NULL-Source rows of every (LName, Fname, DateOfBirth,
-- StreetAddress) group that has more than one row, at least one NULL-Source
-- row and at least one 'Company XYZ' row.
--
--   c1 : number of rows in the group whose Source is NULL
--   c2 : number of rows in the group whose Source is 'Company XYZ'
--   c3 : total number of rows in the group
SELECT
    stats.ID,
    stats.LName,
    stats.Fname,
    stats.DateOfBirth,
    stats.StreetAddress,
    stats.Source,
    stats.c1,
    stats.c2,
    stats.c3
FROM (
    SELECT
        d.ID,
        d.LName,
        d.Fname,
        d.DateOfBirth,
        d.StreetAddress,
        d.Source,
        SUM(CASE WHEN d.Source IS NULL THEN 1 ELSE 0 END) OVER (
            PARTITION BY d.LName, d.Fname, d.DateOfBirth, d.StreetAddress
        ) AS c1,
        SUM(CASE WHEN d.Source = 'Company XYZ' THEN 1 ELSE 0 END) OVER (
            PARTITION BY d.LName, d.Fname, d.DateOfBirth, d.StreetAddress
        ) AS c2,
        COUNT(*) OVER (
            PARTITION BY d.LName, d.Fname, d.DateOfBirth, d.StreetAddress
        ) AS c3
    FROM #Dataset AS d
) AS stats
WHERE stats.Source IS NULL
  AND stats.c1 > 0
  AND stats.c2 > 0
  AND stats.c3 > 1
ORDER BY stats.ID;