如何使用 RANK 对匹配记录进行分组
How to use RANK to Group Matched Records
长话短说。我有数据试图通过地址识别重复记录。地址可以在 [Address]
或 [Remit_Address]
字段上匹配。
我使用 JOIN 和 UNION 来获取记录,但我需要匹配的记录在结果中彼此相邻显示。
我无法按任何现有字段排序,因此典型的 'ORDER BY' 不起作用。我按照某人的建议查看了 RANK
,看起来它可能有效,但我不知道如何进行分区,我认为该命令给我带来了与 ORDER BY
相同的问题。
如果 RANK 不是最佳选择,我愿意接受其他想法。最终目标是以某种方式对匹配的记录进行分组。
- SSMS 18
- SQL Server 2019
设置如下:
-- Output table: receives the duplicate-candidate rows found by the address-match query.
-- [id] is an auto-incrementing surrogate key.
CREATE TABLE [dupecheck] (
    [id] int identity(1, 1),
    [Data Area] varchar(255),
    [Supplier_No] varchar(255),
    [Name] varchar(255),
    [Address] varchar(255),
    [City] varchar(255),
    [State] varchar(255),
    [Zip] varchar(255),
    [Country] varchar(255),
    [Remit_Address] varchar(255),
    [Remit_City] varchar(255),
    [Remit_State] varchar(255),
    [Remit_Zip] varchar(255),
    [Remit_Country] varchar(255)
);

-- Supplier master sample. [cleanAddress] / [cleanRemit_Address] hold the normalized
-- forms of [Address] / [Remit_Address] that the duplicate matching compares on.
CREATE TABLE [sample_data] (
    [Supplier_No] varchar(255),
    [Name] varchar(255),
    [Address] varchar(255),
    [City] varchar(255),
    [State] varchar(255),
    [Zip] varchar(255),
    [Country] varchar(255),
    [Remit_Address] varchar(255),
    [Remit_City] varchar(255),
    [Remit_State] varchar(255),
    [Remit_Zip] varchar(255),
    [Remit_Country] varchar(255),
    [cleanAddress] varchar(255),
    [cleanRemit_Address] varchar(255),
    CONSTRAINT [suppliers_pk] PRIMARY KEY ([Supplier_No])
);

-- Explicit column list so the load does not depend on the table's physical column order.
INSERT INTO [sample_data] (
    [Supplier_No], [Name], [Address], [City], [State], [Zip], [Country],
    [Remit_Address], [Remit_City], [Remit_State], [Remit_Zip], [Remit_Country],
    [cleanAddress], [cleanRemit_Address]
)
VALUES
('1039104','Geez Companies','100 Aero Hudson Rd','Streetsboro','OH','44241','','100 Aero Hudson Road','Streetsboro','OH','44241','USA','100 Aero Hudson Rd','100 Aero Hudson Rd'),
('1218409','SouthWestern Medical','100 West Balor Ave','Osceola','AR','72370','USA','SouthWestern Medical100 W Balor Ave','Osceola','AR','72370','USA','100 W Balor Ave','SouthWestern Medical100 W Balor Ave'),
('1243789','SouthWestern Medical','100 West Balor Ave','Osceola','AR','72370','USA','SouthWestern Medical100 West Balor Ave','Osceola','AR','72370','USA','100 W Balor Ave','SouthWestern Medical100 W Balor Ave'),
('1243636','SIRI SYSTEMS','15 BRAD ROAD','WEXFORD','PA','15090','','','','','','','15 BRAD RD',''),
('1152482','FLEETWOOD MACK','22 WINDSOCK CT','ADDISON','IL','60101','','PO BOX 951','CHICAGO','IL','60694-5124','','22 WINDSOCK CT','PO BOX 951'),
('1224483','Aerospace Junction','211500 Communicate Ave','Mingo Junction','OH','43939','USA','P O Box 99','Mingo Junction','OH','43939','USA','211500 Communicate Ave','PO Box 99'),
('1243397','Squeezy Felt','SCHREIBER DIST','NEW KENSINGTON','PA','15068','','','','','','','SCHREIBER DIST',''),
('1230895','NERO CO','28 North US State Highway 99','Osceola','AR','72370','USA','PO Box 204','Cape Girardeau','MO','63702-2045','USA','28 N US State Hwy 99','PO Box 204'),
('1243782','NERO CO','28 North US State Highway 99','Osceola','AR','72370','USA','PO Box 204','Cape Girardeau','MO','63702-2045','USA','28 N US State Hwy 99','PO Box 204'),
('1135880','RICHARD PRYOR SEMINARS','PO BOX 2194','KANSAS CITY','MO','64121-9468','USA','RICHARD PRYOR SEMINARS P O BOX 2194','KANSAS CITY','MO','64121-9468','USA','PO BOX 2194','RICHARD PRYOR SEMINARS PO BOX 2194'),
('1241328','INFINITY AND BEYOND','P.O. BOX 169','GASTONIA','NC','28053-0269','USA','','','','','','PO BOX 169',''),
('1259522','ZEEBO INC','PO BOX 169','GASSTONIA','NC','28053-0269','USA','','','','','','PO BOX 169',''),
('1255253','AT&T','PO Box 50221','Carol Stream','IL','60197','USA','','','','','','PO Box 50221',''),
('1135513','AT&T','PO Box 50221','Carol Stream','IL','60197-5080','USA','','','','','','PO Box 50221',''),
('1119161','Machine Co, Inc','3306 N Thorne Blvd','Chattanooga','TN','','','PO BOX 5301','CHATTANOOGA','TN','37406','USA','3306 N Thorne Blvd','PO BOX 5301'),
('1176587','Topsy Turvy','365 Welmington Road','Chicago','IL','60606','USA','','','','','','365 Welmington Rd',''),
('2156671','Topsy Turvvy, Inc.','P.O. Box 55217','Columbus','OH','43081','','365 Welmington Road','Chicago','IL','60606','USA','','365 Welmington Rd');

-- Cleaned addresses that occur on more than one supplier record (the match keys).
CREATE TABLE [dupe_addresses] (
    [NewAdd] [varchar](255) NULL
);

INSERT INTO [dupe_addresses] ([NewAdd])
VALUES
('100 W Balor Ave'),
('28 N US State Hwy 99'),
('365 Welmington Rd'),
('PO BOX 169'),
('PO Box 204'),
('PO Box 50221'),
('SouthWestern Medical100 W Balor Ave');
现有查询:
-- Load every supplier whose cleaned address or cleaned remit address appears in
-- [dupe_addresses] into [dupecheck].
-- UNION (not UNION ALL) is deliberate: a supplier that matches on both columns would
-- otherwise be inserted twice.
INSERT INTO [dupecheck] (
    [Data Area],
    [Supplier_No],
    [Name],
    [Address],
    [City],
    [State],
    [Zip],
    [Country],
    [Remit_Address],
    [Remit_City],
    [Remit_State],
    [Remit_Zip],
    [Remit_Country]
)
SELECT
    q1.[Reason],
    q1.[Supplier_No],
    q1.[Name],
    q1.[Address],
    q1.[City],
    q1.[State],
    q1.[Zip],
    q1.[Country],
    q1.[Remit_Address],
    q1.[Remit_City],
    q1.[Remit_State],
    q1.[Remit_Zip],
    q1.[Remit_Country]
FROM (
    -- Match on the primary address.
    SELECT
        'Address Match' AS [Reason],
        pv.[Supplier_No],
        pv.[Name],
        pv.[Address],
        pv.[City],
        pv.[State],
        pv.[Zip],
        pv.[Country],
        pv.[Remit_Address],
        pv.[Remit_City],
        pv.[Remit_State],
        pv.[Remit_Zip],
        pv.[Remit_Country]
    FROM [dupe_addresses] AS n
    -- INNER JOIN: unmatched [dupe_addresses] rows carry no supplier and are not wanted.
    INNER JOIN [sample_data] AS pv
        ON n.[NewAdd] = pv.[cleanAddress]
       AND pv.[Address] <> ''
       AND pv.[Address] IS NOT NULL
    WHERE pv.[Supplier_No] <> ''

    UNION

    -- Match on the remit address.
    SELECT
        'Address Match' AS [Reason],
        pv.[Supplier_No],
        pv.[Name],
        pv.[Address],
        pv.[City],
        pv.[State],
        pv.[Zip],
        pv.[Country],
        pv.[Remit_Address],
        pv.[Remit_City],
        pv.[Remit_State],
        pv.[Remit_Zip],
        pv.[Remit_Country]
    FROM [dupe_addresses] AS n
    INNER JOIN [sample_data] AS pv
        ON n.[NewAdd] = pv.[cleanRemit_Address]
       AND pv.[Remit_Address] <> ''
       AND pv.[Remit_Address] IS NOT NULL
    WHERE pv.[Supplier_No] <> ''
) AS q1;
当前结果:
Reason Supplier_No Name Address City State Zip Country Remit_Address Remit_City Remit_State Remit_Zip Remit_Country
Address Match 1135513 AT&T PO Box 50221 Carol Stream IL 60197-5080 USA
Address Match 1176587 Topsy Turvy 365 Welmington Road Chicago IL 60606 USA
Address Match 1218409 SouthWestern Medical 100 West Balor Ave Osceola AR 72370 USA SouthWestern Medical100 W Balor Ave Osceola AR 72370 USA
Address Match 1230895 NERO CO 28 North US State Highway 99 Osceola AR 72370 USA PO Box 204 Cape Girardeau MO 63702-2045 USA
Address Match 1241328 INFINITY AND BEYOND P.O. BOX 169 GASTONIA NC 28053-0269 USA
Address Match 1243782 NERO CO 28 North US State Highway 99 Osceola AR 72370 USA PO Box 204 Cape Girardeau MO 63702-2045 USA
Address Match 1243789 SouthWestern Medical 100 West Balor Ave Osceola AR 72370 USA SouthWestern Medical100 West Balor Ave Osceola AR 72370 USA
Address Match 1255253 AT&T PO Box 50221 Carol Stream IL 60197 USA
Address Match 1259522 ZEEBO INC PO BOX 169 GASSTONIA NC 28053-0269 USA
Address Match 2156671 Topsy Turvvy, Inc. P.O. Box 55217 Columbus OH 43081 365 Welmington Road Chicago IL 60606 USA
期望的结果:
Reason Supplier_No Name Address City State Zip Country Remit_Address Remit_City Remit_State Remit_Zip Remit_Country rank
Address Match 1135513 AT&T PO Box 50221 Carol Stream IL 60197-5080 USA 1
Address Match 1255253 AT&T PO Box 50221 Carol Stream IL 60197 USA 1
Address Match 1241328 INFINITY AND BEYOND P.O. BOX 169 GASTONIA NC 28053-0269 USA 2
Address Match 1259522 ZEEBO INC PO BOX 169 GASSTONIA NC 28053-0269 USA 2
Address Match 1243782 NERO CO 28 North US State Highway 99 Osceola AR 72370 USA PO Box 204 Cape Girardeau MO 63702-2045 USA 3
Address Match 1230895 NERO CO 28 North US State Highway 99 Osceola AR 72370 USA PO Box 204 Cape Girardeau MO 63702-2045 USA 3
Address Match 1218409 SouthWestern Medical 100 West Balor Ave Osceola AR 72370 USA SouthWestern Medical100 W Balor Ave Osceola AR 72370 USA 4
Address Match 1243789 SouthWestern Medical 100 West Balor Ave Osceola AR 72370 USA SouthWestern Medical100 West Balor Ave Osceola AR 72370 USA 4
Address Match 2156671 Topsy Turvvy, Inc. P.O. Box 55217 Columbus OH 43081 365 Welmington Road Chicago IL 60606 USA 5
Address Match 1176587 Topsy Turvy 365 Welmington Road Chicago IL 60606 USA 5
此查询创建了所需的结果。
-- One row per supplier, tagged with a group key (grp) so matched suppliers sort together.
-- A supplier can match several dupe_addresses rows; RANK() partitioned by supplier
-- orders those matches by NewAdd, and rnk = 1 keeps only the lowest-sorting one.
SELECT ranked.*
FROM (
    SELECT
        da.NewAdd AS grp,
        sd.*,
        RANK() OVER (PARTITION BY sd.Supplier_No ORDER BY da.NewAdd) AS rnk
    FROM dupe_addresses AS da
    INNER JOIN sample_data AS sd
        -- Match on either the cleaned address or the cleaned remit address.
        ON da.NewAdd IN (sd.cleanAddress, sd.cleanRemit_Address)
) AS ranked
WHERE ranked.rnk = 1
ORDER BY ranked.grp
删除了 UNION,改为通过 OR
组合两个连接条件,
因此满足任一条件(或同时满足两个条件)的记录都会被找到。
rank()
用于计算结果集分区内每一行的排名。
partition by Supplier_No
用于识别重复记录。
最后用where rnk = 1
查看不重复的记录组
我确定还有更简短、更简洁的方法可以做到这一点,不过趁着等咖啡煮好的这段时间,下面的查询应该能实现你想要的效果。
-- Return every supplier whose cleaned address or cleaned remit address is listed in
-- dupe_addresses, together with the address it matched on (MatchedAddress), sorted so
-- that suppliers sharing an address appear next to each other.
-- The cleaned address takes precedence over the cleaned remit address.
-- EXISTS is used instead of a scalar subquery because dupe_addresses has no uniqueness
-- constraint: a repeated NewAdd would make a scalar subquery fail with
-- "Subquery returned more than 1 value".
SELECT
    s1.*,
    CASE
        WHEN EXISTS (
            SELECT 1
            FROM dupe_addresses AS s2
            WHERE s2.NewAdd = s1.cleanAddress
        ) THEN s1.cleanAddress
        WHEN EXISTS (
            SELECT 1
            FROM dupe_addresses AS s2
            WHERE s2.NewAdd = s1.cleanRemit_Address
        ) THEN s1.cleanRemit_Address
    END AS MatchedAddress
FROM sample_data AS s1
WHERE EXISTS (
    SELECT 1
    FROM dupe_addresses AS s2
    WHERE s2.NewAdd = s1.cleanAddress
       OR s2.NewAdd = s1.cleanRemit_Address
)
-- Supplier_No is a tie-break so the order inside each address group is deterministic.
ORDER BY MatchedAddress, s1.Supplier_No;
编辑:我又仔细考虑了一下。既然你说还有更多的匹配标准,我建议换一种做法,这样能更好地实现你想要的效果。基本思路是:在你的供应商/数据表上新增一个 CleanAddressID 列,然后把所有清理过的地址放入一张单独的“已清理地址”表中。
完成后,您就可以更新 CleanAddressID,并且可以使用比现在更多的匹配条件。
以下代码应该可以帮到您,最终查询将返回所有基于地址的重复项。
随着时间的推移,您可以用类似的方式添加其他匹配规则,进而计算一个重复度评分。我知道这超出了您问题的范围,但还是想提一下,因为它说明了这种更灵活的方案为什么更容易扩展。
我保留了上面的解决方案,因为正如你所说,它实现了你想要的效果;你可以慢慢消化它,但它比较混乱,而且随着匹配标准的增加会变得更加混乱。
-- Lookup table of cleaned addresses; each distinct address gets a surrogate ID.
CREATE TABLE [CleanedAddresses] (
    ID INT IDENTITY(1, 1)
    ,[Address] [varchar](255) NOT NULL UNIQUE
    ,PRIMARY KEY (ID)
);

INSERT INTO [CleanedAddresses] ([Address])
VALUES ('100 W Balor Ave')
    ,('28 N US State Hwy 99')
    ,('365 Welmington Rd')
    ,('PO BOX 169')
    ,('PO Box 204')
    ,('PO Box 50221')
    ,('SouthWestern Medical100 W Balor Ave');

-- Supplier table extended with CleanAddressID, a nullable reference to the cleaned
-- address the supplier was matched to (NULL = no match).
CREATE TABLE [sample_data] (
    [Supplier_No] VARCHAR(255)
    ,[Name] VARCHAR(255)
    ,[Address] VARCHAR(255)
    ,[City] VARCHAR(255)
    ,[State] VARCHAR(255)
    ,[Zip] VARCHAR(255)
    ,[Country] VARCHAR(255)
    ,[Remit_Address] VARCHAR(255)
    ,[Remit_City] VARCHAR(255)
    ,[Remit_State] VARCHAR(255)
    ,[Remit_Zip] VARCHAR(255)
    ,[Remit_Country] VARCHAR(255)
    ,[cleanAddress] VARCHAR(255)
    ,[cleanRemit_Address] VARCHAR(255)
    ,CleanAddressID INT NULL
    -- Table-level constraints: each one is its own comma-separated element.
    ,CONSTRAINT [suppliers_pk] PRIMARY KEY ([Supplier_No])
    ,FOREIGN KEY (CleanAddressID) REFERENCES [CleanedAddresses](ID)
);

-- CleanAddressID is intentionally omitted here; it is filled in by the UPDATEs below.
INSERT INTO [sample_data] (
    [Supplier_No], [Name], [Address], [City], [State], [Zip], [Country],
    [Remit_Address], [Remit_City], [Remit_State], [Remit_Zip], [Remit_Country],
    [cleanAddress], [cleanRemit_Address]
)
VALUES
('1039104','Geez Companies','100 Aero Hudson Rd','Streetsboro','OH','44241','','100 Aero Hudson Road','Streetsboro','OH','44241','USA','100 Aero Hudson Rd','100 Aero Hudson Rd'),
('1218409','SouthWestern Medical','100 West Balor Ave','Osceola','AR','72370','USA','SouthWestern Medical100 W Balor Ave','Osceola','AR','72370','USA','100 W Balor Ave','SouthWestern Medical100 W Balor Ave'),
('1243789','SouthWestern Medical','100 West Balor Ave','Osceola','AR','72370','USA','SouthWestern Medical100 West Balor Ave','Osceola','AR','72370','USA','100 W Balor Ave','SouthWestern Medical100 W Balor Ave'),
('1243636','SIRI SYSTEMS','15 BRAD ROAD','WEXFORD','PA','15090','','','','','','','15 BRAD RD',''),
('1152482','FLEETWOOD MACK','22 WINDSOCK CT','ADDISON','IL','60101','','PO BOX 951','CHICAGO','IL','60694-5124','','22 WINDSOCK CT','PO BOX 951'),
('1224483','Aerospace Junction','211500 Communicate Ave','Mingo Junction','OH','43939','USA','P O Box 99','Mingo Junction','OH','43939','USA','211500 Communicate Ave','PO Box 99'),
('1243397','Squeezy Felt','SCHREIBER DIST','NEW KENSINGTON','PA','15068','','','','','','','SCHREIBER DIST',''),
('1230895','NERO CO','28 North US State Highway 99','Osceola','AR','72370','USA','PO Box 204','Cape Girardeau','MO','63702-2045','USA','28 N US State Hwy 99','PO Box 204'),
('1243782','NERO CO','28 North US State Highway 99','Osceola','AR','72370','USA','PO Box 204','Cape Girardeau','MO','63702-2045','USA','28 N US State Hwy 99','PO Box 204'),
('1135880','RICHARD PRYOR SEMINARS','PO BOX 2194','KANSAS CITY','MO','64121-9468','USA','RICHARD PRYOR SEMINARS P O BOX 2194','KANSAS CITY','MO','64121-9468','USA','PO BOX 2194','RICHARD PRYOR SEMINARS PO BOX 2194'),
('1241328','INFINITY AND BEYOND','P.O. BOX 169','GASTONIA','NC','28053-0269','USA','','','','','','PO BOX 169',''),
('1259522','ZEEBO INC','PO BOX 169','GASSTONIA','NC','28053-0269','USA','','','','','','PO BOX 169',''),
('1255253','AT&T','PO Box 50221','Carol Stream','IL','60197','USA','','','','','','PO Box 50221',''),
('1135513','AT&T','PO Box 50221','Carol Stream','IL','60197-5080','USA','','','','','','PO Box 50221',''),
('1119161','Machine Co, Inc','3306 N Thorne Blvd','Chattanooga','TN','','','PO BOX 5301','CHATTANOOGA','TN','37406','USA','3306 N Thorne Blvd','PO BOX 5301'),
('1176587','Topsy Turvy','365 Welmington Road','Chicago','IL','60606','USA','','','','','','365 Welmington Rd',''),
('2156671','Topsy Turvvy, Inc.','P.O. Box 55217','Columbus','OH','43081','','365 Welmington Road','Chicago','IL','60606','USA','','365 Welmington Rd');

-- Pass 1: link each supplier to the cleaned address matching its primary address.
UPDATE s
SET CleanAddressID = c.ID
FROM [sample_data] AS s
INNER JOIN [CleanedAddresses] AS c
    ON c.[Address] = s.[cleanAddress];

-- Pass 2: for suppliers still unlinked, fall back to the cleaned remit address.
UPDATE s
SET CleanAddressID = c.ID
FROM [sample_data] AS s
INNER JOIN [CleanedAddresses] AS c
    ON c.[Address] = s.[cleanRemit_Address]
WHERE s.CleanAddressID IS NULL;

-- Address-based duplicates: suppliers whose CleanAddressID is shared by more than one
-- supplier, ordered so that each duplicate group is contiguous.
SELECT *
FROM [sample_data] AS s
WHERE s.CleanAddressID IS NOT NULL
    AND s.CleanAddressID IN (
        SELECT s2.CleanAddressID
        FROM [sample_data] AS s2
        GROUP BY s2.CleanAddressID
        HAVING COUNT(*) > 1
    )
ORDER BY s.CleanAddressID;
首先,您可以在 ON 子句
中同时写出两个条件,从而避免使用开销较大的 UNION
,如下所示:
-- Combined join condition (fragment): a row matches when [NewAdd] equals the cleaned
-- address or the cleaned remit address, provided the corresponding raw address column
-- is neither empty nor NULL.
ON
(n.[NewAdd] = pv.[cleanAddress] AND ( [Address] <> '' AND [Address] IS NOT NULL ) )
or
(n.[NewAdd] = pv.[cleanRemit_Address] AND ( [Remit_Address] <> '' AND [Remit_Address] IS NOT NULL) )
然后您可以借助 row_number() over() 窗口函数去除每个 supplier_no 的重复行。
然后根据它们的地址对这些行进行排名
-- Ranking expression (fragment): rows are ranked by the address they matched on.
-- The emptiness test must be <> '' ; a "< ''" comparison is never true for a string.
dense_rank()over(order by case when(n.[NewAdd] = pv.[cleanAddress] AND ( [Address] <> '' AND [Address] IS NOT NULL ) )
then cleanaddress else Remit_Address end)
但是我不明白您是如何得出包含四行的第 4 组的。
查询:
-- Lists suppliers whose cleaned address or cleaned remit address appears in
-- [dupe_addresses], one row per supplier, with a rank that groups equal addresses.
--   rn  : numbers the join rows of each supplier; rn = 1 keeps a single row per supplier
--         (a supplier can match more than one [NewAdd]).
--   rnk : dense rank of the address the row matched on; equal addresses share a rank.
-- NOTE(review): the ELSE branch ranks by the raw [Remit_Address] rather than
-- [cleanRemit_Address], so a supplier matched only through its remit address can get a
-- different rank than the supplier it duplicates -- confirm whether that is intended.
WITH cte
AS
(
SELECT
    'Address Match' AS [Reason],
    pv.[Supplier_No],
    pv.[Name],
    pv.[Address],
    pv.[City],
    pv.[State],
    pv.[Zip],
    pv.[Country],
    pv.[Remit_Address],
    pv.[Remit_City],
    pv.[Remit_State],
    pv.[Remit_Zip],
    pv.[Remit_Country],
    -- [Address]/[Remit_Address] are constant within a supplier, so [NewAdd] is the
    -- tie-break that makes the surviving row deterministic.
    ROW_NUMBER() OVER (
        PARTITION BY pv.[Supplier_No]
        ORDER BY pv.[Address], pv.[Remit_Address], n.[NewAdd]
    ) AS rn,
    DENSE_RANK() OVER (
        ORDER BY CASE
            WHEN (n.[NewAdd] = pv.[cleanAddress] AND (pv.[Address] <> '' AND pv.[Address] IS NOT NULL))
                THEN pv.[cleanAddress]
            ELSE pv.[Remit_Address]
        END
    ) AS rnk
FROM [dupe_addresses] AS n
-- INNER JOIN: [dupe_addresses] rows without a matching supplier are not wanted.
INNER JOIN [sample_data] AS pv
    ON
    (n.[NewAdd] = pv.[cleanAddress] AND (pv.[Address] <> '' AND pv.[Address] IS NOT NULL))
    OR
    (n.[NewAdd] = pv.[cleanRemit_Address] AND (pv.[Remit_Address] <> '' AND pv.[Remit_Address] IS NOT NULL))
WHERE pv.[Supplier_No] <> ''
)
SELECT * FROM cte WHERE rn = 1
ORDER BY rnk DESC
Reason
Supplier_No
Name
Address
City
State
Zip
Country
Remit_Address
Remit_City
Remit_State
Remit_Zip
Remit_Country
rn
rnk
Address Match
1135513
AT&T
PO Box 50221
Carol Stream
IL
60197-5080
USA
1
7
Address Match
1255253
AT&T
PO Box 50221
Carol Stream
IL
60197
USA
1
7
Address Match
1259522
ZEEBO INC
PO BOX 169
GASSTONIA
NC
28053-0269
USA
1
5
Address Match
1241328
INFINITY AND BEYOND
P.O. BOX 169
GASTONIA
NC
28053-0269
USA
1
5
Address Match
2156671
Topsy Turvvy, Inc.
P.O. Box 55217
Columbus
OH
43081
365 Welmington Road
Chicago
IL
60606
USA
1
4
Address Match
1176587
Topsy Turvy
365 Welmington Road
Chicago
IL
60606
USA
1
3
Address Match
1230895
NERO CO
28 North US State Highway 99
Osceola
AR
72370
USA
PO Box 204
Cape Girardeau
MO
63702-2045
USA
1
2
Address Match
1243782
NERO CO
28 North US State Highway 99
Osceola
AR
72370
USA
PO Box 204
Cape Girardeau
MO
63702-2045
USA
1
2
Address Match
1218409
SouthWestern Medical
100 West Balor Ave
Osceola
AR
72370
USA
SouthWestern Medical100 W Balor Ave
Osceola
AR
72370
USA
1
1
Address Match
1243789
SouthWestern Medical
100 West Balor Ave
Osceola
AR
72370
USA
SouthWestern Medical100 West Balor Ave
Osceola
AR
72370
USA
1
1
db<>fiddle here
长话短说。我有数据试图通过地址识别重复记录。地址可以在 [Address]
或 [Remit_Address]
字段上匹配。
我使用 JOIN 和 UNION 来获取记录,但我需要匹配的记录在结果中彼此相邻显示。
我无法按任何现有字段排序,因此典型的 'ORDER BY' 不起作用。我按照某人的建议查看了 RANK
,看起来它可能有效,但我不知道如何进行分区,我认为该命令给我带来了与 ORDER BY
相同的问题。
如果 RANK 不是最佳选择,我愿意接受其他想法。最终目标是以某种方式对匹配的记录进行分组。
- SSMS 18
- SQL Server 2019
设置如下:
-- Output table: receives the duplicate-candidate rows found by the address-match query.
-- [id] is an auto-incrementing surrogate key.
CREATE TABLE [dupecheck] (
    [id] int identity(1, 1),
    [Data Area] varchar(255),
    [Supplier_No] varchar(255),
    [Name] varchar(255),
    [Address] varchar(255),
    [City] varchar(255),
    [State] varchar(255),
    [Zip] varchar(255),
    [Country] varchar(255),
    [Remit_Address] varchar(255),
    [Remit_City] varchar(255),
    [Remit_State] varchar(255),
    [Remit_Zip] varchar(255),
    [Remit_Country] varchar(255)
);

-- Supplier master sample. [cleanAddress] / [cleanRemit_Address] hold the normalized
-- forms of [Address] / [Remit_Address] that the duplicate matching compares on.
CREATE TABLE [sample_data] (
    [Supplier_No] varchar(255),
    [Name] varchar(255),
    [Address] varchar(255),
    [City] varchar(255),
    [State] varchar(255),
    [Zip] varchar(255),
    [Country] varchar(255),
    [Remit_Address] varchar(255),
    [Remit_City] varchar(255),
    [Remit_State] varchar(255),
    [Remit_Zip] varchar(255),
    [Remit_Country] varchar(255),
    [cleanAddress] varchar(255),
    [cleanRemit_Address] varchar(255),
    CONSTRAINT [suppliers_pk] PRIMARY KEY ([Supplier_No])
);

-- Explicit column list so the load does not depend on the table's physical column order.
INSERT INTO [sample_data] (
    [Supplier_No], [Name], [Address], [City], [State], [Zip], [Country],
    [Remit_Address], [Remit_City], [Remit_State], [Remit_Zip], [Remit_Country],
    [cleanAddress], [cleanRemit_Address]
)
VALUES
('1039104','Geez Companies','100 Aero Hudson Rd','Streetsboro','OH','44241','','100 Aero Hudson Road','Streetsboro','OH','44241','USA','100 Aero Hudson Rd','100 Aero Hudson Rd'),
('1218409','SouthWestern Medical','100 West Balor Ave','Osceola','AR','72370','USA','SouthWestern Medical100 W Balor Ave','Osceola','AR','72370','USA','100 W Balor Ave','SouthWestern Medical100 W Balor Ave'),
('1243789','SouthWestern Medical','100 West Balor Ave','Osceola','AR','72370','USA','SouthWestern Medical100 West Balor Ave','Osceola','AR','72370','USA','100 W Balor Ave','SouthWestern Medical100 W Balor Ave'),
('1243636','SIRI SYSTEMS','15 BRAD ROAD','WEXFORD','PA','15090','','','','','','','15 BRAD RD',''),
('1152482','FLEETWOOD MACK','22 WINDSOCK CT','ADDISON','IL','60101','','PO BOX 951','CHICAGO','IL','60694-5124','','22 WINDSOCK CT','PO BOX 951'),
('1224483','Aerospace Junction','211500 Communicate Ave','Mingo Junction','OH','43939','USA','P O Box 99','Mingo Junction','OH','43939','USA','211500 Communicate Ave','PO Box 99'),
('1243397','Squeezy Felt','SCHREIBER DIST','NEW KENSINGTON','PA','15068','','','','','','','SCHREIBER DIST',''),
('1230895','NERO CO','28 North US State Highway 99','Osceola','AR','72370','USA','PO Box 204','Cape Girardeau','MO','63702-2045','USA','28 N US State Hwy 99','PO Box 204'),
('1243782','NERO CO','28 North US State Highway 99','Osceola','AR','72370','USA','PO Box 204','Cape Girardeau','MO','63702-2045','USA','28 N US State Hwy 99','PO Box 204'),
('1135880','RICHARD PRYOR SEMINARS','PO BOX 2194','KANSAS CITY','MO','64121-9468','USA','RICHARD PRYOR SEMINARS P O BOX 2194','KANSAS CITY','MO','64121-9468','USA','PO BOX 2194','RICHARD PRYOR SEMINARS PO BOX 2194'),
('1241328','INFINITY AND BEYOND','P.O. BOX 169','GASTONIA','NC','28053-0269','USA','','','','','','PO BOX 169',''),
('1259522','ZEEBO INC','PO BOX 169','GASSTONIA','NC','28053-0269','USA','','','','','','PO BOX 169',''),
('1255253','AT&T','PO Box 50221','Carol Stream','IL','60197','USA','','','','','','PO Box 50221',''),
('1135513','AT&T','PO Box 50221','Carol Stream','IL','60197-5080','USA','','','','','','PO Box 50221',''),
('1119161','Machine Co, Inc','3306 N Thorne Blvd','Chattanooga','TN','','','PO BOX 5301','CHATTANOOGA','TN','37406','USA','3306 N Thorne Blvd','PO BOX 5301'),
('1176587','Topsy Turvy','365 Welmington Road','Chicago','IL','60606','USA','','','','','','365 Welmington Rd',''),
('2156671','Topsy Turvvy, Inc.','P.O. Box 55217','Columbus','OH','43081','','365 Welmington Road','Chicago','IL','60606','USA','','365 Welmington Rd');

-- Cleaned addresses that occur on more than one supplier record (the match keys).
CREATE TABLE [dupe_addresses] (
    [NewAdd] [varchar](255) NULL
);

INSERT INTO [dupe_addresses] ([NewAdd])
VALUES
('100 W Balor Ave'),
('28 N US State Hwy 99'),
('365 Welmington Rd'),
('PO BOX 169'),
('PO Box 204'),
('PO Box 50221'),
('SouthWestern Medical100 W Balor Ave');
现有查询:
-- Load every supplier whose cleaned address or cleaned remit address appears in
-- [dupe_addresses] into [dupecheck].
-- UNION (not UNION ALL) is deliberate: a supplier that matches on both columns would
-- otherwise be inserted twice.
INSERT INTO [dupecheck] (
    [Data Area],
    [Supplier_No],
    [Name],
    [Address],
    [City],
    [State],
    [Zip],
    [Country],
    [Remit_Address],
    [Remit_City],
    [Remit_State],
    [Remit_Zip],
    [Remit_Country]
)
SELECT
    q1.[Reason],
    q1.[Supplier_No],
    q1.[Name],
    q1.[Address],
    q1.[City],
    q1.[State],
    q1.[Zip],
    q1.[Country],
    q1.[Remit_Address],
    q1.[Remit_City],
    q1.[Remit_State],
    q1.[Remit_Zip],
    q1.[Remit_Country]
FROM (
    -- Match on the primary address.
    SELECT
        'Address Match' AS [Reason],
        pv.[Supplier_No],
        pv.[Name],
        pv.[Address],
        pv.[City],
        pv.[State],
        pv.[Zip],
        pv.[Country],
        pv.[Remit_Address],
        pv.[Remit_City],
        pv.[Remit_State],
        pv.[Remit_Zip],
        pv.[Remit_Country]
    FROM [dupe_addresses] AS n
    -- INNER JOIN: unmatched [dupe_addresses] rows carry no supplier and are not wanted.
    INNER JOIN [sample_data] AS pv
        ON n.[NewAdd] = pv.[cleanAddress]
       AND pv.[Address] <> ''
       AND pv.[Address] IS NOT NULL
    WHERE pv.[Supplier_No] <> ''

    UNION

    -- Match on the remit address.
    SELECT
        'Address Match' AS [Reason],
        pv.[Supplier_No],
        pv.[Name],
        pv.[Address],
        pv.[City],
        pv.[State],
        pv.[Zip],
        pv.[Country],
        pv.[Remit_Address],
        pv.[Remit_City],
        pv.[Remit_State],
        pv.[Remit_Zip],
        pv.[Remit_Country]
    FROM [dupe_addresses] AS n
    INNER JOIN [sample_data] AS pv
        ON n.[NewAdd] = pv.[cleanRemit_Address]
       AND pv.[Remit_Address] <> ''
       AND pv.[Remit_Address] IS NOT NULL
    WHERE pv.[Supplier_No] <> ''
) AS q1;
当前结果:
Reason Supplier_No Name Address City State Zip Country Remit_Address Remit_City Remit_State Remit_Zip Remit_Country
Address Match 1135513 AT&T PO Box 50221 Carol Stream IL 60197-5080 USA
Address Match 1176587 Topsy Turvy 365 Welmington Road Chicago IL 60606 USA
Address Match 1218409 SouthWestern Medical 100 West Balor Ave Osceola AR 72370 USA SouthWestern Medical100 W Balor Ave Osceola AR 72370 USA
Address Match 1230895 NERO CO 28 North US State Highway 99 Osceola AR 72370 USA PO Box 204 Cape Girardeau MO 63702-2045 USA
Address Match 1241328 INFINITY AND BEYOND P.O. BOX 169 GASTONIA NC 28053-0269 USA
Address Match 1243782 NERO CO 28 North US State Highway 99 Osceola AR 72370 USA PO Box 204 Cape Girardeau MO 63702-2045 USA
Address Match 1243789 SouthWestern Medical 100 West Balor Ave Osceola AR 72370 USA SouthWestern Medical100 West Balor Ave Osceola AR 72370 USA
Address Match 1255253 AT&T PO Box 50221 Carol Stream IL 60197 USA
Address Match 1259522 ZEEBO INC PO BOX 169 GASSTONIA NC 28053-0269 USA
Address Match 2156671 Topsy Turvvy, Inc. P.O. Box 55217 Columbus OH 43081 365 Welmington Road Chicago IL 60606 USA
期望的结果:
Reason Supplier_No Name Address City State Zip Country Remit_Address Remit_City Remit_State Remit_Zip Remit_Country rank
Address Match 1135513 AT&T PO Box 50221 Carol Stream IL 60197-5080 USA 1
Address Match 1255253 AT&T PO Box 50221 Carol Stream IL 60197 USA 1
Address Match 1241328 INFINITY AND BEYOND P.O. BOX 169 GASTONIA NC 28053-0269 USA 2
Address Match 1259522 ZEEBO INC PO BOX 169 GASSTONIA NC 28053-0269 USA 2
Address Match 1243782 NERO CO 28 North US State Highway 99 Osceola AR 72370 USA PO Box 204 Cape Girardeau MO 63702-2045 USA 3
Address Match 1230895 NERO CO 28 North US State Highway 99 Osceola AR 72370 USA PO Box 204 Cape Girardeau MO 63702-2045 USA 3
Address Match 1218409 SouthWestern Medical 100 West Balor Ave Osceola AR 72370 USA SouthWestern Medical100 W Balor Ave Osceola AR 72370 USA 4
Address Match 1243789 SouthWestern Medical 100 West Balor Ave Osceola AR 72370 USA SouthWestern Medical100 West Balor Ave Osceola AR 72370 USA 4
Address Match 2156671 Topsy Turvvy, Inc. P.O. Box 55217 Columbus OH 43081 365 Welmington Road Chicago IL 60606 USA 5
Address Match 1176587 Topsy Turvy 365 Welmington Road Chicago IL 60606 USA 5
此查询创建了所需的结果。
-- One row per supplier, tagged with a group key (grp) so matched suppliers sort together.
-- A supplier can match several dupe_addresses rows; RANK() partitioned by supplier
-- orders those matches by NewAdd, and rnk = 1 keeps only the lowest-sorting one.
SELECT ranked.*
FROM (
    SELECT
        da.NewAdd AS grp,
        sd.*,
        RANK() OVER (PARTITION BY sd.Supplier_No ORDER BY da.NewAdd) AS rnk
    FROM dupe_addresses AS da
    INNER JOIN sample_data AS sd
        -- Match on either the cleaned address or the cleaned remit address.
        ON da.NewAdd IN (sd.cleanAddress, sd.cleanRemit_Address)
) AS ranked
WHERE ranked.rnk = 1
ORDER BY ranked.grp
删除了 UNION,改为通过 OR
组合两个连接条件,
因此满足任一条件(或同时满足两个条件)的记录都会被找到。
rank()
用于计算结果集分区内每一行的排名。
partition by Supplier_No
用于识别重复记录。
最后用where rnk = 1
查看不重复的记录组
我确定还有更简短、更简洁的方法可以做到这一点,不过趁着等咖啡煮好的这段时间,下面的查询应该能实现你想要的效果。
-- Return every supplier whose cleaned address or cleaned remit address is listed in
-- dupe_addresses, together with the address it matched on (MatchedAddress), sorted so
-- that suppliers sharing an address appear next to each other.
-- The cleaned address takes precedence over the cleaned remit address.
-- EXISTS is used instead of a scalar subquery because dupe_addresses has no uniqueness
-- constraint: a repeated NewAdd would make a scalar subquery fail with
-- "Subquery returned more than 1 value".
SELECT
    s1.*,
    CASE
        WHEN EXISTS (
            SELECT 1
            FROM dupe_addresses AS s2
            WHERE s2.NewAdd = s1.cleanAddress
        ) THEN s1.cleanAddress
        WHEN EXISTS (
            SELECT 1
            FROM dupe_addresses AS s2
            WHERE s2.NewAdd = s1.cleanRemit_Address
        ) THEN s1.cleanRemit_Address
    END AS MatchedAddress
FROM sample_data AS s1
WHERE EXISTS (
    SELECT 1
    FROM dupe_addresses AS s2
    WHERE s2.NewAdd = s1.cleanAddress
       OR s2.NewAdd = s1.cleanRemit_Address
)
-- Supplier_No is a tie-break so the order inside each address group is deterministic.
ORDER BY MatchedAddress, s1.Supplier_No;
编辑:我又仔细考虑了一下。既然你说还有更多的匹配标准,我建议换一种做法,这样能更好地实现你想要的效果。基本思路是:在你的供应商/数据表上新增一个 CleanAddressID 列,然后把所有清理过的地址放入一张单独的“已清理地址”表中。
完成后,您就可以更新 CleanAddressID,并且可以使用比现在更多的匹配条件。
以下代码应该可以帮到您,最终查询将返回所有基于地址的重复项。
随着时间的推移,您可以用类似的方式添加其他匹配规则,进而计算一个重复度评分。我知道这超出了您问题的范围,但还是想提一下,因为它说明了这种更灵活的方案为什么更容易扩展。
我保留了上面的解决方案,因为正如你所说,它实现了你想要的效果;你可以慢慢消化它,但它比较混乱,而且随着匹配标准的增加会变得更加混乱。
-- Lookup table of cleaned addresses; each distinct address gets a surrogate ID.
CREATE TABLE [CleanedAddresses] (
    ID INT IDENTITY(1, 1)
    ,[Address] [varchar](255) NOT NULL UNIQUE
    ,PRIMARY KEY (ID)
);

INSERT INTO [CleanedAddresses] ([Address])
VALUES ('100 W Balor Ave')
    ,('28 N US State Hwy 99')
    ,('365 Welmington Rd')
    ,('PO BOX 169')
    ,('PO Box 204')
    ,('PO Box 50221')
    ,('SouthWestern Medical100 W Balor Ave');

-- Supplier table extended with CleanAddressID, a nullable reference to the cleaned
-- address the supplier was matched to (NULL = no match).
CREATE TABLE [sample_data] (
    [Supplier_No] VARCHAR(255)
    ,[Name] VARCHAR(255)
    ,[Address] VARCHAR(255)
    ,[City] VARCHAR(255)
    ,[State] VARCHAR(255)
    ,[Zip] VARCHAR(255)
    ,[Country] VARCHAR(255)
    ,[Remit_Address] VARCHAR(255)
    ,[Remit_City] VARCHAR(255)
    ,[Remit_State] VARCHAR(255)
    ,[Remit_Zip] VARCHAR(255)
    ,[Remit_Country] VARCHAR(255)
    ,[cleanAddress] VARCHAR(255)
    ,[cleanRemit_Address] VARCHAR(255)
    ,CleanAddressID INT NULL
    -- Table-level constraints: each one is its own comma-separated element.
    ,CONSTRAINT [suppliers_pk] PRIMARY KEY ([Supplier_No])
    ,FOREIGN KEY (CleanAddressID) REFERENCES [CleanedAddresses](ID)
);

-- CleanAddressID is intentionally omitted here; it is filled in by the UPDATEs below.
INSERT INTO [sample_data] (
    [Supplier_No], [Name], [Address], [City], [State], [Zip], [Country],
    [Remit_Address], [Remit_City], [Remit_State], [Remit_Zip], [Remit_Country],
    [cleanAddress], [cleanRemit_Address]
)
VALUES
('1039104','Geez Companies','100 Aero Hudson Rd','Streetsboro','OH','44241','','100 Aero Hudson Road','Streetsboro','OH','44241','USA','100 Aero Hudson Rd','100 Aero Hudson Rd'),
('1218409','SouthWestern Medical','100 West Balor Ave','Osceola','AR','72370','USA','SouthWestern Medical100 W Balor Ave','Osceola','AR','72370','USA','100 W Balor Ave','SouthWestern Medical100 W Balor Ave'),
('1243789','SouthWestern Medical','100 West Balor Ave','Osceola','AR','72370','USA','SouthWestern Medical100 West Balor Ave','Osceola','AR','72370','USA','100 W Balor Ave','SouthWestern Medical100 W Balor Ave'),
('1243636','SIRI SYSTEMS','15 BRAD ROAD','WEXFORD','PA','15090','','','','','','','15 BRAD RD',''),
('1152482','FLEETWOOD MACK','22 WINDSOCK CT','ADDISON','IL','60101','','PO BOX 951','CHICAGO','IL','60694-5124','','22 WINDSOCK CT','PO BOX 951'),
('1224483','Aerospace Junction','211500 Communicate Ave','Mingo Junction','OH','43939','USA','P O Box 99','Mingo Junction','OH','43939','USA','211500 Communicate Ave','PO Box 99'),
('1243397','Squeezy Felt','SCHREIBER DIST','NEW KENSINGTON','PA','15068','','','','','','','SCHREIBER DIST',''),
('1230895','NERO CO','28 North US State Highway 99','Osceola','AR','72370','USA','PO Box 204','Cape Girardeau','MO','63702-2045','USA','28 N US State Hwy 99','PO Box 204'),
('1243782','NERO CO','28 North US State Highway 99','Osceola','AR','72370','USA','PO Box 204','Cape Girardeau','MO','63702-2045','USA','28 N US State Hwy 99','PO Box 204'),
('1135880','RICHARD PRYOR SEMINARS','PO BOX 2194','KANSAS CITY','MO','64121-9468','USA','RICHARD PRYOR SEMINARS P O BOX 2194','KANSAS CITY','MO','64121-9468','USA','PO BOX 2194','RICHARD PRYOR SEMINARS PO BOX 2194'),
('1241328','INFINITY AND BEYOND','P.O. BOX 169','GASTONIA','NC','28053-0269','USA','','','','','','PO BOX 169',''),
('1259522','ZEEBO INC','PO BOX 169','GASSTONIA','NC','28053-0269','USA','','','','','','PO BOX 169',''),
('1255253','AT&T','PO Box 50221','Carol Stream','IL','60197','USA','','','','','','PO Box 50221',''),
('1135513','AT&T','PO Box 50221','Carol Stream','IL','60197-5080','USA','','','','','','PO Box 50221',''),
('1119161','Machine Co, Inc','3306 N Thorne Blvd','Chattanooga','TN','','','PO BOX 5301','CHATTANOOGA','TN','37406','USA','3306 N Thorne Blvd','PO BOX 5301'),
('1176587','Topsy Turvy','365 Welmington Road','Chicago','IL','60606','USA','','','','','','365 Welmington Rd',''),
('2156671','Topsy Turvvy, Inc.','P.O. Box 55217','Columbus','OH','43081','','365 Welmington Road','Chicago','IL','60606','USA','','365 Welmington Rd');

-- Pass 1: link each supplier to the cleaned address matching its primary address.
UPDATE s
SET CleanAddressID = c.ID
FROM [sample_data] AS s
INNER JOIN [CleanedAddresses] AS c
    ON c.[Address] = s.[cleanAddress];

-- Pass 2: for suppliers still unlinked, fall back to the cleaned remit address.
UPDATE s
SET CleanAddressID = c.ID
FROM [sample_data] AS s
INNER JOIN [CleanedAddresses] AS c
    ON c.[Address] = s.[cleanRemit_Address]
WHERE s.CleanAddressID IS NULL;

-- Address-based duplicates: suppliers whose CleanAddressID is shared by more than one
-- supplier, ordered so that each duplicate group is contiguous.
SELECT *
FROM [sample_data] AS s
WHERE s.CleanAddressID IS NOT NULL
    AND s.CleanAddressID IN (
        SELECT s2.CleanAddressID
        FROM [sample_data] AS s2
        GROUP BY s2.CleanAddressID
        HAVING COUNT(*) > 1
    )
ORDER BY s.CleanAddressID;
首先,您可以在 ON 子句
中同时写出两个条件,从而避免使用开销较大的 UNION
,如下所示:
-- Combined join condition (fragment): a row matches when [NewAdd] equals the cleaned
-- address or the cleaned remit address, provided the corresponding raw address column
-- is neither empty nor NULL.
ON
(n.[NewAdd] = pv.[cleanAddress] AND ( [Address] <> '' AND [Address] IS NOT NULL ) )
or
(n.[NewAdd] = pv.[cleanRemit_Address] AND ( [Remit_Address] <> '' AND [Remit_Address] IS NOT NULL) )
然后您可以借助 row_number() over() 窗口函数去除每个 supplier_no 的重复行。
然后根据它们的地址对这些行进行排名
-- Ranking expression (fragment): rows are ranked by the address they matched on.
-- The emptiness test must be <> '' ; a "< ''" comparison is never true for a string.
dense_rank()over(order by case when(n.[NewAdd] = pv.[cleanAddress] AND ( [Address] <> '' AND [Address] IS NOT NULL ) )
then cleanaddress else Remit_Address end)
但是我不明白您是如何得出包含四行的第 4 组的。
查询:
-- Lists suppliers whose cleaned address or cleaned remit address appears in
-- [dupe_addresses], one row per supplier, with a rank that groups equal addresses.
--   rn  : numbers the join rows of each supplier; rn = 1 keeps a single row per supplier
--         (a supplier can match more than one [NewAdd]).
--   rnk : dense rank of the address the row matched on; equal addresses share a rank.
-- NOTE(review): the ELSE branch ranks by the raw [Remit_Address] rather than
-- [cleanRemit_Address], so a supplier matched only through its remit address can get a
-- different rank than the supplier it duplicates -- confirm whether that is intended.
WITH cte
AS
(
SELECT
    'Address Match' AS [Reason],
    pv.[Supplier_No],
    pv.[Name],
    pv.[Address],
    pv.[City],
    pv.[State],
    pv.[Zip],
    pv.[Country],
    pv.[Remit_Address],
    pv.[Remit_City],
    pv.[Remit_State],
    pv.[Remit_Zip],
    pv.[Remit_Country],
    -- [Address]/[Remit_Address] are constant within a supplier, so [NewAdd] is the
    -- tie-break that makes the surviving row deterministic.
    ROW_NUMBER() OVER (
        PARTITION BY pv.[Supplier_No]
        ORDER BY pv.[Address], pv.[Remit_Address], n.[NewAdd]
    ) AS rn,
    DENSE_RANK() OVER (
        ORDER BY CASE
            WHEN (n.[NewAdd] = pv.[cleanAddress] AND (pv.[Address] <> '' AND pv.[Address] IS NOT NULL))
                THEN pv.[cleanAddress]
            ELSE pv.[Remit_Address]
        END
    ) AS rnk
FROM [dupe_addresses] AS n
-- INNER JOIN: [dupe_addresses] rows without a matching supplier are not wanted.
INNER JOIN [sample_data] AS pv
    ON
    (n.[NewAdd] = pv.[cleanAddress] AND (pv.[Address] <> '' AND pv.[Address] IS NOT NULL))
    OR
    (n.[NewAdd] = pv.[cleanRemit_Address] AND (pv.[Remit_Address] <> '' AND pv.[Remit_Address] IS NOT NULL))
WHERE pv.[Supplier_No] <> ''
)
SELECT * FROM cte WHERE rn = 1
ORDER BY rnk DESC
Reason | Supplier_No | Name | Address | City | State | Zip | Country | Remit_Address | Remit_City | Remit_State | Remit_Zip | Remit_Country | rn | rnk |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Address Match | 1135513 | AT&T | PO Box 50221 | Carol Stream | IL | 60197-5080 | USA | 1 | 7 | |||||
Address Match | 1255253 | AT&T | PO Box 50221 | Carol Stream | IL | 60197 | USA | 1 | 7 | |||||
Address Match | 1259522 | ZEEBO INC | PO BOX 169 | GASSTONIA | NC | 28053-0269 | USA | 1 | 5 | |||||
Address Match | 1241328 | INFINITY AND BEYOND | P.O. BOX 169 | GASTONIA | NC | 28053-0269 | USA | 1 | 5 | |||||
Address Match | 2156671 | Topsy Turvvy, Inc. | P.O. Box 55217 | Columbus | OH | 43081 | 365 Welmington Road | Chicago | IL | 60606 | USA | 1 | 4 | |
Address Match | 1176587 | Topsy Turvy | 365 Welmington Road | Chicago | IL | 60606 | USA | 1 | 3 | |||||
Address Match | 1230895 | NERO CO | 28 North US State Highway 99 | Osceola | AR | 72370 | USA | PO Box 204 | Cape Girardeau | MO | 63702-2045 | USA | 1 | 2 |
Address Match | 1243782 | NERO CO | 28 North US State Highway 99 | Osceola | AR | 72370 | USA | PO Box 204 | Cape Girardeau | MO | 63702-2045 | USA | 1 | 2 |
Address Match | 1218409 | SouthWestern Medical | 100 West Balor Ave | Osceola | AR | 72370 | USA | SouthWestern Medical100 W Balor Ave | Osceola | AR | 72370 | USA | 1 | 1 |
Address Match | 1243789 | SouthWestern Medical | 100 West Balor Ave | Osceola | AR | 72370 | USA | SouthWestern Medical100 West Balor Ave | Osceola | AR | 72370 | USA | 1 | 1 |
db