使用 Group By、Rank Row_Number 删除重复项
Duplicates removal using Group By, Rank, Row_Number
我有两个 table。一个是 CustomerOrders,另一个是 OrderCustomerRef - lookup table。
两个 table 都具有一对多关系 - 一个客户可能与多个订单相关联。
CustomerOrders table 有重复的客户(相同的 LName、FName、Email)。但是他们有不同的Cust_IDs.
我需要合并基本客户 table 中的所有重复联系人(一对一)。 (此 table 未在此处显示)。
第 1 步:
需要找出哪个 Cust_ID 应该合并到哪个对应的重复客户(相同的 LName、FName、Email)中。具有最新 Order_Date 的联系人应该胜过其对应的重复客户。VIP 客户除外——无论 Order_Date 如何,他们始终是获胜者。
第 2 步:
更新 OrderCustomerRef table:用获胜的 Cust_IDs 替换所有落败的重复 Cust_IDs。
第 3 步:
从基本客户 table 中删除所有落败的联系人(不在当前范围内,我会自己处理)。
-- Reset step: drop any temp tables left over from an earlier run in this
-- session so the script below can be executed repeatedly.
IF OBJECT_ID('tempdb..#table') IS NOT NULL
BEGIN
    DROP TABLE #table;
END;

IF OBJECT_ID('tempdb..#CustomerOrders') IS NOT NULL
BEGIN
    DROP TABLE #CustomerOrders;
END;

IF OBJECT_ID('tempdb..#OrderCustomerRef') IS NOT NULL
BEGIN
    DROP TABLE #OrderCustomerRef;
END;
-- Sample customer/order rows. The same person (same FName, LName and
-- [Customer_E-mail]) can appear under several Cust_IDs; those are the
-- duplicates to be merged.
CREATE TABLE #CustomerOrders
(
    [PK_ID]           INT          NOT NULL PRIMARY KEY IDENTITY(1,1),
    Cust_ID           INT          NOT NULL,
    LName             VARCHAR(100) NULL,
    FName             VARCHAR(100) NULL,
    [Customer_E-mail] VARCHAR(100) NULL,
    Order_Date        DATETIME     NULL,
    Customer_Source   VARCHAR(100) NULL,
    CustomerType      VARCHAR(100) NULL
);

-- The values below are written given name first ('John', 'Smith'), so the
-- column list names FName before LName. The original list had LName first,
-- which stored every first name in LName and every surname in FName.
INSERT INTO #CustomerOrders (Cust_ID, FName, LName, [Customer_E-mail], Order_Date, Customer_Source, CustomerType)
VALUES
    (1,  'John',   'Smith',     'JSmith@email.com',     '2018-11-10 01:40:55.150', 'XYZ Company', 'Regular'),
    (2,  'John',   'Smith',     'JSmith@email.com',     '2018-10-10 05:05:55.150', 'Internet',    'VIP'),
    (3,  'Adam',   'Burns',     'ABurns@email.com',     '2017-05-05 00:00:00.000', 'XYZ Company', 'Regular'),
    (3,  'Adam',   'Burns',     'ABurns@email.com',     '2017-05-05 00:00:00.000', 'XYZ Company', 'VIP'),
    (4,  'Adam',   'Burns',     'ABurns@email.com',     '2017-05-05 00:00:00.000', 'Internet',    'Regular'),
    (5,  'Adam',   'Burns',     'ABurns@email.com',     '2017-05-05 00:00:00.000', 'Internet',    'VIP'),
    (6,  'James',  'Snatcher',  'JSnatcher@email.com',  '2019-07-07 00:00:00.000', 'XYZ Company', 'Regular'),
    (7,  'James',  'Snatcher',  'JSnatcher@email.com',  '2019-07-07 00:00:00.000', 'Internet',    'Regular'),
    (9,  'Thomas', 'Johnson',   'TJohnson@email.com',   '2016-05-01 00:00:00.000', 'Internet',    'Regular'),
    (9,  'Thomas', 'Johnson',   'TJohnson@email.com',   '2015-04-01 00:00:00.000', 'Internet',    'Regular'),
    (10, 'Thomas', 'Johnson',   'TJohnson@email.com',   '2014-03-01 00:00:00.000', 'Internet',    'Regular'),
    (11, 'Thomas', 'Johnson',   'TJohnson@email.com',   '2013-02-01 00:00:00.000', 'XYZ Company', 'Regular'),
    (12, 'Peter',  'McDonald',  'PMcDonald@email.com',  '2013-02-01 00:00:00.000', 'XYZ Company', 'Regular'),
    (13, 'Jose',   'Mainster',  'JMainster@email.com',  '2013-02-01 00:00:00.000', 'Internet',    'Regular'),
    (14, 'Kevin',  'Digginton', 'KDigginton@email.com', '2013-02-01 00:00:00.000', 'Internet',    'Regular'),
    (14, 'Kevin',  'Digginton', 'KDigginton@email.com', '2015-09-03 00:00:00.000', 'Internet',    'Regular');
-- Lookup table: one row per order, pointing at the Cust_ID that placed it.
-- These Cust_ID values are the ones to be re-pointed at the winning duplicate.
CREATE TABLE #OrderCustomerRef
(
    Raw_PK    INT          NOT NULL PRIMARY KEY IDENTITY(1,1),
    OrderID   INT          NOT NULL,
    Cust_ID   INT          NULL,
    OrderType VARCHAR(100) NULL
);

INSERT INTO #OrderCustomerRef (OrderID, Cust_ID, OrderType)
VALUES
    (1,  1,  'Online'),
    (2,  2,  'Online'),
    (3,  3,  'Online'),
    (4,  3,  'Online'),
    (5,  4,  'In Store'),
    (6,  5,  'Online'),
    (7,  6,  'Online'),
    (8,  7,  'In Store'),
    (9,  9,  'Online'),
    (10, 9,  'Online'),
    (11, 10, 'In Store'),
    (12, 11, 'Online'),
    (13, 12, 'Online'),
    (14, 13, 'Online'),
    (15, 14, 'Online'),
    (16, 14, 'In Store');
-- Exploratory query: every #CustomerOrders row with five candidate rankings.
-- (To inspect the lookup table instead, run: SELECT * FROM #OrderCustomerRef)
SELECT
    co.[PK_ID],
    co.Cust_ID,
    co.LName,
    co.FName,
    co.[Customer_E-mail],
    co.Order_Date,
    co.Customer_Source,
    co.CustomerType,
    -- Newest order first, per person and source.
    RANK() OVER (
        PARTITION BY co.FName, co.LName, co.[Customer_E-mail], co.Customer_Source
        ORDER BY co.Order_Date DESC
    ) AS Rank_1,
    -- Oldest order first; CustomerType (descending) only breaks date ties.
    RANK() OVER (
        PARTITION BY co.FName, co.LName, co.[Customer_E-mail], co.Customer_Source
        ORDER BY co.Order_Date, co.CustomerType DESC
    ) AS Rank_CustType,
    -- Same ordering, but additionally split per Cust_ID.
    RANK() OVER (
        PARTITION BY co.Cust_ID, co.FName, co.LName, co.[Customer_E-mail], co.Customer_Source
        ORDER BY co.Order_Date, co.CustomerType DESC
    ) AS Rank_CustID,
    -- Newest order first, per person regardless of source.
    RANK() OVER (
        PARTITION BY co.FName, co.LName, co.[Customer_E-mail]
        ORDER BY co.Order_Date DESC
    ) AS Rank_2,
    -- Lowest Cust_ID first, per person.
    RANK() OVER (
        PARTITION BY co.FName, co.LName, co.[Customer_E-mail]
        ORDER BY co.Cust_ID
    ) AS Rank_3
FROM #CustomerOrders AS co;
所需的输出应如下所示:
*异常:
- 落败的客户 ID:1、3(本应获胜,但由于其重复的对应客户是 VIP,因此落败)
- 获胜的客户 ID:2、5(因为是 VIP,属于例外)
例如:#OrderCustomerRef 中所有 John Smith 的 Cust_ID 1 都应替换为 John Smith 的 Cust_ID 2,所有 Adam Burns 的 Cust_ID 3 都应替换为 Adam Burns 的 Cust_ID 5
一般规则:
- 落败的客户 ID:7、10、11、4
- 获胜的客户 ID:6、9、12、13、14
例如:#OrderCustomerRef 中所有值为 7 的 Cust_ID 应替换为 6,所有值为 10 的 Cust_ID 应替换为 9*
最终,#OrderCustomerRef table 中应该只剩下客户 ID 6、9、12、13、14、2、5
使用 Rank_CustType_1、column_1、column_2 我可以弄清楚第 1 步。
但是我在第 2 步中仍然遇到问题 - 更新 OrderCustomerRef table:所有失败的 Cust_IDs 应该替换为相应的重复获胜 Cust_IDs.
我试过下面的写法,但它仍然无法替换落败的 Cust_ID。
-- Asker's attempt (reported as not working): rank the customer rows into
-- #table, then try to rewrite #OrderCustomerRef.Cust_ID from those ranks.
--
-- Rank_CustType_1: Order_Date has no DESC, so it sorts ascending and rank 1 is
-- the EARLIEST order of each name/e-mail group; CustomerType DESC only breaks
-- ties between rows with the same Order_Date.
-- Rank_3: rows carrying the lowest Cust_ID of the group get rank 1.
SELECT *,
RANK() OVER (PARTITION BY FName, LName, [Customer_E-mail] ORDER BY Order_Date, CustomerType DESC) AS Rank_CustType_1,
RANK() OVER (PARTITION BY FName, LName, [Customer_E-mail] ORDER BY Cust_ID) AS Rank_3
INTO #table
FROM #CustomerOrders
-- cte: one row per (Cust_ID, FName, LName, e-mail) with the highest
-- Rank_CustType_1 seen for that Cust_ID. Both scalar subqueries are correlated
-- on a.Cust_ID = t.Cust_ID, so column_1 / column_2 can only ever hold this
-- row's own Cust_ID or NULL -- never the ID of another (winning) customer.
; with cte as (
select Cust_ID, FName, LName, [Customer_E-mail], max(t.Rank_CustType_1) as Rank_CustType_1
,(select distinct Cust_ID from #table a where a.Cust_ID = t.Cust_ID and Rank_3 = 1) column_1
,(select distinct Cust_ID from #table a where a.Cust_ID = t.Cust_ID and Rank_3 <> 1) column_2
from #table t
group by Cust_ID, FName, LName, [Customer_E-mail]
)
-- The join below is on b.Cust_ID = cte.Cust_ID and every THEN value
-- (b.Cust_ID, column_1, column_2) is that same Cust_ID whenever it is not NULL,
-- so no branch can assign a different customer's ID; this is why losing IDs
-- are never replaced. The CASE has no ELSE, so a row matching none of the
-- WHEN branches would have its Cust_ID set to NULL.
-- NOTE(review): ISNULL(<int>, '') relies on '' being implicitly converted to
-- the int type (presumably 0 in SQL Server) -- confirm before reusing.
update b
set Cust_ID = case
when b.Cust_ID = cte.Cust_ID and
b.Cust_ID = ISNULL(cte.column_1,'') and Rank_CustType_1 != 1 then b.Cust_ID
when b.Cust_ID = cte.Cust_ID and
b.Cust_ID = ISNULL(cte.column_2,'') and Rank_CustType_1 != 1 then cte.column_2
when b.Cust_ID = cte.Cust_ID and Rank_CustType_1 = 1 and cte.column_1 is null and cte.column_2 is not null then cte.column_2
when b.Cust_ID = cte.Cust_ID and Rank_CustType_1 = 1 and cte.column_1 is not null and cte.column_2 is null then cte.column_1
end
from #OrderCustomerRef b
inner join cte on b.Cust_ID = cte.Cust_ID;
-- Show the lookup table after the update.
select * from #OrderCustomerRef;
根据您提供的信息,我使用了以下 CTE 来显示您想要的结果:
-- Previews, for every row of #OrderCustomerRef, the winning Cust_ID of the
-- duplicate group (same fname, lname, [customer_e-mail]) that its current
-- Cust_ID belongs to. Swap the SELECT list for the commented-out UPDATE to
-- apply the change.
--
-- Winner priority inside a group: VIP rows first, then latest order_date,
-- then lowest cust_id.
-- Note on ties: the question expects 5 to beat 3 but 6 to beat 7. Both pairs
-- tie on customer type and order date, so no single cust_id direction yields
-- both; the ascending tie-break is kept, which makes 3 and 6 the winners.
--
-- The leading semicolon guards against an unterminated preceding statement.
;WITH DaCTE -- Ranks every customer row inside its duplicate group
AS (
    SELECT cust_ID
         , fname
         , lname
         , [customer_e-mail]
         , ROW_NUMBER() OVER (
               PARTITION BY fname, lname, [customer_e-mail]
               -- VIP priority is stated explicitly instead of relying on
               -- 'VIP' sorting after 'Regular' alphabetically.
               ORDER BY CASE WHEN customertype = 'VIP' THEN 0 ELSE 1 END
                      , order_date DESC
                      , cust_id
           ) AS RankYo
    FROM #customerorders
)
, NewSource -- One row per (original ID, winning ID) pair
AS (
    -- DISTINCT collapses customers that have several rows in #CustomerOrders
    -- (e.g. 3, 9, 14); without it each of their #OrderCustomerRef rows would
    -- be returned once per customer row by the final join.
    SELECT DISTINCT
           DaCTE.cust_ID AS NewCustomerID
         , co.cust_ID    AS OriginalCustomerID
         , co.fname
         , co.lname
         , co.[customer_e-mail]
    FROM DaCTE
    INNER JOIN #CustomerOrders AS co
        ON co.fname = DaCTE.fname
       AND co.lname = DaCTE.lname
       AND co.[customer_e-mail] = DaCTE.[customer_e-mail]
    WHERE DaCTE.RankYo = 1 -- keep only the winning row of each group
)
SELECT ocr.Raw_PK
     , ocr.OrderID
     , ocr.Cust_ID
     , ocr.OrderType
     , ns.NewCustomerID
     , ns.OriginalCustomerID
     , ns.fname
     , ns.lname
     , ns.[customer_e-mail]
/*UPDATE ocr --commented out so you can see the results before running update
SET ocr.Cust_ID = ns.NewCustomerID*/
FROM #OrderCustomerRef AS ocr
INNER JOIN NewSource AS ns
    ON ns.OriginalCustomerID = ocr.Cust_ID;
您可以使用 CTE(通用 Table 表达式)尝试此操作,如上述答案中 Antoine Hernandez 所解释的那样,您还可以使用 UNION 和 EXCEPT 运算符从 table 中删除重复项。
例如,使用 EXCEPT 运算符:
-- Returns the distinct rows of #customerorders. EXCEPT against an empty set
-- (WHERE 1 = 0) removes nothing by itself but applies set semantics, so rows
-- that are identical in every listed column collapse into one.
-- PK_ID is left out on purpose: it is an IDENTITY primary key, unique on every
-- row, so with SELECT * no two rows could ever compare equal and no duplicate
-- would be removed.
SELECT Cust_ID, LName, FName, [Customer_E-mail], Order_Date, Customer_Source, CustomerType
FROM #customerorders
EXCEPT
SELECT Cust_ID, LName, FName, [Customer_E-mail], Order_Date, Customer_Source, CustomerType
FROM #customerorders
WHERE 1 = 0;
例如,使用 UNION 运算符:
-- Returns the distinct rows of #customerorders. UNION (without ALL) with an
-- empty set (WHERE 1 = 0) adds nothing but de-duplicates the result, so rows
-- that are identical in every listed column collapse into one.
-- PK_ID is left out on purpose: it is an IDENTITY primary key, unique on every
-- row, so with SELECT * no two rows could ever compare equal and no duplicate
-- would be removed.
SELECT Cust_ID, LName, FName, [Customer_E-mail], Order_Date, Customer_Source, CustomerType
FROM #customerorders
UNION
SELECT Cust_ID, LName, FName, [Customer_E-mail], Order_Date, Customer_Source, CustomerType
FROM #customerorders
WHERE 1 = 0;
有关如何使用 CTE 删除重复项的更多信息,请遵循此 link:Remove Duplicates Using CTE
有关如何使用 UNION 和 EXCEPT 运算符删除重复项的更多信息,请遵循此 link:
Remove Duplicates Using UNION And EXCEPT Operator
我有两个 table。一个是 CustomerOrders,另一个是 OrderCustomerRef - lookup table。
两个 table 都具有一对多关系 - 一个客户可能与多个订单相关联。
CustomerOrders table 有重复的客户(相同的 LName、FName、Email)。但是他们有不同的Cust_IDs.
我需要合并基本客户 table 中的所有重复联系人(一对一)。 (此 table 未在此处显示)。
第 1 步:
需要找出哪个 Cust_ID 应该合并到哪个对应的重复客户(相同的 LName、FName、Email)中。具有最新 Order_Date 的联系人应该胜过其对应的重复客户。VIP 客户除外——无论 Order_Date 如何,他们始终是获胜者。
第 2 步: 更新 OrderCustomerRef table:用获胜的 Cust_IDs 替换所有落败的重复 Cust_IDs。
第 3 步: 从基本客户 table 中删除所有落败的联系人(不在当前范围内,我会自己处理)。
-- Reset step: drop any temp tables left over from an earlier run in this
-- session so the script below can be executed repeatedly.
IF OBJECT_ID('tempdb..#table') IS NOT NULL
BEGIN
    DROP TABLE #table;
END;

IF OBJECT_ID('tempdb..#CustomerOrders') IS NOT NULL
BEGIN
    DROP TABLE #CustomerOrders;
END;

IF OBJECT_ID('tempdb..#OrderCustomerRef') IS NOT NULL
BEGIN
    DROP TABLE #OrderCustomerRef;
END;
-- Sample customer/order rows. The same person (same FName, LName and
-- [Customer_E-mail]) can appear under several Cust_IDs; those are the
-- duplicates to be merged.
CREATE TABLE #CustomerOrders
(
    [PK_ID]           INT          NOT NULL PRIMARY KEY IDENTITY(1,1),
    Cust_ID           INT          NOT NULL,
    LName             VARCHAR(100) NULL,
    FName             VARCHAR(100) NULL,
    [Customer_E-mail] VARCHAR(100) NULL,
    Order_Date        DATETIME     NULL,
    Customer_Source   VARCHAR(100) NULL,
    CustomerType      VARCHAR(100) NULL
);

-- The values below are written given name first ('John', 'Smith'), so the
-- column list names FName before LName. The original list had LName first,
-- which stored every first name in LName and every surname in FName.
INSERT INTO #CustomerOrders (Cust_ID, FName, LName, [Customer_E-mail], Order_Date, Customer_Source, CustomerType)
VALUES
    (1,  'John',   'Smith',     'JSmith@email.com',     '2018-11-10 01:40:55.150', 'XYZ Company', 'Regular'),
    (2,  'John',   'Smith',     'JSmith@email.com',     '2018-10-10 05:05:55.150', 'Internet',    'VIP'),
    (3,  'Adam',   'Burns',     'ABurns@email.com',     '2017-05-05 00:00:00.000', 'XYZ Company', 'Regular'),
    (3,  'Adam',   'Burns',     'ABurns@email.com',     '2017-05-05 00:00:00.000', 'XYZ Company', 'VIP'),
    (4,  'Adam',   'Burns',     'ABurns@email.com',     '2017-05-05 00:00:00.000', 'Internet',    'Regular'),
    (5,  'Adam',   'Burns',     'ABurns@email.com',     '2017-05-05 00:00:00.000', 'Internet',    'VIP'),
    (6,  'James',  'Snatcher',  'JSnatcher@email.com',  '2019-07-07 00:00:00.000', 'XYZ Company', 'Regular'),
    (7,  'James',  'Snatcher',  'JSnatcher@email.com',  '2019-07-07 00:00:00.000', 'Internet',    'Regular'),
    (9,  'Thomas', 'Johnson',   'TJohnson@email.com',   '2016-05-01 00:00:00.000', 'Internet',    'Regular'),
    (9,  'Thomas', 'Johnson',   'TJohnson@email.com',   '2015-04-01 00:00:00.000', 'Internet',    'Regular'),
    (10, 'Thomas', 'Johnson',   'TJohnson@email.com',   '2014-03-01 00:00:00.000', 'Internet',    'Regular'),
    (11, 'Thomas', 'Johnson',   'TJohnson@email.com',   '2013-02-01 00:00:00.000', 'XYZ Company', 'Regular'),
    (12, 'Peter',  'McDonald',  'PMcDonald@email.com',  '2013-02-01 00:00:00.000', 'XYZ Company', 'Regular'),
    (13, 'Jose',   'Mainster',  'JMainster@email.com',  '2013-02-01 00:00:00.000', 'Internet',    'Regular'),
    (14, 'Kevin',  'Digginton', 'KDigginton@email.com', '2013-02-01 00:00:00.000', 'Internet',    'Regular'),
    (14, 'Kevin',  'Digginton', 'KDigginton@email.com', '2015-09-03 00:00:00.000', 'Internet',    'Regular');
-- Lookup table: one row per order, pointing at the Cust_ID that placed it.
-- These Cust_ID values are the ones to be re-pointed at the winning duplicate.
CREATE TABLE #OrderCustomerRef
(
    Raw_PK    INT          NOT NULL PRIMARY KEY IDENTITY(1,1),
    OrderID   INT          NOT NULL,
    Cust_ID   INT          NULL,
    OrderType VARCHAR(100) NULL
);

INSERT INTO #OrderCustomerRef (OrderID, Cust_ID, OrderType)
VALUES
    (1,  1,  'Online'),
    (2,  2,  'Online'),
    (3,  3,  'Online'),
    (4,  3,  'Online'),
    (5,  4,  'In Store'),
    (6,  5,  'Online'),
    (7,  6,  'Online'),
    (8,  7,  'In Store'),
    (9,  9,  'Online'),
    (10, 9,  'Online'),
    (11, 10, 'In Store'),
    (12, 11, 'Online'),
    (13, 12, 'Online'),
    (14, 13, 'Online'),
    (15, 14, 'Online'),
    (16, 14, 'In Store');
-- Exploratory query: every #CustomerOrders row with five candidate rankings.
-- (To inspect the lookup table instead, run: SELECT * FROM #OrderCustomerRef)
SELECT
    co.[PK_ID],
    co.Cust_ID,
    co.LName,
    co.FName,
    co.[Customer_E-mail],
    co.Order_Date,
    co.Customer_Source,
    co.CustomerType,
    -- Newest order first, per person and source.
    RANK() OVER (
        PARTITION BY co.FName, co.LName, co.[Customer_E-mail], co.Customer_Source
        ORDER BY co.Order_Date DESC
    ) AS Rank_1,
    -- Oldest order first; CustomerType (descending) only breaks date ties.
    RANK() OVER (
        PARTITION BY co.FName, co.LName, co.[Customer_E-mail], co.Customer_Source
        ORDER BY co.Order_Date, co.CustomerType DESC
    ) AS Rank_CustType,
    -- Same ordering, but additionally split per Cust_ID.
    RANK() OVER (
        PARTITION BY co.Cust_ID, co.FName, co.LName, co.[Customer_E-mail], co.Customer_Source
        ORDER BY co.Order_Date, co.CustomerType DESC
    ) AS Rank_CustID,
    -- Newest order first, per person regardless of source.
    RANK() OVER (
        PARTITION BY co.FName, co.LName, co.[Customer_E-mail]
        ORDER BY co.Order_Date DESC
    ) AS Rank_2,
    -- Lowest Cust_ID first, per person.
    RANK() OVER (
        PARTITION BY co.FName, co.LName, co.[Customer_E-mail]
        ORDER BY co.Cust_ID
    ) AS Rank_3
FROM #CustomerOrders AS co;
所需的输出应如下所示:
*异常:
- 落败的客户 ID:1、3(本应获胜,但由于其重复的对应客户是 VIP,因此落败)
- 获胜的客户 ID:2、5(因为是 VIP,属于例外)
例如:#OrderCustomerRef 中所有 John Smith 的 Cust_ID 1 都应替换为 John Smith 的 Cust_ID 2,所有 Adam Burns 的 Cust_ID 3 都应替换为 Adam Burns 的 Cust_ID 5
一般规则:
- 落败的客户 ID:7、10、11、4
- 获胜的客户 ID:6、9、12、13、14
例如:#OrderCustomerRef 中所有值为 7 的 Cust_ID 应替换为 6,所有值为 10 的 Cust_ID 应替换为 9*
最终,#OrderCustomerRef table 中应该只剩下客户 ID 6、9、12、13、14、2、5。
使用 Rank_CustType_1、column_1、column_2 我可以弄清楚第 1 步。但是我在第 2 步中仍然遇到问题——更新 OrderCustomerRef table:所有落败的 Cust_IDs 应该替换为相应的获胜的重复 Cust_IDs。
我试过下面的写法,但它仍然无法替换落败的 Cust_ID。
-- Asker's attempt (reported as not working): rank the customer rows into
-- #table, then try to rewrite #OrderCustomerRef.Cust_ID from those ranks.
--
-- Rank_CustType_1: Order_Date has no DESC, so it sorts ascending and rank 1 is
-- the EARLIEST order of each name/e-mail group; CustomerType DESC only breaks
-- ties between rows with the same Order_Date.
-- Rank_3: rows carrying the lowest Cust_ID of the group get rank 1.
SELECT *,
RANK() OVER (PARTITION BY FName, LName, [Customer_E-mail] ORDER BY Order_Date, CustomerType DESC) AS Rank_CustType_1,
RANK() OVER (PARTITION BY FName, LName, [Customer_E-mail] ORDER BY Cust_ID) AS Rank_3
INTO #table
FROM #CustomerOrders
-- cte: one row per (Cust_ID, FName, LName, e-mail) with the highest
-- Rank_CustType_1 seen for that Cust_ID. Both scalar subqueries are correlated
-- on a.Cust_ID = t.Cust_ID, so column_1 / column_2 can only ever hold this
-- row's own Cust_ID or NULL -- never the ID of another (winning) customer.
; with cte as (
select Cust_ID, FName, LName, [Customer_E-mail], max(t.Rank_CustType_1) as Rank_CustType_1
,(select distinct Cust_ID from #table a where a.Cust_ID = t.Cust_ID and Rank_3 = 1) column_1
,(select distinct Cust_ID from #table a where a.Cust_ID = t.Cust_ID and Rank_3 <> 1) column_2
from #table t
group by Cust_ID, FName, LName, [Customer_E-mail]
)
-- The join below is on b.Cust_ID = cte.Cust_ID and every THEN value
-- (b.Cust_ID, column_1, column_2) is that same Cust_ID whenever it is not NULL,
-- so no branch can assign a different customer's ID; this is why losing IDs
-- are never replaced. The CASE has no ELSE, so a row matching none of the
-- WHEN branches would have its Cust_ID set to NULL.
-- NOTE(review): ISNULL(<int>, '') relies on '' being implicitly converted to
-- the int type (presumably 0 in SQL Server) -- confirm before reusing.
update b
set Cust_ID = case
when b.Cust_ID = cte.Cust_ID and
b.Cust_ID = ISNULL(cte.column_1,'') and Rank_CustType_1 != 1 then b.Cust_ID
when b.Cust_ID = cte.Cust_ID and
b.Cust_ID = ISNULL(cte.column_2,'') and Rank_CustType_1 != 1 then cte.column_2
when b.Cust_ID = cte.Cust_ID and Rank_CustType_1 = 1 and cte.column_1 is null and cte.column_2 is not null then cte.column_2
when b.Cust_ID = cte.Cust_ID and Rank_CustType_1 = 1 and cte.column_1 is not null and cte.column_2 is null then cte.column_1
end
from #OrderCustomerRef b
inner join cte on b.Cust_ID = cte.Cust_ID;
-- Show the lookup table after the update.
select * from #OrderCustomerRef;
根据您提供的信息,我使用了以下 CTE 来显示您想要的结果:
-- Previews, for every row of #OrderCustomerRef, the winning Cust_ID of the
-- duplicate group (same fname, lname, [customer_e-mail]) that its current
-- Cust_ID belongs to. Swap the SELECT list for the commented-out UPDATE to
-- apply the change.
--
-- Winner priority inside a group: VIP rows first, then latest order_date,
-- then lowest cust_id.
-- Note on ties: the question expects 5 to beat 3 but 6 to beat 7. Both pairs
-- tie on customer type and order date, so no single cust_id direction yields
-- both; the ascending tie-break is kept, which makes 3 and 6 the winners.
--
-- The leading semicolon guards against an unterminated preceding statement.
;WITH DaCTE -- Ranks every customer row inside its duplicate group
AS (
    SELECT cust_ID
         , fname
         , lname
         , [customer_e-mail]
         , ROW_NUMBER() OVER (
               PARTITION BY fname, lname, [customer_e-mail]
               -- VIP priority is stated explicitly instead of relying on
               -- 'VIP' sorting after 'Regular' alphabetically.
               ORDER BY CASE WHEN customertype = 'VIP' THEN 0 ELSE 1 END
                      , order_date DESC
                      , cust_id
           ) AS RankYo
    FROM #customerorders
)
, NewSource -- One row per (original ID, winning ID) pair
AS (
    -- DISTINCT collapses customers that have several rows in #CustomerOrders
    -- (e.g. 3, 9, 14); without it each of their #OrderCustomerRef rows would
    -- be returned once per customer row by the final join.
    SELECT DISTINCT
           DaCTE.cust_ID AS NewCustomerID
         , co.cust_ID    AS OriginalCustomerID
         , co.fname
         , co.lname
         , co.[customer_e-mail]
    FROM DaCTE
    INNER JOIN #CustomerOrders AS co
        ON co.fname = DaCTE.fname
       AND co.lname = DaCTE.lname
       AND co.[customer_e-mail] = DaCTE.[customer_e-mail]
    WHERE DaCTE.RankYo = 1 -- keep only the winning row of each group
)
SELECT ocr.Raw_PK
     , ocr.OrderID
     , ocr.Cust_ID
     , ocr.OrderType
     , ns.NewCustomerID
     , ns.OriginalCustomerID
     , ns.fname
     , ns.lname
     , ns.[customer_e-mail]
/*UPDATE ocr --commented out so you can see the results before running update
SET ocr.Cust_ID = ns.NewCustomerID*/
FROM #OrderCustomerRef AS ocr
INNER JOIN NewSource AS ns
    ON ns.OriginalCustomerID = ocr.Cust_ID;
您可以使用 CTE(通用 Table 表达式)尝试此操作,如上述答案中 Antoine Hernandez 所解释的那样,您还可以使用 UNION 和 EXCEPT 运算符从 table 中删除重复项。
例如,使用 EXCEPT 运算符:
-- Returns the distinct rows of #customerorders. EXCEPT against an empty set
-- (WHERE 1 = 0) removes nothing by itself but applies set semantics, so rows
-- that are identical in every listed column collapse into one.
-- PK_ID is left out on purpose: it is an IDENTITY primary key, unique on every
-- row, so with SELECT * no two rows could ever compare equal and no duplicate
-- would be removed.
SELECT Cust_ID, LName, FName, [Customer_E-mail], Order_Date, Customer_Source, CustomerType
FROM #customerorders
EXCEPT
SELECT Cust_ID, LName, FName, [Customer_E-mail], Order_Date, Customer_Source, CustomerType
FROM #customerorders
WHERE 1 = 0;
例如,使用 UNION 运算符:
-- Returns the distinct rows of #customerorders. UNION (without ALL) with an
-- empty set (WHERE 1 = 0) adds nothing but de-duplicates the result, so rows
-- that are identical in every listed column collapse into one.
-- PK_ID is left out on purpose: it is an IDENTITY primary key, unique on every
-- row, so with SELECT * no two rows could ever compare equal and no duplicate
-- would be removed.
SELECT Cust_ID, LName, FName, [Customer_E-mail], Order_Date, Customer_Source, CustomerType
FROM #customerorders
UNION
SELECT Cust_ID, LName, FName, [Customer_E-mail], Order_Date, Customer_Source, CustomerType
FROM #customerorders
WHERE 1 = 0;
有关如何使用 CTE 删除重复项的更多信息,请遵循此 link:Remove Duplicates Using CTE
有关如何使用 UNION 和 EXCEPT 运算符删除重复项的更多信息,请遵循此 link: Remove Duplicates Using UNION And EXCEPT Operator