将潜在的重复项分成不同的行
Separate Potential Duplicates Into Different Rows
我正在尝试根据 SSN、姓氏和出生日期的后 4 位来识别数据库中的潜在重复客户。我编写的存储过程确实识别了潜在的重复项,但它在一行中列出了它们 - 出于报告原因,我试图拆分成单独的行。
我的 T-SQL 看起来像:
DECLARE
@StartDate DATE = '1/1/2017',
@EndDate DATE = '3/1/2017';
SELECT DENSE_RANK() OVER (ORDER BY c.socialSecurityNumber) AS [SSNRanking] ,
ROW_NUMBER() OVER (PARTITION BY c.socialSecurityNumber ORDER BY c.socialSecurityNumber) AS [RowNumb] ,
c.socialSecurityNumber AS [SSN],
c.id AS [CustomerID] ,
c.firstName AS [FirstName] ,
c.lastName AS [lastName] ,
c.birthDate [birthdate] ,
c.createDate AS [CreateDate] ,
c2.socialSecurityNumber AS [DupSSN] ,
c2.id AS [DupCustomerID] ,
c2.firstName AS [DupFirstName] ,
c2.lastName AS [DupLastName] ,
c2.birthDate AS [DupBirthDate] ,
c2.createDate AS [DupCreateDate]
FROM dbo.Customers AS [c]
INNER JOIN dbo.Customers AS [c2] ON ( SUBSTRING(c.socialSecurityNumber,6,4) = SUBSTRING(c2.socialSecurityNumber,6,4) AND c.birthDate = c2.birthDate AND c.lastName = c2.lastName AND c.id <> c2.id )
LEFT JOIN dbo.CustomerAddresses AS [CA] ON c.id = CA.customerID
LEFT OUTER JOIN dbo.Common_Orders AS [co] ON co.customerID = c.id
WHERE
c.customerStatusTypeID <> 'M'
AND C2.customerStatusTypeID <> 'M'
AND c.mergedTo IS NULL
AND c2.mergedTo IS NULL
AND CAST(co.orderDate AS DATE) >= @StartDate
AND CAST(co.orderDate AS DATE) <= @EndDate
AND c.id = 1234439
GROUP BY c.socialSecurityNumber ,
c.id ,
c.firstName ,
c.lastName ,
c.birthDate ,
c.createDate ,
c2.socialSecurityNumber ,
c2.id ,
c2.firstName ,
c2.lastName ,
c2.birthDate ,
c2.createDate
ORDER BY CAST(c.socialSecurityNumber AS INT) ASC;
我的数据集如下所示:
SSNRanking RowNumb SSN CustomerID FirstName lastName birthdate CreateDate DupSSN DupCustomerID DupFirstName DupLastName DupBirthDate DupCreateDate
1 1 000009915 1234439 GREG GARRETT 1900-01-01 2014-02-25 000009915 1166084 ADAM GARRETT 1900-01-01 2013-08-29
在此特定情况下,我有两个用户具有相同的 SSN 最后 4 位、相同的姓氏和相同的出生日期 - 但不同的名字。
如何让这两条记录分行显示?理想情况下,我希望看到:
SSNRanking RowNumb SSN CustomerID FirstName lastName birthdate CreateDate
1 1 000009915 1234439 GREG GARRETT 1900-01-01 2014-02-25
1 2 000009915 1166084 ADAM GARRETT 1900-01-01 2013-08-29
但我不确定加入同一个 table 时如何完成此操作。建议?
我正在链接到一个脚本,该脚本创建两个有问题的 table 并插入示例数据。希望那是 acceptable: SQL Script
这叫做"unpivot"。您可以使用 UNPIVOT
运算符,但我更喜欢使用 CROSS APPLY ... VALUES
.
我会将您的查询包装到 CTE 中而不详细查看它,并使用 CROSS APPLY
.
将每一行分成两行
DECLARE
@StartDate DATE = '1/1/2017',
@EndDate DATE = '3/1/2017';
WITH
CTE
AS
(
SELECT
DENSE_RANK() OVER (ORDER BY c.socialSecurityNumber) AS [SSNRanking] ,
ROW_NUMBER() OVER (PARTITION BY c.socialSecurityNumber ORDER BY c.socialSecurityNumber) AS [RowNumb] ,
c.socialSecurityNumber AS [SSN],
c.id AS [CustomerID] ,
c.firstName AS [FirstName] ,
c.lastName AS [lastName] ,
c.birthDate [birthdate] ,
c.createDate AS [CreateDate] ,
c2.socialSecurityNumber AS [DupSSN] ,
c2.id AS [DupCustomerID] ,
c2.firstName AS [DupFirstName] ,
c2.lastName AS [DupLastName] ,
c2.birthDate AS [DupBirthDate] ,
c2.createDate AS [DupCreateDate]
FROM
dbo.Customers AS [c]
INNER JOIN dbo.Customers AS [c2] ON ( SUBSTRING(c.socialSecurityNumber,6,4) = SUBSTRING(c2.socialSecurityNumber,6,4) AND c.birthDate = c2.birthDate AND c.lastName = c2.lastName AND c.id <> c2.id )
LEFT JOIN dbo.CustomerAddresses AS [CA] ON c.id = CA.customerID
LEFT JOIN dbo.Common_Orders AS [co] ON co.customerID = c.id
WHERE
c.customerStatusTypeID <> 'M'
AND C2.customerStatusTypeID <> 'M'
AND c.mergedTo IS NULL
AND c2.mergedTo IS NULL
AND CAST(co.orderDate AS DATE) >= @StartDate
AND CAST(co.orderDate AS DATE) <= @EndDate
AND c.id = 1234439
GROUP BY
c.socialSecurityNumber ,
c.id ,
c.firstName ,
c.lastName ,
c.birthDate ,
c.createDate ,
c2.socialSecurityNumber ,
c2.id ,
c2.firstName ,
c2.lastName ,
c2.birthDate ,
c2.createDate
)
SELECT
CA.SSNRanking
,CA.RowNumb
,CA.SSN
,CA.CustomerID
,CA.FirstName
,CA.lastName
,CA.birthdate
,CA.CreateDate
FROM
CTE
CROSS APPLY
(
VALUES
(CTE.SSNRanking, CTE.RowNumb, CTE.SSN, CTE.CustomerID, CTE.FirstName, CTE.lastName, CTE.birthdate, CTE.CreateDate),
(CTE.SSNRanking, CTE.RowNumb, CTE.DupSSN, CTE.DupCustomerID, CTE.DupFirstName, CTE.DuplastName, CTE.Dupbirthdate, CTE.DupCreateDate)
) AS CA(SSNRanking, RowNumb, SSN, CustomerID, FirstName, lastName, birthdate, CreateDate)
ORDER BY CAST(CA.SSN AS INT) ASC;
顺便说一句,
ROW_NUMBER() OVER (PARTITION BY ColumnA ORDER BY ColumnA)
按同一列进行分区和排序时没有意义。我不确定你想在那里实现什么。
我正在尝试根据 SSN、姓氏和出生日期的后 4 位来识别数据库中的潜在重复客户。我编写的存储过程确实识别了潜在的重复项,但它在一行中列出了它们 - 出于报告原因,我试图拆分成单独的行。
我的 T-SQL 看起来像:
DECLARE
@StartDate DATE = '1/1/2017',
@EndDate DATE = '3/1/2017';
SELECT DENSE_RANK() OVER (ORDER BY c.socialSecurityNumber) AS [SSNRanking] ,
ROW_NUMBER() OVER (PARTITION BY c.socialSecurityNumber ORDER BY c.socialSecurityNumber) AS [RowNumb] ,
c.socialSecurityNumber AS [SSN],
c.id AS [CustomerID] ,
c.firstName AS [FirstName] ,
c.lastName AS [lastName] ,
c.birthDate [birthdate] ,
c.createDate AS [CreateDate] ,
c2.socialSecurityNumber AS [DupSSN] ,
c2.id AS [DupCustomerID] ,
c2.firstName AS [DupFirstName] ,
c2.lastName AS [DupLastName] ,
c2.birthDate AS [DupBirthDate] ,
c2.createDate AS [DupCreateDate]
FROM dbo.Customers AS [c]
INNER JOIN dbo.Customers AS [c2] ON ( SUBSTRING(c.socialSecurityNumber,6,4) = SUBSTRING(c2.socialSecurityNumber,6,4) AND c.birthDate = c2.birthDate AND c.lastName = c2.lastName AND c.id <> c2.id )
LEFT JOIN dbo.CustomerAddresses AS [CA] ON c.id = CA.customerID
LEFT OUTER JOIN dbo.Common_Orders AS [co] ON co.customerID = c.id
WHERE
c.customerStatusTypeID <> 'M'
AND C2.customerStatusTypeID <> 'M'
AND c.mergedTo IS NULL
AND c2.mergedTo IS NULL
AND CAST(co.orderDate AS DATE) >= @StartDate
AND CAST(co.orderDate AS DATE) <= @EndDate
AND c.id = 1234439
GROUP BY c.socialSecurityNumber ,
c.id ,
c.firstName ,
c.lastName ,
c.birthDate ,
c.createDate ,
c2.socialSecurityNumber ,
c2.id ,
c2.firstName ,
c2.lastName ,
c2.birthDate ,
c2.createDate
ORDER BY CAST(c.socialSecurityNumber AS INT) ASC;
我的数据集如下所示:
SSNRanking RowNumb SSN CustomerID FirstName lastName birthdate CreateDate DupSSN DupCustomerID DupFirstName DupLastName DupBirthDate DupCreateDate
1 1 000009915 1234439 GREG GARRETT 1900-01-01 2014-02-25 000009915 1166084 ADAM GARRETT 1900-01-01 2013-08-29
在此特定情况下,我有两个用户具有相同的 SSN 最后 4 位、相同的姓氏和相同的出生日期 - 但不同的名字。
如何让这两条记录分行显示?理想情况下,我希望看到:
SSNRanking RowNumb SSN CustomerID FirstName lastName birthdate CreateDate
1 1 000009915 1234439 GREG GARRETT 1900-01-01 2014-02-25
1 2 000009915 1166084 ADAM GARRETT 1900-01-01 2013-08-29
但我不确定加入同一个 table 时如何完成此操作。建议?
我正在链接到一个脚本,该脚本创建两个有问题的 table 并插入示例数据。希望那是 acceptable: SQL Script
这叫做"unpivot"。您可以使用 UNPIVOT
运算符,但我更喜欢使用 CROSS APPLY ... VALUES
.
我会将您的查询包装到 CTE 中而不详细查看它,并使用 CROSS APPLY
.
DECLARE
@StartDate DATE = '1/1/2017',
@EndDate DATE = '3/1/2017';
WITH
CTE
AS
(
SELECT
DENSE_RANK() OVER (ORDER BY c.socialSecurityNumber) AS [SSNRanking] ,
ROW_NUMBER() OVER (PARTITION BY c.socialSecurityNumber ORDER BY c.socialSecurityNumber) AS [RowNumb] ,
c.socialSecurityNumber AS [SSN],
c.id AS [CustomerID] ,
c.firstName AS [FirstName] ,
c.lastName AS [lastName] ,
c.birthDate [birthdate] ,
c.createDate AS [CreateDate] ,
c2.socialSecurityNumber AS [DupSSN] ,
c2.id AS [DupCustomerID] ,
c2.firstName AS [DupFirstName] ,
c2.lastName AS [DupLastName] ,
c2.birthDate AS [DupBirthDate] ,
c2.createDate AS [DupCreateDate]
FROM
dbo.Customers AS [c]
INNER JOIN dbo.Customers AS [c2] ON ( SUBSTRING(c.socialSecurityNumber,6,4) = SUBSTRING(c2.socialSecurityNumber,6,4) AND c.birthDate = c2.birthDate AND c.lastName = c2.lastName AND c.id <> c2.id )
LEFT JOIN dbo.CustomerAddresses AS [CA] ON c.id = CA.customerID
LEFT JOIN dbo.Common_Orders AS [co] ON co.customerID = c.id
WHERE
c.customerStatusTypeID <> 'M'
AND C2.customerStatusTypeID <> 'M'
AND c.mergedTo IS NULL
AND c2.mergedTo IS NULL
AND CAST(co.orderDate AS DATE) >= @StartDate
AND CAST(co.orderDate AS DATE) <= @EndDate
AND c.id = 1234439
GROUP BY
c.socialSecurityNumber ,
c.id ,
c.firstName ,
c.lastName ,
c.birthDate ,
c.createDate ,
c2.socialSecurityNumber ,
c2.id ,
c2.firstName ,
c2.lastName ,
c2.birthDate ,
c2.createDate
)
SELECT
CA.SSNRanking
,CA.RowNumb
,CA.SSN
,CA.CustomerID
,CA.FirstName
,CA.lastName
,CA.birthdate
,CA.CreateDate
FROM
CTE
CROSS APPLY
(
VALUES
(CTE.SSNRanking, CTE.RowNumb, CTE.SSN, CTE.CustomerID, CTE.FirstName, CTE.lastName, CTE.birthdate, CTE.CreateDate),
(CTE.SSNRanking, CTE.RowNumb, CTE.DupSSN, CTE.DupCustomerID, CTE.DupFirstName, CTE.DuplastName, CTE.Dupbirthdate, CTE.DupCreateDate)
) AS CA(SSNRanking, RowNumb, SSN, CustomerID, FirstName, lastName, birthdate, CreateDate)
ORDER BY CAST(CA.SSN AS INT) ASC;
顺便说一句,
ROW_NUMBER() OVER (PARTITION BY ColumnA ORDER BY ColumnA)
按同一列进行分区和排序时没有意义。我不确定你想在那里实现什么。