优化大型 table In SQL Server 2008/12 的 while 循环

Optimize while loop for large table In SQL Server 2008/12

Table 名称 [dbo].[SourceData] 有 1900 万行。

我是 运行 while 循环反对这个 table 并且根据匹配标准它将数据加载到另一个 table。 While 循环花费的时间比以往任何时候都长。

示例代码如下。 Sourcedata table 有 seqno 是唯一的 identity 列(主键)。名字、姓氏、地址、电子邮件地址也有单独的 NC 索引。

create table #holdscore
(
     seqno bigint, 
     associatedseq bigint, 
     scrore int, 
     status varchar(20),
     customerid varchar(30)

     CONSTRAINT [PK_SourceScores] 
         PRIMARY KEY CLUSTERED (seqno ASC, associatedseq ASC) 
) 

Create table #loop 
(
     seqno bigint primary key clustered, 
     Flag varchar(1) NULL
)

Insert #loop (seqno)
   select distinct TOP 1000 seqno     
   from [dbo].[SourceData] 
   order by seqno

Declare @seqno bigint
Declare @firstname Nvarchar(100)
Declare @lastname Nvarchar(100)
Declare @phonenum nvarchar(100)
Declare @emailadd Nvarchar(100)
Declare @Address Nvarchar(250)
Declare @MiddleName nvarchar(50)
Declare @CCExpYYMM nvarchar(4)
Declare @CCLastFour nvarchar(4)

While ((select count(*) from  #Loop where flag is null)>0)
Begin 
    Select top 1 @seqno = seqno from #Loop  where flag is null

    Select @firstname = [FirstName],
           @lastname = [LastName],
           @phonenum = [PhoneNorm],
           @emailadd = [EmailAddress],
           @Address = [AddressNorm],
           @MiddleName = [MiddleName],
           @CCExpYYMM  = [CCExpYYMM],
           @CCLastFour = [CCLastFour]
     from  [dbo].[SourceData]
     where seqno = @seqno

     INSERT #holdscore
         select 
             orginalseqno, associatedseq, score, 
             case when score >= 80 Then 'Match'
                  when score < 80 Then 'Review' 
             end as Status,  
             customerid 
         from
             (select 
                  @seqno orginalseqno, seqno as associatedseq,
                  customerid,
                  case
                      when [FirstName] = @firstname 
                       and [LastName] = @lastname 
                       and [PhoneNorm] = @phonenum 
                       and [EmailAddress] = @emailadd
                       and [AddressNorm] = @Address 
                       and [MiddleName] = @MiddleName 
                       and [CCExpYYMM]  = @CCExpYYMM  
                       and [CCLastFour] = @CCLastFour THEN '100'

                     when [FirstName] = @firstname 
                      and [LastName] = @lastname 
                      and [PhoneNorm] = @phonenum 
                      and [EmailAddress] = @emailadd
                      and [AddressNorm] = @Address 
                      and [MiddleName] = @MiddleName 
                      and [CCExpYYMM]  = @CCExpYYMM THEN '99'

                    when [FirstName] = @firstname and [LastName]=@lastname and [PhoneNorm]=@phonenum and [EmailAddress]=@emailadd
        and [AddressNorm] = @Address and [MiddleName] = @MiddleName and [CCLastFour] = @CCLastFour THEN '99'
    WHEN [FirstName]=@firstname and [LastName]=@lastname and [PhoneNorm]=@phonenum and [EmailAddress]=@emailadd
        and [AddressNorm] = @Address and [MiddleName] = @MiddleName                             Then '98'
    WHEN [FirstName]=@firstname and [LastName]=@lastname and [PhoneNorm]=@phonenum and [EmailAddress]=@emailadd
        and [AddressNorm] = @Address                                                            Then '93'
    WHEN [FirstName]=@firstname and [LastName]=@lastname and [PhoneNorm]=@phonenum and [EmailAddress]=@emailadd  Then '83'
    WHEN [FirstName]=@firstname and [LastName]=@lastname and [PhoneNorm]=@phonenum                               Then '68'
    WHEN [FirstName]=@firstname and [LastName]=@lastname and [EmailAddress]=@emailadd                            Then '63'
    WHEN [FirstName]=@firstname and [LastName]=@lastname and [PhoneNorm]=@phonenum and [AddressNorm] = @Address  Then '78'
    WHEN [FirstName]=@firstname and [LastName]=@lastname and [EmailAddress]=@emailadd and [AddressNorm] = @Address  Then '73'
    WHEN [FirstName]=@firstname and [LastName]=@lastname and [AddressNorm] = @Address                               Then '58'
    WHEN [FirstName]=@firstname and [PhoneNorm]=@phonenum and [EmailAddress]=@emailadd and [AddressNorm] = @Address and [MiddleName] = @MiddleName  Then '73'
    WHEN [LastName]=@lastname and [PhoneNorm]=@phonenum and [EmailAddress]=@emailadd and [AddressNorm] = @Address and [MiddleName] = @MiddleName  THEN '75'
    WHEN [LastName]=@lastname and [PhoneNorm]=@phonenum and [EmailAddress]=@emailadd and [AddressNorm] = @Address                                 Then '70'
    WHEN [LastName]=@lastname and [PhoneNorm]=@phonenum and [EmailAddress]=@emailadd                                                              THEN '60' 
    END AS Score
  From   [dbo].[SourceData]

  )A 
  where A.Score is not null
  OPTION (MAXDOP 8)

Update #Loop
set Flag = 'Y'
where seqno =@seqno and Flag is null


end

1000 个唯一的 seqno 需要 1 个多小时才能完成。我需要将 1900 万行相互比较并将其加载到 table。请帮我加快这个过程。这样我就可以及时加载数据。 SSIS 也可以。

以此为基础

select s1.seqno as orginalseqno, s2,seqno as associatedseq, 100, 'Match', s2.customerid
 from [SourceData] as s1
 join [SourceData] as s2
   on s2.[FirstName]    = s1.firstname 
  and s2.[LastName]     = s1.lastname 
  and s2.[PhoneNorm]    = s1.phonenum 
  and s2.[EmailAddress] = s1.emailadd
  and s2.[AddressNorm]  = s1.Address 
  and s2.[MiddleName]   = s1.MiddleName 
  and s2.[CCExpYYMM]    = s1.CCExpYYMM  
  and s2.[CCLastFour]   = s1.CCLastFour 

从那里降低分数并向左连接到插入 table 这样您就可以避免插入已经存在的具有更高分数的数据。一般来说,不要尝试构建消除更高分数的复杂查询,除非它是一个非常简单的查询,例如 99 is s2.[CCLastFour] <> s1.CCLastFour。

我的回答非常像 Frisbee 的回答(在每个分数组测试之间使用 UNION ALL)所以我不会费心发布 SQL。不过,我要补充的是,虽然这是您可能想要的解决方案,但当 运行 超过 1900 万行 table 时,即使这种基于集合的方法也将是一个非常强大的查询。据我所知,您正在尝试查找 table 中人员之间的关联度或相似度。如果我理解正确的话,你想将每个人与其他人进行比较。如果姓名和地址以及出生日期(或其他)的匹配得分为 100,则让下一次测试稍微不那么严格并分配较低的分数,依此类推。随着测试变得越来越弱,自连接变得越来越像交叉连接——你会得到更多的点击。如果您正在测试的列中的基数较低(很多重复值),您最终可能会生成数百万(或数十亿,甚至数万亿)行。请注意,只测试将 return 具有实用价值的结果的关联。对于(一个极端的)示例,如果您仅根据性别测试相似性,您最终会有效地得到两个 950 万行交叉连接。