优化 SQL Server 2008/2012 中针对大型表的 while 循环
Optimize while loop for large table In SQL Server 2008/12
表 [dbo].[SourceData] 有 1900 万行。
我正在对这张表运行一个 while 循环,它根据匹配条件把数据加载到另一张表中。这个 while 循环耗时极长,迟迟无法完成。
示例代码如下。SourceData 表的 seqno 列是唯一的 identity 列(主键)。名字、姓氏、地址、电子邮件地址列上也各有单独的非聚集(NC)索引。
-- Result table: one row per (scored row, compared row) pair with its match score.
--   seqno         : seqno of the row being scored
--   associatedseq : seqno of the row it was compared against
--   scrore        : numeric match score. NOTE(review): presumably a typo for
--                   "score"; the name is kept so existing references keep working.
--   status        : text label for the score
--   customerid    : customer id carried along with the pair
-- The primary key is intentionally left unnamed. Constraint names must be unique
-- inside tempdb, so a named constraint on a #temp table makes a second session
-- running this script at the same time fail on the duplicate name.
CREATE TABLE #holdscore
(
    seqno         BIGINT NOT NULL,
    associatedseq BIGINT NOT NULL,
    scrore        INT,
    status        VARCHAR(20),
    customerid    VARCHAR(30),
    PRIMARY KEY CLUSTERED (seqno ASC, associatedseq ASC)
);
-- Work list for the row-by-row pass: one row per seqno to process.
-- Flag is not populated by the seeding insert, so it starts out NULL.
CREATE TABLE #loop
(
    seqno BIGINT PRIMARY KEY CLUSTERED,
    Flag  VARCHAR(1) NULL
);

-- Seed the work list with the 1000 lowest seqno values of the source table.
INSERT INTO #loop (seqno)
SELECT DISTINCT TOP (1000)
    src.seqno
FROM [dbo].[SourceData] AS src
ORDER BY src.seqno;
-- Scalar holders for the key and the comparison attributes of a single
-- SourceData row (one variable per compared column).
DECLARE
    @seqno      BIGINT,
    @firstname  NVARCHAR(100),
    @lastname   NVARCHAR(100),
    @phonenum   NVARCHAR(100),
    @emailadd   NVARCHAR(100),
    @Address    NVARCHAR(250),
    @MiddleName NVARCHAR(50),
    @CCExpYYMM  NVARCHAR(4),
    @CCLastFour NVARCHAR(4);
-- Row-by-row scoring pass.
-- For each seqno in #Loop whose Flag is still NULL:
--   1. load that row's attributes from SourceData into the variables,
--   2. score every candidate SourceData row against those attributes,
--   3. store the scored pairs in #holdscore,
--   4. set Flag = 'Y' so the seqno is not picked again.
-- EXISTS stops at the first pending row instead of counting all of them.
WHILE EXISTS (SELECT 1 FROM #Loop WHERE Flag IS NULL)
BEGIN
    -- Next pending seqno; ORDER BY makes the processing order deterministic.
    SELECT TOP (1) @seqno = seqno
    FROM #Loop
    WHERE Flag IS NULL
    ORDER BY seqno;

    -- Attributes of the row being scored.
    SELECT
        @firstname  = [FirstName],
        @lastname   = [LastName],
        @phonenum   = [PhoneNorm],
        @emailadd   = [EmailAddress],
        @Address    = [AddressNorm],
        @MiddleName = [MiddleName],
        @CCExpYYMM  = [CCExpYYMM],
        @CCLastFour = [CCLastFour]
    FROM [dbo].[SourceData]
    WHERE seqno = @seqno;

    -- Explicit column list: the insert no longer depends on column order.
    INSERT INTO #holdscore (seqno, associatedseq, scrore, status, customerid)
    SELECT
        A.orginalseqno,
        A.associatedseq,
        A.Score,
        CASE
            WHEN A.Score >= 80 THEN 'Match'
            WHEN A.Score <  80 THEN 'Review'
        END AS Status,
        A.customerid
    FROM
    (
        SELECT
            @seqno AS orginalseqno,
            seqno  AS associatedseq,
            customerid,
            -- CASE returns the FIRST branch that matches, so every branch must
            -- come before any branch whose conditions are a subset of its own.
            -- Scores are integer literals (no implicit varchar -> int conversion).
            CASE
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address
                 AND [MiddleName] = @MiddleName
                 AND [CCExpYYMM] = @CCExpYYMM
                 AND [CCLastFour] = @CCLastFour THEN 100
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address
                 AND [MiddleName] = @MiddleName
                 AND [CCExpYYMM] = @CCExpYYMM THEN 99
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address
                 AND [MiddleName] = @MiddleName
                 AND [CCLastFour] = @CCLastFour THEN 99
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address
                 AND [MiddleName] = @MiddleName THEN 98
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address THEN 93
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd THEN 83
                -- 78 and 73 are tested before 68 and 63: they add AddressNorm
                -- to the same conditions and would otherwise never be reached.
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [AddressNorm] = @Address THEN 78
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address THEN 73
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum THEN 68
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [EmailAddress] = @emailadd THEN 63
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [AddressNorm] = @Address THEN 58
                WHEN [FirstName] = @firstname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address
                 AND [MiddleName] = @MiddleName THEN 73
                WHEN [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address
                 AND [MiddleName] = @MiddleName THEN 75
                WHEN [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address THEN 70
                WHEN [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd THEN 60
            END AS Score
        FROM [dbo].[SourceData]
        -- Every scoring branch requires FirstName or LastName to match, so this
        -- filter drops only rows whose Score would be NULL anyway. It gives the
        -- optimizer a sargable predicate instead of a CASE over every row.
        WHERE [FirstName] = @firstname
           OR [LastName] = @lastname
    ) AS A
    WHERE A.Score IS NOT NULL
    OPTION (MAXDOP 8);

    -- Mark this seqno as processed so the loop terminates.
    UPDATE #Loop
    SET Flag = 'Y'
    WHERE seqno = @seqno
      AND Flag IS NULL;
END
仅处理 1000 个唯一的 seqno 就需要 1 个多小时。我需要将 1900 万行两两比较,并把结果加载到表中。请帮我加快这个过程,以便能够及时完成数据加载。使用 SSIS 的方案也可以。
可以以下面的查询为基础:
-- Top tier (score 100): pairs of SourceData rows that agree on every compared
-- attribute. Both sides of the self-join use the table's real column names.
-- No s1.seqno <> s2.seqno filter is applied, so each row also pairs with itself.
SELECT
    s1.seqno  AS orginalseqno,
    s2.seqno  AS associatedseq,
    100       AS score,
    'Match'   AS status,
    s2.customerid
FROM [dbo].[SourceData] AS s1
INNER JOIN [dbo].[SourceData] AS s2
    ON  s2.[FirstName]    = s1.[FirstName]
    AND s2.[LastName]     = s1.[LastName]
    AND s2.[PhoneNorm]    = s1.[PhoneNorm]
    AND s2.[EmailAddress] = s1.[EmailAddress]
    AND s2.[AddressNorm]  = s1.[AddressNorm]
    AND s2.[MiddleName]   = s1.[MiddleName]
    AND s2.[CCExpYYMM]    = s1.[CCExpYYMM]
    AND s2.[CCLastFour]   = s1.[CCLastFour];
在此基础上按分数从高到低逐档处理,每一档都对已插入结果的表做 LEFT JOIN,这样就可以避免再次插入已经以更高分数存在的配对。一般来说,不要试图构建用来排除更高分数的复杂查询,除非排除条件非常简单,例如得分 99 的一档只需加上 s2.[CCLastFour] <> s1.[CCLastFour]。
我的回答与 Frisbee 的回答非常相似(在每个分数档的测试之间使用 UNION ALL),所以就不再贴出 SQL 了。不过我想补充一点:虽然这很可能就是您想要的解决方案,但在 1900 万行的表上运行时,即使是这种基于集合的方法也会是一个非常繁重的查询。据我理解,您是想找出表中人员之间的关联度或相似度,也就是把每个人与其他所有人进行比较:如果姓名、地址和出生日期(或其他字段)全部匹配,则得分为 100;下一档测试的条件稍微宽松一些,并分配较低的分数,依此类推。随着测试条件越来越宽松,自连接会越来越接近交叉连接——命中的行会越来越多。如果被测试列的基数较低(重复值很多),最终可能会生成数百万(甚至数十亿、数万亿)行。请注意,只测试那些能返回有实用价值结果的关联。举一个极端的例子:如果仅根据性别测试相似性,实际上就相当于做了两次 950 万行的交叉连接。
表 [dbo].[SourceData] 有 1900 万行。
我正在对这张表运行一个 while 循环,它根据匹配条件把数据加载到另一张表中。这个 while 循环耗时极长,迟迟无法完成。
示例代码如下。SourceData 表的 seqno 列是唯一的 identity 列(主键)。名字、姓氏、地址、电子邮件地址列上也各有单独的非聚集(NC)索引。
-- Result table: one row per (scored row, compared row) pair with its match score.
--   seqno         : seqno of the row being scored
--   associatedseq : seqno of the row it was compared against
--   scrore        : numeric match score. NOTE(review): presumably a typo for
--                   "score"; the name is kept so existing references keep working.
--   status        : text label for the score
--   customerid    : customer id carried along with the pair
-- The primary key is intentionally left unnamed. Constraint names must be unique
-- inside tempdb, so a named constraint on a #temp table makes a second session
-- running this script at the same time fail on the duplicate name.
CREATE TABLE #holdscore
(
    seqno         BIGINT NOT NULL,
    associatedseq BIGINT NOT NULL,
    scrore        INT,
    status        VARCHAR(20),
    customerid    VARCHAR(30),
    PRIMARY KEY CLUSTERED (seqno ASC, associatedseq ASC)
);
-- Work list for the row-by-row pass: one row per seqno to process.
-- Flag is not populated by the seeding insert, so it starts out NULL.
CREATE TABLE #loop
(
    seqno BIGINT PRIMARY KEY CLUSTERED,
    Flag  VARCHAR(1) NULL
);

-- Seed the work list with the 1000 lowest seqno values of the source table.
INSERT INTO #loop (seqno)
SELECT DISTINCT TOP (1000)
    src.seqno
FROM [dbo].[SourceData] AS src
ORDER BY src.seqno;
-- Scalar holders for the key and the comparison attributes of a single
-- SourceData row (one variable per compared column).
DECLARE
    @seqno      BIGINT,
    @firstname  NVARCHAR(100),
    @lastname   NVARCHAR(100),
    @phonenum   NVARCHAR(100),
    @emailadd   NVARCHAR(100),
    @Address    NVARCHAR(250),
    @MiddleName NVARCHAR(50),
    @CCExpYYMM  NVARCHAR(4),
    @CCLastFour NVARCHAR(4);
-- Row-by-row scoring pass.
-- For each seqno in #Loop whose Flag is still NULL:
--   1. load that row's attributes from SourceData into the variables,
--   2. score every candidate SourceData row against those attributes,
--   3. store the scored pairs in #holdscore,
--   4. set Flag = 'Y' so the seqno is not picked again.
-- EXISTS stops at the first pending row instead of counting all of them.
WHILE EXISTS (SELECT 1 FROM #Loop WHERE Flag IS NULL)
BEGIN
    -- Next pending seqno; ORDER BY makes the processing order deterministic.
    SELECT TOP (1) @seqno = seqno
    FROM #Loop
    WHERE Flag IS NULL
    ORDER BY seqno;

    -- Attributes of the row being scored.
    SELECT
        @firstname  = [FirstName],
        @lastname   = [LastName],
        @phonenum   = [PhoneNorm],
        @emailadd   = [EmailAddress],
        @Address    = [AddressNorm],
        @MiddleName = [MiddleName],
        @CCExpYYMM  = [CCExpYYMM],
        @CCLastFour = [CCLastFour]
    FROM [dbo].[SourceData]
    WHERE seqno = @seqno;

    -- Explicit column list: the insert no longer depends on column order.
    INSERT INTO #holdscore (seqno, associatedseq, scrore, status, customerid)
    SELECT
        A.orginalseqno,
        A.associatedseq,
        A.Score,
        CASE
            WHEN A.Score >= 80 THEN 'Match'
            WHEN A.Score <  80 THEN 'Review'
        END AS Status,
        A.customerid
    FROM
    (
        SELECT
            @seqno AS orginalseqno,
            seqno  AS associatedseq,
            customerid,
            -- CASE returns the FIRST branch that matches, so every branch must
            -- come before any branch whose conditions are a subset of its own.
            -- Scores are integer literals (no implicit varchar -> int conversion).
            CASE
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address
                 AND [MiddleName] = @MiddleName
                 AND [CCExpYYMM] = @CCExpYYMM
                 AND [CCLastFour] = @CCLastFour THEN 100
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address
                 AND [MiddleName] = @MiddleName
                 AND [CCExpYYMM] = @CCExpYYMM THEN 99
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address
                 AND [MiddleName] = @MiddleName
                 AND [CCLastFour] = @CCLastFour THEN 99
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address
                 AND [MiddleName] = @MiddleName THEN 98
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address THEN 93
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd THEN 83
                -- 78 and 73 are tested before 68 and 63: they add AddressNorm
                -- to the same conditions and would otherwise never be reached.
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [AddressNorm] = @Address THEN 78
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address THEN 73
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum THEN 68
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [EmailAddress] = @emailadd THEN 63
                WHEN [FirstName] = @firstname
                 AND [LastName] = @lastname
                 AND [AddressNorm] = @Address THEN 58
                WHEN [FirstName] = @firstname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address
                 AND [MiddleName] = @MiddleName THEN 73
                WHEN [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address
                 AND [MiddleName] = @MiddleName THEN 75
                WHEN [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd
                 AND [AddressNorm] = @Address THEN 70
                WHEN [LastName] = @lastname
                 AND [PhoneNorm] = @phonenum
                 AND [EmailAddress] = @emailadd THEN 60
            END AS Score
        FROM [dbo].[SourceData]
        -- Every scoring branch requires FirstName or LastName to match, so this
        -- filter drops only rows whose Score would be NULL anyway. It gives the
        -- optimizer a sargable predicate instead of a CASE over every row.
        WHERE [FirstName] = @firstname
           OR [LastName] = @lastname
    ) AS A
    WHERE A.Score IS NOT NULL
    OPTION (MAXDOP 8);

    -- Mark this seqno as processed so the loop terminates.
    UPDATE #Loop
    SET Flag = 'Y'
    WHERE seqno = @seqno
      AND Flag IS NULL;
END
仅处理 1000 个唯一的 seqno 就需要 1 个多小时。我需要将 1900 万行两两比较,并把结果加载到表中。请帮我加快这个过程,以便能够及时完成数据加载。使用 SSIS 的方案也可以。
可以以下面的查询为基础:
-- Top tier (score 100): pairs of SourceData rows that agree on every compared
-- attribute. Both sides of the self-join use the table's real column names.
-- No s1.seqno <> s2.seqno filter is applied, so each row also pairs with itself.
SELECT
    s1.seqno  AS orginalseqno,
    s2.seqno  AS associatedseq,
    100       AS score,
    'Match'   AS status,
    s2.customerid
FROM [dbo].[SourceData] AS s1
INNER JOIN [dbo].[SourceData] AS s2
    ON  s2.[FirstName]    = s1.[FirstName]
    AND s2.[LastName]     = s1.[LastName]
    AND s2.[PhoneNorm]    = s1.[PhoneNorm]
    AND s2.[EmailAddress] = s1.[EmailAddress]
    AND s2.[AddressNorm]  = s1.[AddressNorm]
    AND s2.[MiddleName]   = s1.[MiddleName]
    AND s2.[CCExpYYMM]    = s1.[CCExpYYMM]
    AND s2.[CCLastFour]   = s1.[CCLastFour];
在此基础上按分数从高到低逐档处理,每一档都对已插入结果的表做 LEFT JOIN,这样就可以避免再次插入已经以更高分数存在的配对。一般来说,不要试图构建用来排除更高分数的复杂查询,除非排除条件非常简单,例如得分 99 的一档只需加上 s2.[CCLastFour] <> s1.[CCLastFour]。
我的回答与 Frisbee 的回答非常相似(在每个分数档的测试之间使用 UNION ALL),所以就不再贴出 SQL 了。不过我想补充一点:虽然这很可能就是您想要的解决方案,但在 1900 万行的表上运行时,即使是这种基于集合的方法也会是一个非常繁重的查询。据我理解,您是想找出表中人员之间的关联度或相似度,也就是把每个人与其他所有人进行比较:如果姓名、地址和出生日期(或其他字段)全部匹配,则得分为 100;下一档测试的条件稍微宽松一些,并分配较低的分数,依此类推。随着测试条件越来越宽松,自连接会越来越接近交叉连接——命中的行会越来越多。如果被测试列的基数较低(重复值很多),最终可能会生成数百万(甚至数十亿、数万亿)行。请注意,只测试那些能返回有实用价值结果的关联。举一个极端的例子:如果仅根据性别测试相似性,实际上就相当于做了两次 950 万行的交叉连接。