PATINDEX SOUNDEX
PATINDEX with SOUNDEX
想要使用 PATINDEX 和 SOUNDEX 搜索字符串。
我有以下 table 和一些示例数据,使用 PATINDEX
和 SOUNDEX
搜索给定的字符串。
create table tbl_pat_soundex
(
col_str varchar(max)
);
insert into tbl_pat_soundex values('Smith A Steve');
insert into tbl_pat_soundex values('Steve A Smyth');
insert into tbl_pat_soundex values('A Smeeth Stive');
insert into tbl_pat_soundex values('Steve Smith A');
insert into tbl_pat_soundex values('Smit Steve A');
要搜索的字符串:- 'Smith A Steve'
SELECT col_str,PATINDEX('%Smith%',col_str) [Smith],PATINDEX('%A%',col_str) [A],PATINDEX('%Steve%',col_str) [Steve]
FROM tbl_pat_soundex
获取输出:
col_str Smith A Steve
---------------------------------
Smith A Steve 1 7 9
Steve A Smyth 0 7 1
A Smeeth Stive 0 1 0
Steve Smith A 7 13 1
Smit Steve A 0 12 6
预期输出:
col_str Smith A Steve
---------------------------------
Smith A Steve 1 7 9
Steve A Smyth 9 7 1
A Smeeth Stive 3 1 10
Steve Smith A 7 13 1
Smit Steve A 1 12 6
尝试过:
SELECT col_str,
PATINDEX('%'+soundex('Smith')+'%',soundex(col_str)) [Smith],
PATINDEX('%'+soundex('A')+'%',soundex(col_str)) [A],
PATINDEX('%'+soundex('Steve')+'%',soundex(col_str)) [Steve]
FROM tbl_pat_soundex
但是得到意想不到的结果:
col_str Smith A Steve
---------------------------------
Smith A Steve 1 0 0
Steve A Smyth 0 0 1
A Smeeth Stive 0 1 0
Steve Smith A 0 0 1
Smit Steve A 1 0 0
注意:我在 table 中有 100 Millions
条记录要搜索。
这是一个选项,考虑到您需要做的所有事情,不确定它对 1 亿条记录的表现如何。你必须测试一下。
在高层次上我的理解是你基本上需要的
- 根据另一个字符串的词搜索一个字符串中的所有词
- 返回原始字符串中该词等于或听起来像搜索词的字符起始位置。
可以用DIFFERENCE()进行比较:
DIFFERENCE compares two different SOUNDEX values, and returns an
integer value. This value measures the degree that the SOUNDEX values
match, on a scale of 0 to 4. A value of 0 indicates weak or no
similarity between the SOUNDEX values; 4 indicates strongly similar,
or even identically matching, SOUNDEX values.
您需要根据 space ' ' 拆分字符串,并且由于您是 2008 年,因此您必须推出自己的函数。
我使用了这里的 XML 函数,https://sqlperformance.com/2012/07/t-sql-queries/split-strings,对于我的例子,如果你有自己的或者想使用不同的东西,你显然需要进行调整:
CREATE FUNCTION dbo.SplitStrings_XML
(
@List NVARCHAR(MAX),
@Delimiter NVARCHAR(255)
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
(
SELECT Item = y.i.value('(./text())[1]', 'nvarchar(4000)')
FROM
(
SELECT x = CONVERT(XML, '<i>'
+ REPLACE(@List, @Delimiter, '</i><i>')
+ '</i>').query('.')
) AS a CROSS APPLY x.nodes('i') AS y(i)
);
GO
我切换并使用 table 变量来显示示例,我建议不要使用您拥有的数据量来创建和使用物理 tables。
选项 1 - 非动态:
DECLARE @tbl_pat_soundex TABLE
(
[col_str] VARCHAR(MAX)
);
INSERT INTO @tbl_pat_soundex
VALUES ( 'Smith A Steve' )
,( 'Steve A Smyth' )
,( 'A Smeeth Stive' )
,( 'Steve Smith A' )
,( 'Smit Steve A' )
SELECT DISTINCT [aa].[col_str]
, MAX([aa].[Smith]) OVER ( PARTITION BY [aa].[col_str] ) AS [Smith]
, MAX([aa].[A]) OVER ( PARTITION BY [aa].[col_str] ) AS [A]
, MAX([aa].[Steve]) OVER ( PARTITION BY [aa].[col_str] ) AS [Steve]
FROM (
SELECT [a].[col_str]
, CASE WHEN DIFFERENCE([b].[item], 'Smith') = 4 THEN
CHARINDEX([b].[item], [a].[col_str])
ELSE 0
END AS [Smith]
, CASE WHEN DIFFERENCE([b].[item], 'A') = 4 THEN
CHARINDEX([b].[item], [a].[col_str])
ELSE 0
END AS [A]
, CASE WHEN DIFFERENCE([b].[item], 'Steve') = 4 THEN
CHARINDEX([b].[item], [a].[col_str])
ELSE 0
END AS [Steve]
FROM @tbl_pat_soundex [a]
CROSS APPLY [dbo].[SplitStrings_XML]([a].[col_str], ' ') [b]
) AS [aa];
- 使用该函数,我们将字符串拆分为单个单词
- 然后我们使用 case 语句来检查 DIFFERENCE 值
- 如果 DIFFERENCE 值等于 4,那么我们 return 原始单词对字符串的 CHARINDEX 值。
- 如果不等于我们 return 0
然后就是根据原始字符串获取每个值的最大值:
, MAX([aa].[Smith]) OVER ( PARTITION BY [aa].[col_str] ) AS [Smith]
, MAX([aa].[A]) OVER ( PARTITION BY [aa].[col_str] ) AS [A]
, MAX([aa].[Steve]) OVER ( PARTITION BY [aa].[col_str] ) AS [Steve]
为您提供最终结果:
选项 2 - 带枢轴的动态:
我们将声明要搜索的字符串,将其拆分并搜索原始字符串中的那些单独的单词,然后转换结果。
--This example is using global temp tables as it's showing how
--to build a dynamic pivot
IF OBJECT_ID('tempdb..##tbl_pat_soundex') IS NOT NULL
DROP TABLE [##tbl_pat_soundex];
IF OBJECT_ID('tempdb..##tbl_col_str_SearchString') IS NOT NULL
DROP TABLE [##tbl_col_str_SearchString];
CREATE TABLE [##tbl_pat_soundex]
(
[col_str] VARCHAR(MAX)
);
INSERT INTO [##tbl_pat_soundex]
VALUES ( 'Smith A Steve' )
, ( 'Steve A Smyth' )
, ( 'A Smeeth Stive' )
, ( 'Steve Smith A' )
, ( 'Smit Steve A' );
--What are you searching for?
DECLARE @SearchString NVARCHAR(200);
SET @SearchString = N'Smith A Steve';
--We build a table we load with every combination of the words from the string and the words from the SearchString for easier comparison.
CREATE TABLE [##tbl_col_str_SearchString]
(
[col_str] NVARCHAR(MAX)
, [col_str_value] NVARCHAR(MAX)
, [SearchValue] NVARCHAR(200)
);
--Load that table for comparison
--split our original string into individual words
--also split our search string into individual words and give me all combinations.
INSERT INTO [##tbl_col_str_SearchString] (
[col_str]
, [col_str_value]
, [SearchValue]
)
SELECT DISTINCT [a].[col_str]
, [b].[item]
, [c].[item]
FROM [##tbl_pat_soundex] [a]
CROSS APPLY [dbo].[SplitStrings_XML]([a].[col_str], ' ') [b]
CROSS APPLY [dbo].[SplitStrings_XML](@SearchString, ' ') [c]
ORDER BY [a].[col_str];
--Then we can easily compare each word and search word for those that match or sound alike using DIFFERNCE()
SELECT [col_str], [col_str_value], [SearchValue], CASE WHEN DIFFERENCE([col_str_value], [SearchValue]) = 4 THEN CHARINDEX([col_str_value], [col_str]) ELSE 0 END AS [Match] FROM ##tbl_col_str_SearchString
--Then we can pivot on it
--and we will need to make it dynamic since we are not sure what what @SearchString could be.
DECLARE @PivotSQL NVARCHAR(MAX);
DECLARE @pivotColumn NVARCHAR(MAX);
SET @pivotColumn = N'[' + REPLACE(@SearchString, ' ', '],[') + N']';
SET @PivotSQL = N'SELECT * FROM (
SELECT [col_str], [SearchValue], CASE WHEN DIFFERENCE([col_str_value], [SearchValue]) = 4 THEN CHARINDEX([col_str_value], [col_str]) ELSE 0 END AS [Match] FROM ##tbl_col_str_SearchString
) aa
PIVOT (MAX([Match]) FOR [SearchValue] IN (' + @pivotColumn
+ N')) AS MaxMatch
ORDER BY [MaxMatch].[col_str]
';
--Giving us the final results.
EXEC sp_executesql @PivotSQL
想要使用 PATINDEX 和 SOUNDEX 搜索字符串。
我有以下 table 和一些示例数据,使用 PATINDEX
和 SOUNDEX
搜索给定的字符串。
create table tbl_pat_soundex
(
col_str varchar(max)
);
insert into tbl_pat_soundex values('Smith A Steve');
insert into tbl_pat_soundex values('Steve A Smyth');
insert into tbl_pat_soundex values('A Smeeth Stive');
insert into tbl_pat_soundex values('Steve Smith A');
insert into tbl_pat_soundex values('Smit Steve A');
要搜索的字符串:- 'Smith A Steve'
SELECT col_str,PATINDEX('%Smith%',col_str) [Smith],PATINDEX('%A%',col_str) [A],PATINDEX('%Steve%',col_str) [Steve]
FROM tbl_pat_soundex
获取输出:
col_str Smith A Steve
---------------------------------
Smith A Steve 1 7 9
Steve A Smyth 0 7 1
A Smeeth Stive 0 1 0
Steve Smith A 7 13 1
Smit Steve A 0 12 6
预期输出:
col_str Smith A Steve
---------------------------------
Smith A Steve 1 7 9
Steve A Smyth 9 7 1
A Smeeth Stive 3 1 10
Steve Smith A 7 13 1
Smit Steve A 1 12 6
尝试过:
SELECT col_str,
PATINDEX('%'+soundex('Smith')+'%',soundex(col_str)) [Smith],
PATINDEX('%'+soundex('A')+'%',soundex(col_str)) [A],
PATINDEX('%'+soundex('Steve')+'%',soundex(col_str)) [Steve]
FROM tbl_pat_soundex
但是得到意想不到的结果:
col_str Smith A Steve
---------------------------------
Smith A Steve 1 0 0
Steve A Smyth 0 0 1
A Smeeth Stive 0 1 0
Steve Smith A 0 0 1
Smit Steve A 1 0 0
注意:我在 table 中有 100 Millions
条记录要搜索。
这是一个选项,考虑到您需要做的所有事情,不确定它对 1 亿条记录的表现如何。你必须测试一下。
在高层次上我的理解是你基本上需要的
- 根据另一个字符串的词搜索一个字符串中的所有词
- 返回原始字符串中该词等于或听起来像搜索词的字符起始位置。
可以用DIFFERENCE()进行比较:
DIFFERENCE compares two different SOUNDEX values, and returns an integer value. This value measures the degree that the SOUNDEX values match, on a scale of 0 to 4. A value of 0 indicates weak or no similarity between the SOUNDEX values; 4 indicates strongly similar, or even identically matching, SOUNDEX values.
您需要根据 space ' ' 拆分字符串,并且由于您是 2008 年,因此您必须推出自己的函数。
我使用了这里的 XML 函数,https://sqlperformance.com/2012/07/t-sql-queries/split-strings,对于我的例子,如果你有自己的或者想使用不同的东西,你显然需要进行调整:
CREATE FUNCTION dbo.SplitStrings_XML
(
@List NVARCHAR(MAX),
@Delimiter NVARCHAR(255)
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
(
SELECT Item = y.i.value('(./text())[1]', 'nvarchar(4000)')
FROM
(
SELECT x = CONVERT(XML, '<i>'
+ REPLACE(@List, @Delimiter, '</i><i>')
+ '</i>').query('.')
) AS a CROSS APPLY x.nodes('i') AS y(i)
);
GO
我切换并使用 table 变量来显示示例,我建议不要使用您拥有的数据量来创建和使用物理 tables。
选项 1 - 非动态:
DECLARE @tbl_pat_soundex TABLE
(
[col_str] VARCHAR(MAX)
);
INSERT INTO @tbl_pat_soundex
VALUES ( 'Smith A Steve' )
,( 'Steve A Smyth' )
,( 'A Smeeth Stive' )
,( 'Steve Smith A' )
,( 'Smit Steve A' )
SELECT DISTINCT [aa].[col_str]
, MAX([aa].[Smith]) OVER ( PARTITION BY [aa].[col_str] ) AS [Smith]
, MAX([aa].[A]) OVER ( PARTITION BY [aa].[col_str] ) AS [A]
, MAX([aa].[Steve]) OVER ( PARTITION BY [aa].[col_str] ) AS [Steve]
FROM (
SELECT [a].[col_str]
, CASE WHEN DIFFERENCE([b].[item], 'Smith') = 4 THEN
CHARINDEX([b].[item], [a].[col_str])
ELSE 0
END AS [Smith]
, CASE WHEN DIFFERENCE([b].[item], 'A') = 4 THEN
CHARINDEX([b].[item], [a].[col_str])
ELSE 0
END AS [A]
, CASE WHEN DIFFERENCE([b].[item], 'Steve') = 4 THEN
CHARINDEX([b].[item], [a].[col_str])
ELSE 0
END AS [Steve]
FROM @tbl_pat_soundex [a]
CROSS APPLY [dbo].[SplitStrings_XML]([a].[col_str], ' ') [b]
) AS [aa];
- 使用该函数,我们将字符串拆分为单个单词
- 然后我们使用 case 语句来检查 DIFFERENCE 值
- 如果 DIFFERENCE 值等于 4,那么我们 return 原始单词对字符串的 CHARINDEX 值。
- 如果不等于我们 return 0
然后就是根据原始字符串获取每个值的最大值:
, MAX([aa].[Smith]) OVER ( PARTITION BY [aa].[col_str] ) AS [Smith]
, MAX([aa].[A]) OVER ( PARTITION BY [aa].[col_str] ) AS [A]
, MAX([aa].[Steve]) OVER ( PARTITION BY [aa].[col_str] ) AS [Steve]
为您提供最终结果:
选项 2 - 带枢轴的动态:
我们将声明要搜索的字符串,将其拆分并搜索原始字符串中的那些单独的单词,然后转换结果。
--This example is using global temp tables as it's showing how
--to build a dynamic pivot
IF OBJECT_ID('tempdb..##tbl_pat_soundex') IS NOT NULL
DROP TABLE [##tbl_pat_soundex];
IF OBJECT_ID('tempdb..##tbl_col_str_SearchString') IS NOT NULL
DROP TABLE [##tbl_col_str_SearchString];
CREATE TABLE [##tbl_pat_soundex]
(
[col_str] VARCHAR(MAX)
);
INSERT INTO [##tbl_pat_soundex]
VALUES ( 'Smith A Steve' )
, ( 'Steve A Smyth' )
, ( 'A Smeeth Stive' )
, ( 'Steve Smith A' )
, ( 'Smit Steve A' );
--What are you searching for?
DECLARE @SearchString NVARCHAR(200);
SET @SearchString = N'Smith A Steve';
--We build a table we load with every combination of the words from the string and the words from the SearchString for easier comparison.
CREATE TABLE [##tbl_col_str_SearchString]
(
[col_str] NVARCHAR(MAX)
, [col_str_value] NVARCHAR(MAX)
, [SearchValue] NVARCHAR(200)
);
--Load that table for comparison
--split our original string into individual words
--also split our search string into individual words and give me all combinations.
INSERT INTO [##tbl_col_str_SearchString] (
[col_str]
, [col_str_value]
, [SearchValue]
)
SELECT DISTINCT [a].[col_str]
, [b].[item]
, [c].[item]
FROM [##tbl_pat_soundex] [a]
CROSS APPLY [dbo].[SplitStrings_XML]([a].[col_str], ' ') [b]
CROSS APPLY [dbo].[SplitStrings_XML](@SearchString, ' ') [c]
ORDER BY [a].[col_str];
--Then we can easily compare each word and search word for those that match or sound alike using DIFFERNCE()
SELECT [col_str], [col_str_value], [SearchValue], CASE WHEN DIFFERENCE([col_str_value], [SearchValue]) = 4 THEN CHARINDEX([col_str_value], [col_str]) ELSE 0 END AS [Match] FROM ##tbl_col_str_SearchString
--Then we can pivot on it
--and we will need to make it dynamic since we are not sure what what @SearchString could be.
DECLARE @PivotSQL NVARCHAR(MAX);
DECLARE @pivotColumn NVARCHAR(MAX);
SET @pivotColumn = N'[' + REPLACE(@SearchString, ' ', '],[') + N']';
SET @PivotSQL = N'SELECT * FROM (
SELECT [col_str], [SearchValue], CASE WHEN DIFFERENCE([col_str_value], [SearchValue]) = 4 THEN CHARINDEX([col_str_value], [col_str]) ELSE 0 END AS [Match] FROM ##tbl_col_str_SearchString
) aa
PIVOT (MAX([Match]) FOR [SearchValue] IN (' + @pivotColumn
+ N')) AS MaxMatch
ORDER BY [MaxMatch].[col_str]
';
--Giving us the final results.
EXEC sp_executesql @PivotSQL