如果一列中的所有单词都包含在另一列中,则匹配两列
Match two columns if all the words in one are contained in the other
我正在尝试把 2 列(分别位于 2 个单独的表中)关联起来:如果其中一列的每个单词都包含在另一列中,它们就算匹配。
例如,应匹配以下值:
Paul Smith|Paul Andrew Smith
Paul Smith|Paul Andrew William Smith
Paul William Smith|Paul Andrew William Smith
Paul Andrew Smith|Paul Smith
但不应匹配以下内容:
Paul William Smith|Paul Andrew Smith
我正在使用 SQL Server 2016。
我想用一条 SELECT 查询来完成此操作。我有一个模糊的想法:使用 string_split 函数(按空格拆分),对 2 个表进行 CROSS APPLY,然后使用 MAX 函数;
但即使只处理几千个名字,这也会产生数百万行,所以效率不会很高。
示例数据:
-- Rebuild both sample tables from scratch so the script can be re-run.
DROP TABLE IF EXISTS #TEMP1;
DROP TABLE IF EXISTS #TEMP2;

-- Each table holds one free-text name per row.
CREATE TABLE #TEMP1 (NAME NVARCHAR(300));
CREATE TABLE #TEMP2 (NAME NVARCHAR(300));

-- Names on the #TEMP1 side of the comparison.
INSERT INTO #TEMP1 (NAME)
VALUES
    ('Paul Smith'),
    ('Amy Nicholas Stanton'),
    ('Andrew James Thomas');

-- Names on the #TEMP2 side of the comparison.
INSERT INTO #TEMP2 (NAME)
VALUES
    ('Paul Andrew Smith'),
    ('Amy Stanton'),
    ('Andrew Marcus Thomas');
所以从示例数据来看,前 2 行应该匹配,而第 3 行不应该匹配。
编辑:我已经将我模糊的想法付诸实践,以下解决方案有效,但正如我所料,当您处理包含数千行的表时它真的很慢。
-- Brute-force matcher: returns each (#TEMP1 name, #TEMP2 name) pair where
-- every word of one name is found among the words of the other.
-- The alias A is re-used at three nesting levels; each use refers to the
-- derived table closed directly before it.
SELECT DISTINCT A.[FIRSTNAME],A.[SECONDNAME]
FROM (
-- Middle level: collapse the per-word flags into per-pair flags.
SELECT *
-- FM = 1 only when every word of FIRSTNAME was found in this #TEMP2 name.
,MIN([FIRSTMATCH]) OVER(PARTITION BY [SRN],[FIRSTNAME]) [FM]
-- SM = 1 only when every word of SECONDNAME was found in this #TEMP1 name.
,MIN([SECONDMATCH]) OVER(PARTITION BY [FRN],[SECONDNAME]) [SM]
FROM (
-- Inner level: one row per (word of a #TEMP1 name, word of a #TEMP2 name).
SELECT DISTINCT A.NAME [FIRSTNAME]
,B.NAME [SECONDNAME]
,A.value [FIRSTVAL]
-- 1 when this #TEMP1 word equals at least one word of the #TEMP2 name ranked B.RN.
,MAX(IIF(A.VALUE=B.VALUE,1,0)) OVER(PARTITION BY A.VALUE,B.RN) [FIRSTMATCH]
,B.value [SECONDVAL]
-- 1 when this #TEMP2 word equals at least one word of the #TEMP1 name ranked A.RN.
,MAX(IIF(B.VALUE=A.VALUE,1,0)) OVER(PARTITION BY B.VALUE,A.RN) [SECONDMATCH]
,A.RN [FRN]
,B.RN [SRN]
FROM (
-- Words of each non-blank #TEMP1 name; RN numbers the distinct NAME values.
SELECT DISTINCT NAME, DENSE_RANK() OVER(ORDER BY NAME) [RN],value
FROM #TEMP1
CROSS APPLY STRING_SPLIT(LTRIM(RTRIM(NAME)),' ')
WHERE LTRIM(RTRIM(NAME)) !=''
)A
-- This APPLY does not reference A, so it pairs every #TEMP1 word row with
-- every #TEMP2 word row; the intermediate row count is the product of the two.
CROSS APPLY(
-- Words of each non-blank #TEMP2 name; RN numbers the distinct NAME values.
SELECT DISTINCT NAME, DENSE_RANK() OVER(ORDER BY NAME) [RN],value
FROM #TEMP2
CROSS APPLY STRING_SPLIT(LTRIM(RTRIM(NAME)),' ')
WHERE LTRIM(RTRIM(NAME)) !=''
)B
)A
)A
-- Keep a pair when either name is fully contained in the other.
WHERE A.SM = 1 OR A.FM = 1
您可以拆分字符串并聚合。假设没有任何名称包含重复的部分:
-- Pair each #TEMP1 name with each #TEMP2 name when all the words of one name
-- occur in the other, in either direction.
-- Assumes no name repeats a word and no NAME value occurs twice in a table;
-- either would inflate the counts compared in the HAVING clause.
;with n1 as (
    -- One row per word of each #TEMP1 name, tagged with that name's word count.
    select
        t1.NAME as name,
        s.value as part,
        count(s.value) over (partition by t1.NAME) as num_parts
    from #TEMP1 as t1
    cross apply string_split(t1.NAME, ' ') as s
),
n2 as (
    -- One row per word of each #TEMP2 name, tagged with that name's word count.
    select
        t2.NAME as name,
        s.value as part,
        count(s.value) over (partition by t2.NAME) as num_parts
    from #TEMP2 as t2
    cross apply string_split(t2.NAME, ' ') as s
)
select
    n1.name,
    n2.name
from n1
inner join n2
    on n1.part = n2.part
group by
    n1.name,
    n2.name,
    n1.num_parts,
    n2.num_parts
-- count(*) is the number of words the two names share; the pair matches when
-- the shared words cover all of the #TEMP1 name or all of the #TEMP2 name.
having count(*) = n1.num_parts
    or count(*) = n2.num_parts;
这里是一个 db<>fiddle。
基于 Gordon Linoff 的回答,这似乎可行:
-- Pair each #TEMP1 name with each #TEMP2 name when every distinct word of one
-- name occurs in the other, in either direction.
;WITH N1 AS (
    -- Distinct (name, word) pairs from #TEMP1 plus the distinct-word count per name.
    SELECT
        W.NAME,
        W.PART,
        COUNT(*) OVER (PARTITION BY W.NAME) AS NUM_PARTS
    FROM (
        SELECT DISTINCT
            T1.NAME,
            S.value AS PART
        FROM #TEMP1 AS T1
        CROSS APPLY STRING_SPLIT(T1.NAME, ' ') AS S
        -- Leading, trailing or doubled spaces make STRING_SPLIT emit empty
        -- tokens; they are not words and must not be counted.
        WHERE S.value <> ''
    ) AS W
),
N2 AS (
    -- Distinct (name, word) pairs from #TEMP2 plus the distinct-word count per name.
    SELECT
        W.NAME,
        W.PART,
        COUNT(*) OVER (PARTITION BY W.NAME) AS NUM_PARTS
    FROM (
        SELECT DISTINCT
            T2.NAME,
            S.value AS PART
        FROM #TEMP2 AS T2
        CROSS APPLY STRING_SPLIT(T2.NAME, ' ') AS S
        -- Same empty-token filter as for #TEMP1.
        WHERE S.value <> ''
    ) AS W
)
SELECT
    N1.NAME,
    N2.NAME
FROM N1
INNER JOIN N2
    ON N1.PART = N2.PART
GROUP BY
    N1.NAME,
    N2.NAME,
    N1.NUM_PARTS,
    N2.NUM_PARTS
-- COUNT(*) is the number of distinct words the two names share; the pair
-- matches when that equals the full word count of either name.
HAVING COUNT(*) = N1.NUM_PARTS
    OR COUNT(*) = N2.NUM_PARTS;
我正在尝试把 2 列(分别位于 2 个单独的表中)关联起来:如果其中一列的每个单词都包含在另一列中,它们就算匹配。
例如,应匹配以下值:
Paul Smith|Paul Andrew Smith
Paul Smith|Paul Andrew William Smith
Paul William Smith|Paul Andrew William Smith
Paul Andrew Smith|Paul Smith
但不应匹配以下内容:
Paul William Smith|Paul Andrew Smith
我正在使用 SQL Server 2016。
我想用一条 SELECT 查询来完成此操作。我有一个模糊的想法:使用 string_split 函数(按空格拆分),对 2 个表进行 CROSS APPLY,然后使用 MAX 函数;
但即使只处理几千个名字,这也会产生数百万行,所以效率不会很高。
示例数据:
-- Rebuild both sample tables from scratch so the script can be re-run.
DROP TABLE IF EXISTS #TEMP1;
DROP TABLE IF EXISTS #TEMP2;

-- Each table holds one free-text name per row.
CREATE TABLE #TEMP1 (NAME NVARCHAR(300));
CREATE TABLE #TEMP2 (NAME NVARCHAR(300));

-- Names on the #TEMP1 side of the comparison.
INSERT INTO #TEMP1 (NAME)
VALUES
    ('Paul Smith'),
    ('Amy Nicholas Stanton'),
    ('Andrew James Thomas');

-- Names on the #TEMP2 side of the comparison.
INSERT INTO #TEMP2 (NAME)
VALUES
    ('Paul Andrew Smith'),
    ('Amy Stanton'),
    ('Andrew Marcus Thomas');
所以从示例数据来看,前 2 行应该匹配,而第 3 行不应该匹配。
编辑:我已经将我模糊的想法付诸实践,以下解决方案有效,但正如我所料,当您处理包含数千行的表时它真的很慢。
-- Brute-force matcher: returns each (#TEMP1 name, #TEMP2 name) pair where
-- every word of one name is found among the words of the other.
-- The alias A is re-used at three nesting levels; each use refers to the
-- derived table closed directly before it.
SELECT DISTINCT A.[FIRSTNAME],A.[SECONDNAME]
FROM (
-- Middle level: collapse the per-word flags into per-pair flags.
SELECT *
-- FM = 1 only when every word of FIRSTNAME was found in this #TEMP2 name.
,MIN([FIRSTMATCH]) OVER(PARTITION BY [SRN],[FIRSTNAME]) [FM]
-- SM = 1 only when every word of SECONDNAME was found in this #TEMP1 name.
,MIN([SECONDMATCH]) OVER(PARTITION BY [FRN],[SECONDNAME]) [SM]
FROM (
-- Inner level: one row per (word of a #TEMP1 name, word of a #TEMP2 name).
SELECT DISTINCT A.NAME [FIRSTNAME]
,B.NAME [SECONDNAME]
,A.value [FIRSTVAL]
-- 1 when this #TEMP1 word equals at least one word of the #TEMP2 name ranked B.RN.
,MAX(IIF(A.VALUE=B.VALUE,1,0)) OVER(PARTITION BY A.VALUE,B.RN) [FIRSTMATCH]
,B.value [SECONDVAL]
-- 1 when this #TEMP2 word equals at least one word of the #TEMP1 name ranked A.RN.
,MAX(IIF(B.VALUE=A.VALUE,1,0)) OVER(PARTITION BY B.VALUE,A.RN) [SECONDMATCH]
,A.RN [FRN]
,B.RN [SRN]
FROM (
-- Words of each non-blank #TEMP1 name; RN numbers the distinct NAME values.
SELECT DISTINCT NAME, DENSE_RANK() OVER(ORDER BY NAME) [RN],value
FROM #TEMP1
CROSS APPLY STRING_SPLIT(LTRIM(RTRIM(NAME)),' ')
WHERE LTRIM(RTRIM(NAME)) !=''
)A
-- This APPLY does not reference A, so it pairs every #TEMP1 word row with
-- every #TEMP2 word row; the intermediate row count is the product of the two.
CROSS APPLY(
-- Words of each non-blank #TEMP2 name; RN numbers the distinct NAME values.
SELECT DISTINCT NAME, DENSE_RANK() OVER(ORDER BY NAME) [RN],value
FROM #TEMP2
CROSS APPLY STRING_SPLIT(LTRIM(RTRIM(NAME)),' ')
WHERE LTRIM(RTRIM(NAME)) !=''
)B
)A
)A
-- Keep a pair when either name is fully contained in the other.
WHERE A.SM = 1 OR A.FM = 1
您可以拆分字符串并聚合。假设没有任何名称包含重复的部分:
-- Pair each #TEMP1 name with each #TEMP2 name when all the words of one name
-- occur in the other, in either direction.
-- Assumes no name repeats a word and no NAME value occurs twice in a table;
-- either would inflate the counts compared in the HAVING clause.
;with n1 as (
    -- One row per word of each #TEMP1 name, tagged with that name's word count.
    select
        t1.NAME as name,
        s.value as part,
        count(s.value) over (partition by t1.NAME) as num_parts
    from #TEMP1 as t1
    cross apply string_split(t1.NAME, ' ') as s
),
n2 as (
    -- One row per word of each #TEMP2 name, tagged with that name's word count.
    select
        t2.NAME as name,
        s.value as part,
        count(s.value) over (partition by t2.NAME) as num_parts
    from #TEMP2 as t2
    cross apply string_split(t2.NAME, ' ') as s
)
select
    n1.name,
    n2.name
from n1
inner join n2
    on n1.part = n2.part
group by
    n1.name,
    n2.name,
    n1.num_parts,
    n2.num_parts
-- count(*) is the number of words the two names share; the pair matches when
-- the shared words cover all of the #TEMP1 name or all of the #TEMP2 name.
having count(*) = n1.num_parts
    or count(*) = n2.num_parts;
这里是一个 db<>fiddle。
基于 Gordon Linoff 的回答,这似乎可行:
-- Pair each #TEMP1 name with each #TEMP2 name when every distinct word of one
-- name occurs in the other, in either direction.
;WITH N1 AS (
    -- Distinct (name, word) pairs from #TEMP1 plus the distinct-word count per name.
    SELECT
        W.NAME,
        W.PART,
        COUNT(*) OVER (PARTITION BY W.NAME) AS NUM_PARTS
    FROM (
        SELECT DISTINCT
            T1.NAME,
            S.value AS PART
        FROM #TEMP1 AS T1
        CROSS APPLY STRING_SPLIT(T1.NAME, ' ') AS S
        -- Leading, trailing or doubled spaces make STRING_SPLIT emit empty
        -- tokens; they are not words and must not be counted.
        WHERE S.value <> ''
    ) AS W
),
N2 AS (
    -- Distinct (name, word) pairs from #TEMP2 plus the distinct-word count per name.
    SELECT
        W.NAME,
        W.PART,
        COUNT(*) OVER (PARTITION BY W.NAME) AS NUM_PARTS
    FROM (
        SELECT DISTINCT
            T2.NAME,
            S.value AS PART
        FROM #TEMP2 AS T2
        CROSS APPLY STRING_SPLIT(T2.NAME, ' ') AS S
        -- Same empty-token filter as for #TEMP1.
        WHERE S.value <> ''
    ) AS W
)
SELECT
    N1.NAME,
    N2.NAME
FROM N1
INNER JOIN N2
    ON N1.PART = N2.PART
GROUP BY
    N1.NAME,
    N2.NAME,
    N1.NUM_PARTS,
    N2.NUM_PARTS
-- COUNT(*) is the number of distinct words the two names share; the pair
-- matches when that equals the full word count of either name.
HAVING COUNT(*) = N1.NUM_PARTS
    OR COUNT(*) = N2.NUM_PARTS;