我需要在 SQL Server 中找出与给定字符串包含相同关键字(不要求顺序一致)的字符串
I need to identify strings, in sql server, that contain the same keywords as a given string in no particular order
我查看了类似的问题,但我不想使用全文搜索,也没有找到其他符合我需求的方案,所以我写了下面这段代码,它返回一对字符串之间(名义上的)单词匹配百分比。
它的工作原理是:先删除"噪声"(noise)词和标点符号,然后把剩余的单词放入表变量(两个字符串各一个),再将两个表连接起来并统计连接结果的行数。最后用连接行数除以其中一个字符串的单词数,得到一个(名义上的)百分比。它工作正常,但是当我在 SELECT 中用它从几千个字符串里找出 5 个最佳匹配时,速度有点慢!
所以我需要更好的方法或任何好的想法来提高速度...
-- COMPARETITLESFUNC
-- Purpose : nominal word-match percentage between two titles.
-- Params  : @title1 - reference title; its significant words form the denominator.
--           @title2 - candidate title that @title1 is compared against.
-- Returns : int 0..100 = percentage of significant words of @title1 that also
--           occur in @title2. Returns 0 when @title1 has no significant words
--           (NULL, empty, punctuation only, or noise words only).
-- Method  : both titles are lower-cased, listed punctuation is turned into the
--           separator, the text is split on the separator, and noise words and
--           empty tokens are dropped before the words are compared.
CREATE FUNCTION [GLOBAL].[COMPARETITLESFUNC]
(
    @title1 nvarchar(4000), @title2 nvarchar(4000)
)
RETURNS int
AS
BEGIN
    DECLARE @matched int, @totalwordcount int, @src int, @pos int;
    DECLARE @S nvarchar(10);
    DECLARE @text nvarchar(4000), @word nvarchar(4000);
    -- src = 1 holds the words of @title1, src = 2 the words of @title2
    DECLARE @words TABLE (src int, word nvarchar(4000));

    SET @S = N' ';  -- separator; the split below advances by one character, so keep it a single character
    SET @src = 1;

    -- Same normalise-and-split pass for both titles.
    WHILE @src <= 2
    BEGIN
        SET @text = CASE @src WHEN 1 THEN @title1 ELSE @title2 END;

        -- Turn unwanted punctuation into separators.
        SET @text = REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
                    REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@text,
                        N':', @S), N';', @S), N',', @S), N'!', @S), N'-', @S), N'?', @S),
                        N'.', @S), N'%', @S), N'$', @S), N'&', @S), N'£', @S), N'"', @S);
        SET @text = LTRIM(RTRIM(LOWER(@text)));

        -- Peel off one token per iteration. No trailing separator is appended,
        -- so a 4000-character title cannot be truncated and lose its last word.
        -- A NULL title makes LEN() NULL and the loop is skipped.
        WHILE LEN(@text) > 0
        BEGIN
            SET @pos = CHARINDEX(@S, @text);
            IF @pos = 0 SET @pos = LEN(@text) + 1;  -- last token: no separator follows it

            SET @word = LTRIM(RTRIM(LEFT(@text, @pos - 1)));

            -- Skip empty tokens (runs of separators) and noise words at insert time.
            IF @word <> N''
               AND @word NOT IN (N'&', N'a', N'and', N'at', N'by', N'can', N'for', N'if', N'in', N'is', N'it',
                                 N'of', N'on', N'or', N'the', N'this', N'to', N'too', N'verizon', N'with', N'your')
                INSERT INTO @words (src, word) VALUES (@src, @word);

            SET @text = SUBSTRING(@text, @pos + 1, 4000);
        END;

        SET @src = @src + 1;
    END;

    SELECT @totalwordcount = COUNT(*)
    FROM @words
    WHERE src = 1;

    -- Count each word of @title1 at most once, however often it repeats in
    -- @title2, so the result cannot exceed 100.
    SELECT @matched = COUNT(*)
    FROM @words AS w1
    WHERE w1.src = 1
      AND EXISTS (SELECT * FROM @words AS w2 WHERE w2.src = 2 AND w2.word = w1.word);

    -- NULLIF avoids a divide-by-zero error when @title1 has no significant words.
    RETURN COALESCE((@matched * 100) / NULLIF(@totalwordcount, 0), 0);
END
无论如何,即使没有人能提出任何改进,我想至少这段代码可以供有类似需求但不想使用 FTS 或任何第三方库的任何其他人使用。
你的做法是逐行处理("row based")的。下面是一种基于集合的方法:代码更少、更易维护,也更快……
-- Set-based comparison of two phrases: splits both into distinct, lower-cased
-- words, removes noise words, and reports the word counts, the overlap, the
-- overlap as a percentage of each phrase, and the list of common words.

-- Noise words that are ignored in both phrases.
DECLARE @forbiddenWords TABLE(item VARCHAR(100));
INSERT INTO @forbiddenWords (item) VALUES ('&'),( 'a'),( 'and'),( 'at'),( 'by'),( 'can'),( 'for'),( 'if'),( 'in'),( 'is'),( 'it'),( 'of'),( 'on'),( 'or'),( 'the'),( 'this'),( 'to'),( 'too'),( 'verizon'),( 'with'),( 'your');

-- Punctuation that is treated as a word separator.
DECLARE @breakingCharacters TABLE(item VARCHAR(100));
INSERT INTO @breakingCharacters (item) VALUES(':'),(';'),(','),('!'),('-'),('?'),('.'),('%'),('$'),('&'),('£'),('"');

DECLARE @Phrase1 VARCHAR(MAX)='This is a text where I try to find similar words. Let''s see if it works!';
DECLARE @Phrase2 VARCHAR(MAX)='This is another text where I use some words of Phrase1 to check their similarity!';

-- Replace all breaking characters with a blank.
-- NOTE(review): this relies on SQL Server applying the assignment once per row
-- of @breakingCharacters; confirm on the target version.
SELECT @Phrase1=REPLACE(@Phrase1,item,' ')
FROM @breakingCharacters;
SELECT @Phrase2=REPLACE(@Phrase2,item,' ')
FROM @breakingCharacters;

WITH Splitted AS
(
    -- Each blank becomes an element boundary. The inner FOR XML PATH('')
    -- entity-escapes the phrase first, so characters such as '<' cannot break
    -- the cast to xml.
    SELECT CAST('<x>' + REPLACE((SELECT LOWER(@Phrase1) AS [*] FOR XML PATH('')),' ','</x><x>') + '</x>' AS xml) AS Phrase1AsXml
          ,CAST('<x>' + REPLACE((SELECT LOWER(@Phrase2) AS [*] FOR XML PATH('')),' ','</x><x>') + '</x>' AS xml) AS Phrase2AsXml
)
,Phrase1AsFilteredWords AS
(
    -- Distinct, non-empty, non-noise words of phrase 1.
    SELECT DISTINCT w.OneWord
    FROM Splitted
    CROSS APPLY Phrase1AsXml.nodes('/x') AS The(word)
    CROSS APPLY (SELECT The.word.value('.','varchar(max)') AS OneWord) AS w
    WHERE LEN(w.OneWord)>0
      AND NOT EXISTS(SELECT * FROM @forbiddenWords AS fw WHERE fw.item = w.OneWord)
)
,Phrase2AsFilteredWords AS
(
    -- Distinct, non-empty, non-noise words of phrase 2.
    SELECT DISTINCT w.OneWord
    FROM Splitted
    CROSS APPLY Phrase2AsXml.nodes('/x') AS The(word)
    CROSS APPLY (SELECT The.word.value('.','varchar(max)') AS OneWord) AS w
    WHERE LEN(w.OneWord)>0
      AND NOT EXISTS(SELECT * FROM @forbiddenWords AS fw WHERE fw.item = w.OneWord)
)
,CommonWords AS
(
    -- Words that occur in both phrases.
    SELECT p1.OneWord
    FROM Phrase1AsFilteredWords AS p1
    INNER JOIN Phrase2AsFilteredWords AS p2 ON p1.OneWord=p2.OneWord
)
,WordCounter AS
(
    SELECT
     (SELECT COUNT(*) FROM Phrase1AsFilteredWords) AS CountPhrase1
    ,(SELECT COUNT(*) FROM Phrase2AsFilteredWords) AS CountPhrase2
    ,(SELECT COUNT(*) FROM CommonWords) AS CountCommon
)
SELECT WordCounter.*
      -- NULLIF yields NULL instead of a divide-by-zero error when a phrase has no significant words.
      ,(CountCommon*100) / NULLIF(CountPhrase1,0) AS Phrase1PC
      ,(CountCommon*100) / NULLIF(CountPhrase2,0) AS Phrase2PC
      -- TYPE + .value() returns the list un-escaped; ORDER BY makes it deterministic.
      ,STUFF((
                SELECT ', ' + OneWord
                FROM CommonWords
                ORDER BY OneWord
                FOR XML PATH(''), TYPE
             ).value('.','varchar(max)'),1,2,'') AS CommonWords
FROM WordCounter;
结果:
CountPhrase1 CountPhrase2 CountCommon Phrase1PC Phrase2PC CommonWords
10 11 4 40 36 i, text, where, words
一个提示:如果你要做多对多的比较,那么每次都重新计算会非常耗时。我建议你先一次性把所有短语预处理好,然后再比较这些预处理后的结果……
还有一个提示:如果你经常这样做并且你的短语没有改变,那么永久存储准备好的单词列表可能会很聪明。
编码愉快!
我查看了类似的问题,但我不想使用全文搜索,也没有找到其他符合我需求的方案,所以我写了下面这段代码,它返回一对字符串之间(名义上的)单词匹配百分比。
它的工作原理是:先删除"噪声"(noise)词和标点符号,然后把剩余的单词放入表变量(两个字符串各一个),再将两个表连接起来并统计连接结果的行数。最后用连接行数除以其中一个字符串的单词数,得到一个(名义上的)百分比。它工作正常,但是当我在 SELECT 中用它从几千个字符串里找出 5 个最佳匹配时,速度有点慢!
所以我需要更好的方法或任何好的想法来提高速度...
-- COMPARETITLESFUNC
-- Purpose : nominal word-match percentage between two titles.
-- Params  : @title1 - reference title; its significant words form the denominator.
--           @title2 - candidate title that @title1 is compared against.
-- Returns : int 0..100 = percentage of significant words of @title1 that also
--           occur in @title2. Returns 0 when @title1 has no significant words
--           (NULL, empty, punctuation only, or noise words only).
-- Method  : both titles are lower-cased, listed punctuation is turned into the
--           separator, the text is split on the separator, and noise words and
--           empty tokens are dropped before the words are compared.
CREATE FUNCTION [GLOBAL].[COMPARETITLESFUNC]
(
    @title1 nvarchar(4000), @title2 nvarchar(4000)
)
RETURNS int
AS
BEGIN
    DECLARE @matched int, @totalwordcount int, @src int, @pos int;
    DECLARE @S nvarchar(10);
    DECLARE @text nvarchar(4000), @word nvarchar(4000);
    -- src = 1 holds the words of @title1, src = 2 the words of @title2
    DECLARE @words TABLE (src int, word nvarchar(4000));

    SET @S = N' ';  -- separator; the split below advances by one character, so keep it a single character
    SET @src = 1;

    -- Same normalise-and-split pass for both titles.
    WHILE @src <= 2
    BEGIN
        SET @text = CASE @src WHEN 1 THEN @title1 ELSE @title2 END;

        -- Turn unwanted punctuation into separators.
        SET @text = REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
                    REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(@text,
                        N':', @S), N';', @S), N',', @S), N'!', @S), N'-', @S), N'?', @S),
                        N'.', @S), N'%', @S), N'$', @S), N'&', @S), N'£', @S), N'"', @S);
        SET @text = LTRIM(RTRIM(LOWER(@text)));

        -- Peel off one token per iteration. No trailing separator is appended,
        -- so a 4000-character title cannot be truncated and lose its last word.
        -- A NULL title makes LEN() NULL and the loop is skipped.
        WHILE LEN(@text) > 0
        BEGIN
            SET @pos = CHARINDEX(@S, @text);
            IF @pos = 0 SET @pos = LEN(@text) + 1;  -- last token: no separator follows it

            SET @word = LTRIM(RTRIM(LEFT(@text, @pos - 1)));

            -- Skip empty tokens (runs of separators) and noise words at insert time.
            IF @word <> N''
               AND @word NOT IN (N'&', N'a', N'and', N'at', N'by', N'can', N'for', N'if', N'in', N'is', N'it',
                                 N'of', N'on', N'or', N'the', N'this', N'to', N'too', N'verizon', N'with', N'your')
                INSERT INTO @words (src, word) VALUES (@src, @word);

            SET @text = SUBSTRING(@text, @pos + 1, 4000);
        END;

        SET @src = @src + 1;
    END;

    SELECT @totalwordcount = COUNT(*)
    FROM @words
    WHERE src = 1;

    -- Count each word of @title1 at most once, however often it repeats in
    -- @title2, so the result cannot exceed 100.
    SELECT @matched = COUNT(*)
    FROM @words AS w1
    WHERE w1.src = 1
      AND EXISTS (SELECT * FROM @words AS w2 WHERE w2.src = 2 AND w2.word = w1.word);

    -- NULLIF avoids a divide-by-zero error when @title1 has no significant words.
    RETURN COALESCE((@matched * 100) / NULLIF(@totalwordcount, 0), 0);
END
无论如何,即使没有人能提出任何改进,我想至少这段代码可以供有类似需求但不想使用 FTS 或任何第三方库的任何其他人使用。
你的做法是逐行处理("row based")的。下面是一种基于集合的方法:代码更少、更易维护,也更快……
-- Set-based comparison of two phrases: splits both into distinct, lower-cased
-- words, removes noise words, and reports the word counts, the overlap, the
-- overlap as a percentage of each phrase, and the list of common words.

-- Noise words that are ignored in both phrases.
DECLARE @forbiddenWords TABLE(item VARCHAR(100));
INSERT INTO @forbiddenWords (item) VALUES ('&'),( 'a'),( 'and'),( 'at'),( 'by'),( 'can'),( 'for'),( 'if'),( 'in'),( 'is'),( 'it'),( 'of'),( 'on'),( 'or'),( 'the'),( 'this'),( 'to'),( 'too'),( 'verizon'),( 'with'),( 'your');

-- Punctuation that is treated as a word separator.
DECLARE @breakingCharacters TABLE(item VARCHAR(100));
INSERT INTO @breakingCharacters (item) VALUES(':'),(';'),(','),('!'),('-'),('?'),('.'),('%'),('$'),('&'),('£'),('"');

DECLARE @Phrase1 VARCHAR(MAX)='This is a text where I try to find similar words. Let''s see if it works!';
DECLARE @Phrase2 VARCHAR(MAX)='This is another text where I use some words of Phrase1 to check their similarity!';

-- Replace all breaking characters with a blank.
-- NOTE(review): this relies on SQL Server applying the assignment once per row
-- of @breakingCharacters; confirm on the target version.
SELECT @Phrase1=REPLACE(@Phrase1,item,' ')
FROM @breakingCharacters;
SELECT @Phrase2=REPLACE(@Phrase2,item,' ')
FROM @breakingCharacters;

WITH Splitted AS
(
    -- Each blank becomes an element boundary. The inner FOR XML PATH('')
    -- entity-escapes the phrase first, so characters such as '<' cannot break
    -- the cast to xml.
    SELECT CAST('<x>' + REPLACE((SELECT LOWER(@Phrase1) AS [*] FOR XML PATH('')),' ','</x><x>') + '</x>' AS xml) AS Phrase1AsXml
          ,CAST('<x>' + REPLACE((SELECT LOWER(@Phrase2) AS [*] FOR XML PATH('')),' ','</x><x>') + '</x>' AS xml) AS Phrase2AsXml
)
,Phrase1AsFilteredWords AS
(
    -- Distinct, non-empty, non-noise words of phrase 1.
    SELECT DISTINCT w.OneWord
    FROM Splitted
    CROSS APPLY Phrase1AsXml.nodes('/x') AS The(word)
    CROSS APPLY (SELECT The.word.value('.','varchar(max)') AS OneWord) AS w
    WHERE LEN(w.OneWord)>0
      AND NOT EXISTS(SELECT * FROM @forbiddenWords AS fw WHERE fw.item = w.OneWord)
)
,Phrase2AsFilteredWords AS
(
    -- Distinct, non-empty, non-noise words of phrase 2.
    SELECT DISTINCT w.OneWord
    FROM Splitted
    CROSS APPLY Phrase2AsXml.nodes('/x') AS The(word)
    CROSS APPLY (SELECT The.word.value('.','varchar(max)') AS OneWord) AS w
    WHERE LEN(w.OneWord)>0
      AND NOT EXISTS(SELECT * FROM @forbiddenWords AS fw WHERE fw.item = w.OneWord)
)
,CommonWords AS
(
    -- Words that occur in both phrases.
    SELECT p1.OneWord
    FROM Phrase1AsFilteredWords AS p1
    INNER JOIN Phrase2AsFilteredWords AS p2 ON p1.OneWord=p2.OneWord
)
,WordCounter AS
(
    SELECT
     (SELECT COUNT(*) FROM Phrase1AsFilteredWords) AS CountPhrase1
    ,(SELECT COUNT(*) FROM Phrase2AsFilteredWords) AS CountPhrase2
    ,(SELECT COUNT(*) FROM CommonWords) AS CountCommon
)
SELECT WordCounter.*
      -- NULLIF yields NULL instead of a divide-by-zero error when a phrase has no significant words.
      ,(CountCommon*100) / NULLIF(CountPhrase1,0) AS Phrase1PC
      ,(CountCommon*100) / NULLIF(CountPhrase2,0) AS Phrase2PC
      -- TYPE + .value() returns the list un-escaped; ORDER BY makes it deterministic.
      ,STUFF((
                SELECT ', ' + OneWord
                FROM CommonWords
                ORDER BY OneWord
                FOR XML PATH(''), TYPE
             ).value('.','varchar(max)'),1,2,'') AS CommonWords
FROM WordCounter;
结果:
CountPhrase1 CountPhrase2 CountCommon Phrase1PC Phrase2PC CommonWords
10 11 4 40 36 i, text, where, words
一个提示:如果你要做多对多的比较,那么每次都重新计算会非常耗时。我建议你先一次性把所有短语预处理好,然后再比较这些预处理后的结果……
还有一个提示:如果你经常这样做并且你的短语没有改变,那么永久存储准备好的单词列表可能会很聪明。
编码愉快!