比较 2 个文本列并使用 sql 在第三个单元格中显示差异
compare 2 text columns and show difference in the third cell using sql
我正在尝试比较两列,并且只需要得到它们之间的差异部分,例如:
select * from table1
Column_1 column_2
---------------- ------------------
Swetha working Swetha is working in Chennai
Raju 10th Raju is studying 10th std
ranjith Ranjith played yesterday
how to play how to play Cricket
My name is my name is john
输出:
如果匹配的单词出现在句子中间,也应该把它删除,如第 1 行和第 2 行所示:
Column_1 column_2 column_3
---------------- ------------------ ------------------------
Swetha working Swetha is working in Chennai is in Chennai
Raju 10th Raju is studying 10th std is studying std
ranjith Ranjith played yesterday played yesterday
how to play how to play Cricket Cricket
My name is my name is john john
这比你之前的问题复杂多了。您可以将第一列分解为单词,然后在第二列中单独替换它们。不过,要做到这一点,您需要一个递归 CTE:
-- Remove every whole word of col1 from col2, one word per recursion step.
with col1_words as (
    -- One row per (source row, col1 word). The words are numbered so the
    -- recursion can visit them one after another; max_seqnum marks the last one.
    select
        src.id,
        src.col1,
        src.col2,
        parts.word,
        parts.seqnum,
        max(parts.seqnum) over (partition by src.id) as max_seqnum
    from t as src
    cross apply (
        select
            piece.value as word,
            row_number() over (order by (select null)) as seqnum
        from string_split(src.col1, ' ') as piece
    ) as parts
),
stripped as (
    -- Anchor: pad col2 with a space on each side so that its first and last
    -- words are space-delimited too, then drop the first col1 word.
    select
        cw.id,
        cw.col1,
        cw.col2,
        replace(' ' + cw.col2 + ' ', ' ' + cw.word + ' ', ' ') as result,
        cw.word,
        cw.seqnum,
        cw.max_seqnum
    from col1_words as cw
    where cw.seqnum = 1

    union all

    -- Step: drop the next col1 word from the running result.
    select
        prev.id,
        prev.col1,
        prev.col2,
        replace(prev.result, ' ' + nxt.word + ' ', ' '),
        nxt.word,
        nxt.seqnum,
        prev.max_seqnum
    from stripped as prev
    inner join col1_words as nxt
        on nxt.id = prev.id
       and nxt.seqnum = prev.seqnum + 1
)
-- Keep only the row produced after the last col1 word was processed.
select
    done.id,
    done.col1,
    done.col2,
    ltrim(rtrim(done.result)) as result
from stripped as done
where done.seqnum = done.max_seqnum
order by done.id;
这里是一个 db<>fiddle。
我添加了一个 id,这样每一行都能被唯一标识。如果您的 SQL Server 版本没有内置的 string_split() 函数,您可以轻松找到具有相同功能的自定义实现。
它使用的一个技巧是处理第二列中的第一个词和最后一个词。该代码在开头和结尾添加了空格。这样,字符串中的所有单词都被空格包围,从而更容易只替换完整的单词。
SQL Server 2016 确实提供了 string_split。此方法在第 2 列拆分出的每个单词两侧各附加一个额外的空格。
数据
-- Rebuild the #strings temp table and load the four sample rows.
DROP TABLE IF EXISTS #strings;
GO

CREATE TABLE #strings (
    Id       int,
    Column_1 varchar(200),
    Column_2 varchar(200)
);
GO

INSERT INTO #strings (Id, Column_1, Column_2)
VALUES
    (1, 'Swetha',         'Swetha is working in Chennai'),
    (2, 'Raju',           'Raju is studying 10 std'),
    (3, 'Swetha working', 'Swetha is working in Chennai'),
    (4, 'Raju 10th',      'Raju is studying 10th std');
查询
-- For each #strings row, list the words of Column_2 that are not words of the
-- SAME row's Column_1, in the order they first appear in Column_2.
--
-- Fixes relative to the earlier version of this query:
--   * c1_cte now carries Id and is matched per row. Before, a word from any
--     row's Column_1 was removed from every row (Id 1 lost 'working', which
--     only occurs in Id 3's Column_1). With per-row matching Id 1 yields
--     'is working in Chennai'.
--   * The filter "Column_2 not like '% %'" dropped every multi-word row; it is
--     replaced by a filter that only skips empty tokens from repeated spaces.
--   * FOR XML PATH uses TYPE/.value() so '&', '<' and '>' are not entitized.
declare
    @add_delim char(1) = ' ';

with
c1_cte(Id, split_str) as (
    -- Words of Column_1, tagged with the row they belong to.
    select
        st.Id,
        ltrim(rtrim(s.[value]))
    from #strings st
    cross apply string_split(st.Column_1, ' ') s
),
c2_cte(Id, ndx, split_str) as (
    -- Words of Column_2 plus the position of their first whole-word
    -- occurrence; both sides are padded with the delimiter so the first and
    -- last words match as well. ndx is used only to order the output.
    select
        st.Id,
        charindex(@add_delim + s.[value] + @add_delim,
                  @add_delim + st.Column_2 + @add_delim),
        s.[value]
    from #strings st
    cross apply string_split(st.Column_2, ' ') s
    where s.[value] <> ''
)
select
    c2.Id,
    stuff((select ' ' + c.split_str
           from c2_cte c
           where c.Id = c2.Id
             and not exists (select 1
                             from c1_cte c1
                             where c1.Id = c.Id
                               and c1.split_str = c.split_str)
           order by c.ndx
           for xml path(''), type).value('.', 'varchar(max)'), 1, 1, '') [new_str]
from c2_cte c2
group by c2.Id;
结果
Id new_str
1 is in Chennai
2 is studying 10 std
3 is in Chennai
4 is studying std
这是使用 STRING_SPLIT 和 STRING_AGG 的解决方案。
DBFIDDLE 可运行示例链接
-- For every dbo.Strings row, build [difference]: the words of Column_2 that do
-- not occur as words of that row's Column_1, separated by single spaces.
--
-- The split is applied directly to the current row (s) instead of going
-- through a CTE re-joined on the Column_1/Column_2 text. That join repeated
-- the words whenever two rows held the same text. RTRIM removes the trailing
-- separator left by the "word + ' '" concatenation.
-- Note: STRING_SPLIT does not guarantee the order of the words it returns.
SELECT s.*
    ,RTRIM((
        SELECT w.value + ' ' AS [text()]
        FROM STRING_SPLIT(s.Column_2, ' ') AS w
        WHERE NOT EXISTS (
                SELECT 1
                FROM STRING_SPLIT(s.Column_1, ' ') AS c1
                WHERE c1.value = w.value
                )
        FOR XML PATH('')
            ,TYPE
        ).value('.', 'NVARCHAR(MAX)')) AS [difference]
FROM dbo.Strings AS s;
对于支持 STRING_AGG 的 SQL Server 2017 及更高版本:
-- Aggregate, per distinct (Column_1, Column_2) pair, the Column_2 words that
-- are not among the Column_1 words.
;WITH leftover_words AS (
    -- One row per Column_2 word that is absent from the same row's Column_1.
    SELECT
        src.Column_1,
        src.Column_2,
        piece.value AS word
    FROM dbo.Strings AS src
    CROSS APPLY STRING_SPLIT(src.Column_2, ' ') AS piece
    WHERE piece.value NOT IN (
        SELECT known.value
        FROM STRING_SPLIT(src.Column_1, ' ') AS known
    )
)
SELECT
    lw.Column_1,
    lw.Column_2,
    STRING_AGG(lw.word, ' ')
FROM leftover_words AS lw
GROUP BY
    lw.Column_1,
    lw.Column_2
结果:
-- Lists, per input row, the tokens of column_2 that are not whole words of
-- column_1 (compared case-insensitively via UPPER).
--
-- Fixes relative to the earlier version of this query:
--   * The match is now on whole words: both the token and column_1 are padded
--     with a space before CHARINDEX. The plain substring search found 'in'
--     inside 'working' and wrongly dropped it.
--   * Sample row 3 read 'Rantith'; the question's data is 'Ranjith'.
-- NOTE(review): TOKEN(str, delim, n) is not defined in this snippet; it is
-- presumably the engine's "n-th delimited token" function returning '' past
-- the last token - confirm for your engine.
WITH
-- your input (column_3 holds the expected answer and is not used below)
input(column_1,column_2,column_3) AS (
          SELECT 'Swetha working','Swetha is working in Chennai','is in Chennai'
UNION ALL SELECT 'Raju 10th','Raju is studying 10th std','is studying std'
UNION ALL SELECT 'ranjith','Ranjith played yesterday','played yesterday'
UNION ALL SELECT 'how to play','how to play Cricket','Cricket'
UNION ALL SELECT 'My name is','my name is john','john'
)
,
-- need a series of integers: one per token position, so at most 5 tokens of
-- column_2 are examined
-- you can also try to play with the STRING_SPLIT() function
i(i) AS (
          SELECT 1
UNION ALL SELECT 2
UNION ALL SELECT 3
UNION ALL SELECT 4
UNION ALL SELECT 5
)
,
-- you can also try to play with the STRING_SPLIT() function
unfound_tokens AS (
SELECT
  i
, column_1
, column_2
, TOKEN(column_2,' ',i) AS token
FROM input CROSS JOIN i
WHERE TOKEN(column_2,' ',i) <> ''
AND CHARINDEX(
      -- ' TOKEN ' searched in ' COLUMN_1 ' => whole-word match only
      CONCAT(CONCAT(' ', UPPER(TOKEN(column_2,' ',i))), ' ')
    , CONCAT(CONCAT(' ', UPPER(column_1)), ' ')
    ) = 0
)
SELECT
  column_1
, column_2
, STRING_AGG(token ,' ') AS column_3
FROM unfound_tokens
GROUP BY
  column_1
, column_2
-- out column_1 | column_2 | column_3
-- out ----------------+------------------------------+--------------------------
-- out My name is | my name is john | john
-- out Swetha working | Swetha is working in Chennai | is in Chennai
-- out how to play | how to play Cricket | Cricket
-- out Raju 10th | Raju is studying 10th std | is studying std
-- out ranjith | Ranjith played yesterday | played yesterday
我不确定使用 STRING_AGG 或 STRING_SPLIT 的结果是否会保留单词的顺序...
只需查看给出不同排序的查询:
-- Per #strings row, aggregate the Column_2 words that are not Column_1 words
-- of the same Id. EXCEPT returns a distinct set, so a word repeated in
-- Column_2 is listed once.
WITH
col1_words AS (
    SELECT
        src.Id,
        piece.value AS word
    FROM #strings AS src
    CROSS APPLY STRING_SPLIT(src.Column_1, ' ') AS piece
),
col2_words AS (
    SELECT
        src.Id,
        piece.value AS word
    FROM #strings AS src
    CROSS APPLY STRING_SPLIT(src.Column_2, ' ') AS piece
),
extra_words AS (
    SELECT c2.Id, c2.word
    FROM col2_words AS c2
    EXCEPT
    SELECT c1.Id, c1.word
    FROM col1_words AS c1
)
SELECT
    extra.Id,
    src.Column_1,
    src.Column_2,
    STRING_AGG(extra.word, ' ')
FROM #strings AS src
INNER JOIN extra_words AS extra
    ON extra.Id = src.Id
GROUP BY
    extra.Id,
    src.Column_1,
    src.Column_2;
您需要用非常大量的数据进行测试,才能看出上面给出的查询是否会产生排序不一致之类的副作用(我很确定,由于并行执行,顺序不会保持一致……)。
因此,保持排序一致的唯一方法是编写一个递归查询,为句子中的每个单词附加一个表示其位置的序号值……
我正在尝试比较两列,并且只需要得到它们之间的差异部分,例如:
select * from table1
Column_1 column_2
---------------- ------------------
Swetha working Swetha is working in Chennai
Raju 10th Raju is studying 10th std
ranjith Ranjith played yesterday
how to play how to play Cricket
My name is my name is john
输出:
如果匹配的单词出现在句子中间,也应该把它删除,如第 1 行和第 2 行所示:
Column_1 column_2 column_3
---------------- ------------------ ------------------------
Swetha working Swetha is working in Chennai is in Chennai
Raju 10th Raju is studying 10th std is studying std
ranjith Ranjith played yesterday played yesterday
how to play how to play Cricket Cricket
My name is my name is john john
这比你之前的问题复杂多了。您可以将第一列分解为单词,然后在第二列中单独替换它们。不过,要做到这一点,您需要一个递归 CTE:
-- Remove every whole word of col1 from col2, one word per recursion step.
with col1_words as (
    -- One row per (source row, col1 word). The words are numbered so the
    -- recursion can visit them one after another; max_seqnum marks the last one.
    select
        src.id,
        src.col1,
        src.col2,
        parts.word,
        parts.seqnum,
        max(parts.seqnum) over (partition by src.id) as max_seqnum
    from t as src
    cross apply (
        select
            piece.value as word,
            row_number() over (order by (select null)) as seqnum
        from string_split(src.col1, ' ') as piece
    ) as parts
),
stripped as (
    -- Anchor: pad col2 with a space on each side so that its first and last
    -- words are space-delimited too, then drop the first col1 word.
    select
        cw.id,
        cw.col1,
        cw.col2,
        replace(' ' + cw.col2 + ' ', ' ' + cw.word + ' ', ' ') as result,
        cw.word,
        cw.seqnum,
        cw.max_seqnum
    from col1_words as cw
    where cw.seqnum = 1

    union all

    -- Step: drop the next col1 word from the running result.
    select
        prev.id,
        prev.col1,
        prev.col2,
        replace(prev.result, ' ' + nxt.word + ' ', ' '),
        nxt.word,
        nxt.seqnum,
        prev.max_seqnum
    from stripped as prev
    inner join col1_words as nxt
        on nxt.id = prev.id
       and nxt.seqnum = prev.seqnum + 1
)
-- Keep only the row produced after the last col1 word was processed.
select
    done.id,
    done.col1,
    done.col2,
    ltrim(rtrim(done.result)) as result
from stripped as done
where done.seqnum = done.max_seqnum
order by done.id;
这里是一个 db<>fiddle。
我添加了一个 id,这样每一行都能被唯一标识。如果您的 SQL Server 版本没有内置的 string_split() 函数,您可以轻松找到具有相同功能的自定义实现。
它使用的一个技巧是处理第二列中的第一个词和最后一个词。该代码在开头和结尾添加了空格。这样,字符串中的所有单词都被空格包围,从而更容易只替换完整的单词。
SQL Server 2016 确实提供了 string_split。此方法在第 2 列拆分出的每个单词两侧各附加一个额外的空格。
数据
-- Rebuild the #strings temp table and load the four sample rows.
DROP TABLE IF EXISTS #strings;
GO

CREATE TABLE #strings (
    Id       int,
    Column_1 varchar(200),
    Column_2 varchar(200)
);
GO

INSERT INTO #strings (Id, Column_1, Column_2)
VALUES
    (1, 'Swetha',         'Swetha is working in Chennai'),
    (2, 'Raju',           'Raju is studying 10 std'),
    (3, 'Swetha working', 'Swetha is working in Chennai'),
    (4, 'Raju 10th',      'Raju is studying 10th std');
查询
-- For each #strings row, list the words of Column_2 that are not words of the
-- SAME row's Column_1, in the order they first appear in Column_2.
--
-- Fixes relative to the earlier version of this query:
--   * c1_cte now carries Id and is matched per row. Before, a word from any
--     row's Column_1 was removed from every row (Id 1 lost 'working', which
--     only occurs in Id 3's Column_1). With per-row matching Id 1 yields
--     'is working in Chennai'.
--   * The filter "Column_2 not like '% %'" dropped every multi-word row; it is
--     replaced by a filter that only skips empty tokens from repeated spaces.
--   * FOR XML PATH uses TYPE/.value() so '&', '<' and '>' are not entitized.
declare
    @add_delim char(1) = ' ';

with
c1_cte(Id, split_str) as (
    -- Words of Column_1, tagged with the row they belong to.
    select
        st.Id,
        ltrim(rtrim(s.[value]))
    from #strings st
    cross apply string_split(st.Column_1, ' ') s
),
c2_cte(Id, ndx, split_str) as (
    -- Words of Column_2 plus the position of their first whole-word
    -- occurrence; both sides are padded with the delimiter so the first and
    -- last words match as well. ndx is used only to order the output.
    select
        st.Id,
        charindex(@add_delim + s.[value] + @add_delim,
                  @add_delim + st.Column_2 + @add_delim),
        s.[value]
    from #strings st
    cross apply string_split(st.Column_2, ' ') s
    where s.[value] <> ''
)
select
    c2.Id,
    stuff((select ' ' + c.split_str
           from c2_cte c
           where c.Id = c2.Id
             and not exists (select 1
                             from c1_cte c1
                             where c1.Id = c.Id
                               and c1.split_str = c.split_str)
           order by c.ndx
           for xml path(''), type).value('.', 'varchar(max)'), 1, 1, '') [new_str]
from c2_cte c2
group by c2.Id;
结果
Id new_str
1 is in Chennai
2 is studying 10 std
3 is in Chennai
4 is studying std
这是使用 STRING_SPLIT 和 STRING_AGG 的解决方案。
DBFIDDLE 可运行示例链接
-- For every dbo.Strings row, build [difference]: the words of Column_2 that do
-- not occur as words of that row's Column_1, separated by single spaces.
--
-- The split is applied directly to the current row (s) instead of going
-- through a CTE re-joined on the Column_1/Column_2 text. That join repeated
-- the words whenever two rows held the same text. RTRIM removes the trailing
-- separator left by the "word + ' '" concatenation.
-- Note: STRING_SPLIT does not guarantee the order of the words it returns.
SELECT s.*
    ,RTRIM((
        SELECT w.value + ' ' AS [text()]
        FROM STRING_SPLIT(s.Column_2, ' ') AS w
        WHERE NOT EXISTS (
                SELECT 1
                FROM STRING_SPLIT(s.Column_1, ' ') AS c1
                WHERE c1.value = w.value
                )
        FOR XML PATH('')
            ,TYPE
        ).value('.', 'NVARCHAR(MAX)')) AS [difference]
FROM dbo.Strings AS s;
对于支持 STRING_AGG 的 SQL Server 2017 及更高版本:
-- Aggregate, per distinct (Column_1, Column_2) pair, the Column_2 words that
-- are not among the Column_1 words.
;WITH leftover_words AS (
    -- One row per Column_2 word that is absent from the same row's Column_1.
    SELECT
        src.Column_1,
        src.Column_2,
        piece.value AS word
    FROM dbo.Strings AS src
    CROSS APPLY STRING_SPLIT(src.Column_2, ' ') AS piece
    WHERE piece.value NOT IN (
        SELECT known.value
        FROM STRING_SPLIT(src.Column_1, ' ') AS known
    )
)
SELECT
    lw.Column_1,
    lw.Column_2,
    STRING_AGG(lw.word, ' ')
FROM leftover_words AS lw
GROUP BY
    lw.Column_1,
    lw.Column_2
结果:
-- Lists, per input row, the tokens of column_2 that are not whole words of
-- column_1 (compared case-insensitively via UPPER).
--
-- Fixes relative to the earlier version of this query:
--   * The match is now on whole words: both the token and column_1 are padded
--     with a space before CHARINDEX. The plain substring search found 'in'
--     inside 'working' and wrongly dropped it.
--   * Sample row 3 read 'Rantith'; the question's data is 'Ranjith'.
-- NOTE(review): TOKEN(str, delim, n) is not defined in this snippet; it is
-- presumably the engine's "n-th delimited token" function returning '' past
-- the last token - confirm for your engine.
WITH
-- your input (column_3 holds the expected answer and is not used below)
input(column_1,column_2,column_3) AS (
          SELECT 'Swetha working','Swetha is working in Chennai','is in Chennai'
UNION ALL SELECT 'Raju 10th','Raju is studying 10th std','is studying std'
UNION ALL SELECT 'ranjith','Ranjith played yesterday','played yesterday'
UNION ALL SELECT 'how to play','how to play Cricket','Cricket'
UNION ALL SELECT 'My name is','my name is john','john'
)
,
-- need a series of integers: one per token position, so at most 5 tokens of
-- column_2 are examined
-- you can also try to play with the STRING_SPLIT() function
i(i) AS (
          SELECT 1
UNION ALL SELECT 2
UNION ALL SELECT 3
UNION ALL SELECT 4
UNION ALL SELECT 5
)
,
-- you can also try to play with the STRING_SPLIT() function
unfound_tokens AS (
SELECT
  i
, column_1
, column_2
, TOKEN(column_2,' ',i) AS token
FROM input CROSS JOIN i
WHERE TOKEN(column_2,' ',i) <> ''
AND CHARINDEX(
      -- ' TOKEN ' searched in ' COLUMN_1 ' => whole-word match only
      CONCAT(CONCAT(' ', UPPER(TOKEN(column_2,' ',i))), ' ')
    , CONCAT(CONCAT(' ', UPPER(column_1)), ' ')
    ) = 0
)
SELECT
  column_1
, column_2
, STRING_AGG(token ,' ') AS column_3
FROM unfound_tokens
GROUP BY
  column_1
, column_2
-- out column_1 | column_2 | column_3
-- out ----------------+------------------------------+--------------------------
-- out My name is | my name is john | john
-- out Swetha working | Swetha is working in Chennai | is in Chennai
-- out how to play | how to play Cricket | Cricket
-- out Raju 10th | Raju is studying 10th std | is studying std
-- out ranjith | Ranjith played yesterday | played yesterday
我不确定使用 STRING_AGG 或 STRING_SPLIT 的结果是否会保留单词的顺序...
只需查看给出不同排序的查询:
-- Per #strings row, aggregate the Column_2 words that are not Column_1 words
-- of the same Id. EXCEPT returns a distinct set, so a word repeated in
-- Column_2 is listed once.
WITH
col1_words AS (
    SELECT
        src.Id,
        piece.value AS word
    FROM #strings AS src
    CROSS APPLY STRING_SPLIT(src.Column_1, ' ') AS piece
),
col2_words AS (
    SELECT
        src.Id,
        piece.value AS word
    FROM #strings AS src
    CROSS APPLY STRING_SPLIT(src.Column_2, ' ') AS piece
),
extra_words AS (
    SELECT c2.Id, c2.word
    FROM col2_words AS c2
    EXCEPT
    SELECT c1.Id, c1.word
    FROM col1_words AS c1
)
SELECT
    extra.Id,
    src.Column_1,
    src.Column_2,
    STRING_AGG(extra.word, ' ')
FROM #strings AS src
INNER JOIN extra_words AS extra
    ON extra.Id = src.Id
GROUP BY
    extra.Id,
    src.Column_1,
    src.Column_2;
您需要用非常大量的数据进行测试,才能看出上面给出的查询是否会产生排序不一致之类的副作用(我很确定,由于并行执行,顺序不会保持一致……)。
因此,保持排序一致的唯一方法是编写一个递归查询,为句子中的每个单词附加一个表示其位置的序号值……