SQL 去除最长公共前缀
SQL Strip Longest Common Prefix
我有一个表 tbl1,其中有两列 col1 和 col2,存放的都是字符串:
col1 | col2
--------+--------
bar | foo
foo | foobar
bar1foo | bar2foo
对应SQL转储:
-- Sample table: two non-nullable string columns, each up to 20 characters,
-- stored as latin1 with the latin1_general_ci collation.
CREATE TABLE `tbl1` (
    `col1` VARCHAR(20) COLLATE latin1_general_ci NOT NULL,
    `col2` VARCHAR(20) COLLATE latin1_general_ci NOT NULL
)
    ENGINE = MyISAM
    DEFAULT CHARSET = latin1
    COLLATE = latin1_general_ci;

-- Sample rows, in order: no shared prefix; col1 is a full prefix of col2;
-- shared prefix 'bar' followed by differing tails.
INSERT INTO `tbl1`
    (`col1`, `col2`)
VALUES
    ('bar',     'foo'),
    ('foo',     'foobar'),
    ('bar1foo', 'bar2foo');
在大多数情况下,每一行的两个字符串有一个公共前缀。我需要一个能去除这些公共前缀的查询。预期结果:
bar | foo
| bar
1foo | 2foo
到目前为止我的方法:
-- Asker's attempt: compute where the two strings start to differ with integer
-- arithmetic, then cut both columns at that position.
-- Steps, innermost first (the same expression is repeated for each column):
--   HEX(REVERSE(col))            reverse the string and take its hex representation
--   CONV(..., 16, 10)            read that hex string as a number
--   ABS(a - b)                   absolute difference of the two numbers
--   HEX(...) + TRIM(TRAILING '0' FROM ...)
--                                back to hex, then drop trailing '0' digits
--   CEIL(LENGTH(...) / 2)        number of hex digits left, converted to a byte count
--   1 + GREATEST(LENGTH(col1), LENGTH(col2)) - <that count>
--                                start position handed to SUBSTR
-- NOTE(review): the author states this breaks for values wider than 64 bits and
-- for multi-byte characters; treat it as an illustration, not a solution.
SELECT
SUBSTR(`col1`, 1+GREATEST(LENGTH(`col1`), LENGTH(`col2`)) - CEIL(LENGTH(TRIM(TRAILING '0' FROM HEX(ABS(CONV(HEX(REVERSE(`col1`)),16,10) - CONV(HEX(REVERSE(`col2`)),16,10)))))/2)),
SUBSTR(`col2`, 1+GREATEST(LENGTH(`col1`), LENGTH(`col2`)) - CEIL(LENGTH(TRIM(TRAILING '0' FROM HEX(ABS(CONV(HEX(REVERSE(`col1`)),16,10) - CONV(HEX(REVERSE(`col2`)),16,10)))))/2))
FROM tbl1
简要解释:先把字符串反转(REVERSE),转换为整数(HEX 和 CONV),两者相减并取绝对值(- 和 ABS),再转回十六进制表示(HEX),截去末尾的 0(TRIM),然后用较长字符串的长度减去这个结果的长度(-、LENGTH 和 GREATEST),最后交给 SUBSTR 得到结果。
我的方法有问题:
- 不适用于长度超过 64 比特(即 8 个单字节字符)的字符串。
- 不适用于包含多字节字符的字符串。
- 又长又丑。
- 性能不佳。
此代码有效,尽管它 冗长且丑陋 并且(可能)性能不佳:
-- Strip the longest common prefix of col1 and col2 without recursive CTEs,
-- so it also runs on MySQL versions older than 8.0.
-- Approach: join every row to a list of candidate prefix lengths, keep the
-- largest length at which both prefixes are equal, then cut both columns there.
-- The prefix comparison uses the columns' collation.
select
  substring(t.col1, g.maxlen + 1) col1,
  substring(t.col2, g.maxlen + 1) col2
from tbl1 t inner join (
  -- One row per distinct (col1, col2) pair with its longest shared prefix length.
  select t.col1, t.col2,
    max(case when left(t.col1, tt.n) = left(t.col2, tt.n) then tt.n else 0 end) maxlen
  from tbl1 t inner join (
    -- Candidate prefix lengths 0..20; 20 is the declared width of the columns.
    -- n = 0 is required: without it a row whose col1 or col2 is the empty
    -- string matches no candidate and silently disappears from the result.
    select 0 n union all
    select 1 union all select 2 union all select 3 union all select 4 union all
    select 5 union all select 6 union all select 7 union all select 8 union all
    select 9 union all select 10 union all select 11 union all select 12 union all
    select 13 union all select 14 union all select 15 union all select 16 union all
    select 17 union all select 18 union all select 19 union all select 20
  -- char_length() counts characters, matching left(), which is also
  -- character-based; length() would count bytes.
  ) tt on least(char_length(t.col1), char_length(t.col2)) >= tt.n
  group by t.col1, t.col2
-- Joining back to tbl1 (instead of selecting from g directly) keeps
-- duplicate (col1, col2) rows that the group by collapsed.
) g on g.col1 = t.col1 and g.col2 = t.col2
参见demo。
对于 MySQL 8.0+,您可以使用递归 CTE(recursive CTE),这样就不需要事先知道列的长度:
-- Strip the longest common prefix of col1 and col2 (MySQL 8.0+).
-- The candidate prefix lengths are generated recursively, so the column
-- width does not have to be known in advance.
with
  -- Candidate prefix lengths 0..N, where N is the largest "shorter side"
  -- length found in the table. Starting at 0 (not 1) keeps rows where either
  -- column is the empty string; they would otherwise match no candidate in
  -- the join below and vanish from the result.
  recursive lengths as (
    select 0 n
    union all
    select n + 1
    from lengths
    -- char_length() counts characters, matching the character-based left().
    where n < (select max(least(char_length(col1), char_length(col2))) from tbl1)
  ),
  -- One row per distinct (col1, col2) pair with its longest shared prefix length.
  cte as (
    select t.col1, t.col2,
      max(case when left(t.col1, l.n) = left(t.col2, l.n) then l.n else 0 end) maxlen
    from tbl1 t inner join lengths l
    on least(char_length(t.col1), char_length(t.col2)) >= l.n
    group by t.col1, t.col2
  )
select
  substring(t.col1, c.maxlen + 1) col1,
  substring(t.col2, c.maxlen + 1) col2
-- Joining back to tbl1 keeps duplicate (col1, col2) rows collapsed by the group by.
from tbl1 t inner join cte c
on c.col1 = t.col1 and c.col2 = t.col2
参见demo。
结果:
| col1 | col2 |
| ---- | ---- |
| | bar |
| bar | foo |
| 1foo | 2foo |
遗憾的是,最通用、性能最好的方法可能是一个巨大的 case 表达式。不过,它只能处理不超过某个固定长度的前缀:
-- Strip the shared prefix using a prefix length chosen by a fixed ladder of
-- comparisons. Illustrative sketch only: it is not runnable as written.
select substr(col1, prefix_length + 1),
substr(col2, prefix_length + 1)
from (select tbl1.*,
-- Branches run from the longest candidate downwards, so the first branch that
-- matches gives the longest shared prefix (capped at 10 characters here).
(case when left(col1, 10) = left(col2, 10) then 10
when left(col1, 9) = left(col2, 9) then 9
-- Placeholder for the elided branches (presumably 8 down to 1); they must be
-- written out before this query can be executed.
. . .
else 0
end) as prefix_length
from tbl1
) t;
实际上,您可以使用递归 CTE 来完成此操作,这是最通用的方法:
-- Strip the longest common prefix of col1 and col2 (MySQL 8.0+) by peeling
-- one leading character off both strings per recursion step for as long as
-- the leading characters are equal.
with recursive cte as (
    -- Anchor: every row as-is, remembering the original values as the group key.
    select col1, col2, 1 as lev, col1 as orig_col1, col2 as orig_col2
    from tbl1
    union all
    select substr(col1, 2), substr(col2, 2), lev + 1, orig_col1, orig_col2
    from cte
    -- Stop once either remainder is empty. Without these guards two identical
    -- strings end up as '' and '', left('', 1) = left('', 1) stays true, and
    -- the recursion never terminates (MySQL aborts it at
    -- cte_max_recursion_depth with an error).
    where char_length(col1) > 0
      and char_length(col2) > 0
      and left(col1, 1) = left(col2, 1)
)
select col1, col2
from (
    -- The deepest level per original pair holds the fully stripped values.
    -- dense_rank (rather than row_number) keeps duplicate input rows, which
    -- share a partition and all reach the same deepest level.
    select cte.*,
           dense_rank() over (partition by orig_col1, orig_col2 order by lev desc) as seqnum
    from cte
) x
where seqnum = 1;
尽管性能肯定会比您的解决方案或庞大的 case
表达式差,但它可能还不错,您可能会发现它足以满足您的目的。
这里是一个包含这两种解决方案的 db<>fiddle。
我有一个表 tbl1,其中有两列 col1 和 col2,存放的都是字符串:
col1 | col2
--------+--------
bar | foo
foo | foobar
bar1foo | bar2foo
对应SQL转储:
-- Sample table: two non-nullable string columns, each up to 20 characters,
-- stored as latin1 with the latin1_general_ci collation.
CREATE TABLE `tbl1` (
    `col1` VARCHAR(20) COLLATE latin1_general_ci NOT NULL,
    `col2` VARCHAR(20) COLLATE latin1_general_ci NOT NULL
)
    ENGINE = MyISAM
    DEFAULT CHARSET = latin1
    COLLATE = latin1_general_ci;

-- Sample rows, in order: no shared prefix; col1 is a full prefix of col2;
-- shared prefix 'bar' followed by differing tails.
INSERT INTO `tbl1`
    (`col1`, `col2`)
VALUES
    ('bar',     'foo'),
    ('foo',     'foobar'),
    ('bar1foo', 'bar2foo');
在大多数情况下,每一行的两个字符串有一个公共前缀。我需要一个能去除这些公共前缀的查询。预期结果:
bar | foo
| bar
1foo | 2foo
到目前为止我的方法:
-- Asker's attempt: compute where the two strings start to differ with integer
-- arithmetic, then cut both columns at that position.
-- Steps, innermost first (the same expression is repeated for each column):
--   HEX(REVERSE(col))            reverse the string and take its hex representation
--   CONV(..., 16, 10)            read that hex string as a number
--   ABS(a - b)                   absolute difference of the two numbers
--   HEX(...) + TRIM(TRAILING '0' FROM ...)
--                                back to hex, then drop trailing '0' digits
--   CEIL(LENGTH(...) / 2)        number of hex digits left, converted to a byte count
--   1 + GREATEST(LENGTH(col1), LENGTH(col2)) - <that count>
--                                start position handed to SUBSTR
-- NOTE(review): the author states this breaks for values wider than 64 bits and
-- for multi-byte characters; treat it as an illustration, not a solution.
SELECT
SUBSTR(`col1`, 1+GREATEST(LENGTH(`col1`), LENGTH(`col2`)) - CEIL(LENGTH(TRIM(TRAILING '0' FROM HEX(ABS(CONV(HEX(REVERSE(`col1`)),16,10) - CONV(HEX(REVERSE(`col2`)),16,10)))))/2)),
SUBSTR(`col2`, 1+GREATEST(LENGTH(`col1`), LENGTH(`col2`)) - CEIL(LENGTH(TRIM(TRAILING '0' FROM HEX(ABS(CONV(HEX(REVERSE(`col1`)),16,10) - CONV(HEX(REVERSE(`col2`)),16,10)))))/2))
FROM tbl1
简要解释:先把字符串反转(REVERSE),转换为整数(HEX 和 CONV),两者相减并取绝对值(- 和 ABS),再转回十六进制表示(HEX),截去末尾的 0(TRIM),然后用较长字符串的长度减去这个结果的长度(-、LENGTH 和 GREATEST),最后交给 SUBSTR 得到结果。
我的方法有问题:
- 不适用于长度超过 64 比特(即 8 个单字节字符)的字符串。
- 不适用于包含多字节字符的字符串。
- 又长又丑。
- 性能不佳。
此代码有效,尽管它 冗长且丑陋 并且(可能)性能不佳:
-- Strip the longest common prefix of col1 and col2 without recursive CTEs,
-- so it also runs on MySQL versions older than 8.0.
-- Approach: join every row to a list of candidate prefix lengths, keep the
-- largest length at which both prefixes are equal, then cut both columns there.
-- The prefix comparison uses the columns' collation.
select
  substring(t.col1, g.maxlen + 1) col1,
  substring(t.col2, g.maxlen + 1) col2
from tbl1 t inner join (
  -- One row per distinct (col1, col2) pair with its longest shared prefix length.
  select t.col1, t.col2,
    max(case when left(t.col1, tt.n) = left(t.col2, tt.n) then tt.n else 0 end) maxlen
  from tbl1 t inner join (
    -- Candidate prefix lengths 0..20; 20 is the declared width of the columns.
    -- n = 0 is required: without it a row whose col1 or col2 is the empty
    -- string matches no candidate and silently disappears from the result.
    select 0 n union all
    select 1 union all select 2 union all select 3 union all select 4 union all
    select 5 union all select 6 union all select 7 union all select 8 union all
    select 9 union all select 10 union all select 11 union all select 12 union all
    select 13 union all select 14 union all select 15 union all select 16 union all
    select 17 union all select 18 union all select 19 union all select 20
  -- char_length() counts characters, matching left(), which is also
  -- character-based; length() would count bytes.
  ) tt on least(char_length(t.col1), char_length(t.col2)) >= tt.n
  group by t.col1, t.col2
-- Joining back to tbl1 (instead of selecting from g directly) keeps
-- duplicate (col1, col2) rows that the group by collapsed.
) g on g.col1 = t.col1 and g.col2 = t.col2
参见demo。
对于 MySQL 8.0+,您可以使用递归 CTE(recursive CTE),这样就不需要事先知道列的长度:
-- Strip the longest common prefix of col1 and col2 (MySQL 8.0+).
-- The candidate prefix lengths are generated recursively, so the column
-- width does not have to be known in advance.
with
  -- Candidate prefix lengths 0..N, where N is the largest "shorter side"
  -- length found in the table. Starting at 0 (not 1) keeps rows where either
  -- column is the empty string; they would otherwise match no candidate in
  -- the join below and vanish from the result.
  recursive lengths as (
    select 0 n
    union all
    select n + 1
    from lengths
    -- char_length() counts characters, matching the character-based left().
    where n < (select max(least(char_length(col1), char_length(col2))) from tbl1)
  ),
  -- One row per distinct (col1, col2) pair with its longest shared prefix length.
  cte as (
    select t.col1, t.col2,
      max(case when left(t.col1, l.n) = left(t.col2, l.n) then l.n else 0 end) maxlen
    from tbl1 t inner join lengths l
    on least(char_length(t.col1), char_length(t.col2)) >= l.n
    group by t.col1, t.col2
  )
select
  substring(t.col1, c.maxlen + 1) col1,
  substring(t.col2, c.maxlen + 1) col2
-- Joining back to tbl1 keeps duplicate (col1, col2) rows collapsed by the group by.
from tbl1 t inner join cte c
on c.col1 = t.col1 and c.col2 = t.col2
参见demo。
结果:
| col1 | col2 |
| ---- | ---- |
| | bar |
| bar | foo |
| 1foo | 2foo |
遗憾的是,最通用、性能最好的方法可能是一个巨大的 case 表达式。不过,它只能处理不超过某个固定长度的前缀:
-- Strip the shared prefix using a prefix length chosen by a fixed ladder of
-- comparisons. Illustrative sketch only: it is not runnable as written.
select substr(col1, prefix_length + 1),
substr(col2, prefix_length + 1)
from (select tbl1.*,
-- Branches run from the longest candidate downwards, so the first branch that
-- matches gives the longest shared prefix (capped at 10 characters here).
(case when left(col1, 10) = left(col2, 10) then 10
when left(col1, 9) = left(col2, 9) then 9
-- Placeholder for the elided branches (presumably 8 down to 1); they must be
-- written out before this query can be executed.
. . .
else 0
end) as prefix_length
from tbl1
) t;
实际上,您可以使用递归 CTE 来完成此操作,这是最通用的方法:
-- Strip the longest common prefix of col1 and col2 (MySQL 8.0+) by peeling
-- one leading character off both strings per recursion step for as long as
-- the leading characters are equal.
with recursive cte as (
    -- Anchor: every row as-is, remembering the original values as the group key.
    select col1, col2, 1 as lev, col1 as orig_col1, col2 as orig_col2
    from tbl1
    union all
    select substr(col1, 2), substr(col2, 2), lev + 1, orig_col1, orig_col2
    from cte
    -- Stop once either remainder is empty. Without these guards two identical
    -- strings end up as '' and '', left('', 1) = left('', 1) stays true, and
    -- the recursion never terminates (MySQL aborts it at
    -- cte_max_recursion_depth with an error).
    where char_length(col1) > 0
      and char_length(col2) > 0
      and left(col1, 1) = left(col2, 1)
)
select col1, col2
from (
    -- The deepest level per original pair holds the fully stripped values.
    -- dense_rank (rather than row_number) keeps duplicate input rows, which
    -- share a partition and all reach the same deepest level.
    select cte.*,
           dense_rank() over (partition by orig_col1, orig_col2 order by lev desc) as seqnum
    from cte
) x
where seqnum = 1;
尽管性能肯定会比您的解决方案或庞大的 case
表达式差,但它可能还不错,您可能会发现它足以满足您的目的。
这里是一个包含这两种解决方案的 db<>fiddle。