SQL 分组 interescting/overlapping 行
SQL grouping interescting/overlapping rows
我在 Postgres 中有以下 table 两列 a_sno
和 b_sno
.
有重叠数据
create table data
( a_sno integer not null,
b_sno integer not null,
PRIMARY KEY (a_sno,b_sno)
);
insert into data (a_sno,b_sno) values
( 4, 5 )
, ( 5, 4 )
, ( 5, 6 )
, ( 6, 5 )
, ( 6, 7 )
, ( 7, 6 )
, ( 9, 10)
, ( 9, 13)
, (10, 9 )
, (13, 9 )
, (10, 13)
, (13, 10)
, (10, 14)
, (14, 10)
, (13, 14)
, (14, 13)
, (11, 15)
, (15, 11);
从前 6 行数据值 4、5、6 和 7 可以看出,两列 intersects/overlaps 需要划分到一个组中。第 7-16 行和第 17-18 行也将分别标记为第 2 组和第 3 组。
结果输出应如下所示:
group | value
------+------
1 | 4
1 | 5
1 | 6
1 | 7
2 | 9
2 | 10
2 | 13
2 | 14
3 | 11
3 | 15
这是一个开始,可能会提供一些关于方法的想法。递归查询从每条记录的 a_sno
开始,然后尝试沿着 b_sno
的路径,直到到达终点或形成循环。路径由 sno
整数数组表示。
unnest
函数会将数组分成行,因此 sno
值映射到路径数组,例如:
4, {6, 5, 4}
将为数组中的每个值转换为一行:
4, 6
4, 5
4, 4
array_agg
然后通过将值聚合回一个路径来反转操作,但删除重复项和排序。
现在每个a_sno
都与一条路径相关联,路径构成了分组。 dense_rank
可用于将分组(集群)映射到数字。
SELECT array_agg(DISTINCT map ORDER BY map) AS cluster
,sno
FROM ( WITH RECURSIVE x(sno, path, cycle) AS (
SELECT a_sno, ARRAY[a_sno], false FROM data
UNION ALL
SELECT b_sno, path || b_sno, b_sno = ANY(path)
FROM data, x
WHERE a_sno = x.sno
AND NOT cycle
)
SELECT sno, unnest(path) AS map FROM x ORDER BY 1
) y
GROUP BY sno
ORDER BY 1, 2
输出:
cluster | sno
--------------+-----
{4,5,6,7} | 4
{4,5,6,7} | 5
{4,5,6,7} | 6
{4,5,6,7} | 7
{9,10,13,14} | 9
{9,10,13,14} | 10
{9,10,13,14} | 13
{9,10,13,14} | 14
{11,15} | 11
{11,15} | 15
(10 rows)
再绕一次排名:
SELECT dense_rank() OVER(order by cluster) AS rank
,sno
FROM (
SELECT array_agg(DISTINCT map ORDER BY map) AS cluster
,sno
FROM ( WITH RECURSIVE x(sno, path, cycle) AS (
SELECT a_sno, ARRAY[a_sno], false FROM data
UNION ALL
SELECT b_sno, path || b_sno, b_sno = ANY(path)
FROM data, x
WHERE a_sno = x.sno
AND NOT cycle
)
SELECT sno, unnest(path) AS map FROM x ORDER BY 1
) y
GROUP BY sno
ORDER BY 1, 2
) z
输出:
rank | sno
------+-----
1 | 4
1 | 5
1 | 6
1 | 7
2 | 9
2 | 10
2 | 13
2 | 14
3 | 11
3 | 15
(10 rows)
我想说另外一种方法,可能有用,你可以分两步完成:
1. 每组取 max(sno)
:
select q.sno,
row_number() over(order by q.sno) gn
from(
select distinct d.a_sno sno
from data d
where not exists (
select b_sno
from data
where b_sno=d.a_sno
and a_sno>d.a_sno
)
)q
结果:
sno gn
7 1
14 2
15 3
2.使用recursive cte
查找群组中的所有相关成员:
with recursive cte(sno,gn,path,cycle)as(
select q.sno,
row_number() over(order by q.sno) gn,
array[q.sno],false
from(
select distinct d.a_sno sno
from data d
where not exists (
select b_sno
from data
where b_sno=d.a_sno
and a_sno>d.a_sno
)
)q
union all
select d.a_sno,c.gn,
d.a_sno || c.path,
d.a_sno=any(c.path)
from data d
join cte c on d.b_sno=c.sno
where not cycle
)
select distinct gn,sno from cte
order by gn,sno
结果:
gn sno
1 4
1 5
1 6
1 7
2 9
2 10
2 13
2 14
3 11
3 15
here is the demo 我做了什么。
假设所有对都存在于它们的镜像组合中 (4,5)
和 (5,4)
。但是以下解决方案在没有镜像复制的情况下也能正常工作。
简单案例
所有连接都可以按 单个升序排列 并且像我在 fiddle 中添加的复杂情况是不可能的,我们可以使用这个解决方案而不重复rCTE:
我从每组最小值 a_sno
开始,最小值关联 b_sno
:
SELECT row_number() OVER (ORDER BY a_sno) AS grp
, a_sno, min(b_sno) AS b_sno
FROM data d
WHERE a_sno < b_sno
AND NOT EXISTS (
SELECT 1 FROM data
WHERE b_sno = d.a_sno
AND a_sno < b_sno
)
GROUP BY a_sno;
这只需要一个查询级别,因为可以在聚合上构建 window 函数:
- Get the distinct sum of a joined table column
结果:
grp a_sno b_sno
1 4 5
2 9 10
3 11 15
我避免分支和重复(倍增)行 - 长链可能 多 更昂贵。我在相关子查询中使用 ORDER BY b_sno LIMIT 1
来使它在递归 CTE 中运行。
性能的关键是匹配索引,它已经由 PK 约束提供 PRIMARY KEY (a_sno,b_sno)
:而不是相反 (b_sno, a_sno)
:
WITH RECURSIVE t AS (
SELECT row_number() OVER (ORDER BY d.a_sno) AS grp
, a_sno, min(b_sno) AS b_sno -- the smallest one
FROM data d
WHERE a_sno < b_sno
AND NOT EXISTS (
SELECT 1 FROM data
WHERE b_sno = d.a_sno
AND a_sno < b_sno
)
GROUP BY a_sno
)
, cte AS (
SELECT grp, b_sno AS sno FROM t
UNION ALL
SELECT c.grp
, (SELECT b_sno -- correlated subquery
FROM data
WHERE a_sno = c.sno
AND a_sno < b_sno
ORDER BY b_sno
LIMIT 1)
FROM cte c
WHERE c.sno IS NOT NULL
)
SELECT * FROM cte
WHERE sno IS NOT NULL -- eliminate row with NULL
UNION ALL -- no duplicates
SELECT grp, a_sno FROM t
ORDER BY grp, sno;
不太简单的案例
所有节点都可以从根(最小sno
)的一个或多个分支按升序到达。
这次把全部取大sno
,去重可能被多次访问的节点,末尾有UNION
:
WITH RECURSIVE t AS (
SELECT rank() OVER (ORDER BY d.a_sno) AS grp
, a_sno, b_sno -- get all rows for smallest a_sno
FROM data d
WHERE a_sno < b_sno
AND NOT EXISTS (
SELECT 1 FROM data
WHERE b_sno = d.a_sno
AND a_sno < b_sno
)
)
, cte AS (
SELECT grp, b_sno AS sno FROM t
UNION ALL
SELECT c.grp, d.b_sno
FROM cte c
JOIN data d ON d.a_sno = c.sno
AND d.a_sno < d.b_sno -- join to all connected rows
)
SELECT grp, sno FROM cte
UNION -- eliminate duplicates
SELECT grp, a_sno FROM t -- add first rows
ORDER BY grp, sno;
与第一个解决方案不同,我们在这里没有得到最后一行 NULL(由相关子查询引起)。
两者都应该表现得很好 - 特别是对于长链/许多分支。结果如愿:
SQL Fiddle(添加行以展示难度)。
无向图
如果存在升序遍历无法从根到达的局部极小值,上述解决方案将不起作用。在这种情况下考虑 。
我在 Postgres 中有以下 table 两列 a_sno
和 b_sno
.
create table data
( a_sno integer not null,
b_sno integer not null,
PRIMARY KEY (a_sno,b_sno)
);
insert into data (a_sno,b_sno) values
( 4, 5 )
, ( 5, 4 )
, ( 5, 6 )
, ( 6, 5 )
, ( 6, 7 )
, ( 7, 6 )
, ( 9, 10)
, ( 9, 13)
, (10, 9 )
, (13, 9 )
, (10, 13)
, (13, 10)
, (10, 14)
, (14, 10)
, (13, 14)
, (14, 13)
, (11, 15)
, (15, 11);
从前 6 行数据值 4、5、6 和 7 可以看出,两列 intersects/overlaps 需要划分到一个组中。第 7-16 行和第 17-18 行也将分别标记为第 2 组和第 3 组。
结果输出应如下所示:
group | value
------+------
1 | 4
1 | 5
1 | 6
1 | 7
2 | 9
2 | 10
2 | 13
2 | 14
3 | 11
3 | 15
这是一个开始,可能会提供一些关于方法的想法。递归查询从每条记录的 a_sno
开始,然后尝试沿着 b_sno
的路径,直到到达终点或形成循环。路径由 sno
整数数组表示。
unnest
函数会将数组分成行,因此 sno
值映射到路径数组,例如:
4, {6, 5, 4}
将为数组中的每个值转换为一行:
4, 6
4, 5
4, 4
array_agg
然后通过将值聚合回一个路径来反转操作,但删除重复项和排序。
现在每个a_sno
都与一条路径相关联,路径构成了分组。 dense_rank
可用于将分组(集群)映射到数字。
SELECT array_agg(DISTINCT map ORDER BY map) AS cluster
,sno
FROM ( WITH RECURSIVE x(sno, path, cycle) AS (
SELECT a_sno, ARRAY[a_sno], false FROM data
UNION ALL
SELECT b_sno, path || b_sno, b_sno = ANY(path)
FROM data, x
WHERE a_sno = x.sno
AND NOT cycle
)
SELECT sno, unnest(path) AS map FROM x ORDER BY 1
) y
GROUP BY sno
ORDER BY 1, 2
输出:
cluster | sno
--------------+-----
{4,5,6,7} | 4
{4,5,6,7} | 5
{4,5,6,7} | 6
{4,5,6,7} | 7
{9,10,13,14} | 9
{9,10,13,14} | 10
{9,10,13,14} | 13
{9,10,13,14} | 14
{11,15} | 11
{11,15} | 15
(10 rows)
再绕一次排名:
SELECT dense_rank() OVER(order by cluster) AS rank
,sno
FROM (
SELECT array_agg(DISTINCT map ORDER BY map) AS cluster
,sno
FROM ( WITH RECURSIVE x(sno, path, cycle) AS (
SELECT a_sno, ARRAY[a_sno], false FROM data
UNION ALL
SELECT b_sno, path || b_sno, b_sno = ANY(path)
FROM data, x
WHERE a_sno = x.sno
AND NOT cycle
)
SELECT sno, unnest(path) AS map FROM x ORDER BY 1
) y
GROUP BY sno
ORDER BY 1, 2
) z
输出:
rank | sno
------+-----
1 | 4
1 | 5
1 | 6
1 | 7
2 | 9
2 | 10
2 | 13
2 | 14
3 | 11
3 | 15
(10 rows)
我想说另外一种方法,可能有用,你可以分两步完成:
1. 每组取 max(sno)
:
select q.sno,
row_number() over(order by q.sno) gn
from(
select distinct d.a_sno sno
from data d
where not exists (
select b_sno
from data
where b_sno=d.a_sno
and a_sno>d.a_sno
)
)q
结果:
sno gn
7 1
14 2
15 3
2.使用recursive cte
查找群组中的所有相关成员:
with recursive cte(sno,gn,path,cycle)as(
select q.sno,
row_number() over(order by q.sno) gn,
array[q.sno],false
from(
select distinct d.a_sno sno
from data d
where not exists (
select b_sno
from data
where b_sno=d.a_sno
and a_sno>d.a_sno
)
)q
union all
select d.a_sno,c.gn,
d.a_sno || c.path,
d.a_sno=any(c.path)
from data d
join cte c on d.b_sno=c.sno
where not cycle
)
select distinct gn,sno from cte
order by gn,sno
结果:
gn sno
1 4
1 5
1 6
1 7
2 9
2 10
2 13
2 14
3 11
3 15
here is the demo 我做了什么。
假设所有对都存在于它们的镜像组合中 (4,5)
和 (5,4)
。但是以下解决方案在没有镜像复制的情况下也能正常工作。
简单案例
所有连接都可以按 单个升序排列 并且像我在 fiddle 中添加的复杂情况是不可能的,我们可以使用这个解决方案而不重复rCTE:
我从每组最小值 a_sno
开始,最小值关联 b_sno
:
SELECT row_number() OVER (ORDER BY a_sno) AS grp
, a_sno, min(b_sno) AS b_sno
FROM data d
WHERE a_sno < b_sno
AND NOT EXISTS (
SELECT 1 FROM data
WHERE b_sno = d.a_sno
AND a_sno < b_sno
)
GROUP BY a_sno;
这只需要一个查询级别,因为可以在聚合上构建 window 函数:
- Get the distinct sum of a joined table column
结果:
grp a_sno b_sno
1 4 5
2 9 10
3 11 15
我避免分支和重复(倍增)行 - 长链可能 多 更昂贵。我在相关子查询中使用 ORDER BY b_sno LIMIT 1
来使它在递归 CTE 中运行。
性能的关键是匹配索引,它已经由 PK 约束提供 PRIMARY KEY (a_sno,b_sno)
:而不是相反 :(b_sno, a_sno)
WITH RECURSIVE t AS (
SELECT row_number() OVER (ORDER BY d.a_sno) AS grp
, a_sno, min(b_sno) AS b_sno -- the smallest one
FROM data d
WHERE a_sno < b_sno
AND NOT EXISTS (
SELECT 1 FROM data
WHERE b_sno = d.a_sno
AND a_sno < b_sno
)
GROUP BY a_sno
)
, cte AS (
SELECT grp, b_sno AS sno FROM t
UNION ALL
SELECT c.grp
, (SELECT b_sno -- correlated subquery
FROM data
WHERE a_sno = c.sno
AND a_sno < b_sno
ORDER BY b_sno
LIMIT 1)
FROM cte c
WHERE c.sno IS NOT NULL
)
SELECT * FROM cte
WHERE sno IS NOT NULL -- eliminate row with NULL
UNION ALL -- no duplicates
SELECT grp, a_sno FROM t
ORDER BY grp, sno;
不太简单的案例
所有节点都可以从根(最小sno
)的一个或多个分支按升序到达。
这次把全部取大sno
,去重可能被多次访问的节点,末尾有UNION
:
WITH RECURSIVE t AS (
SELECT rank() OVER (ORDER BY d.a_sno) AS grp
, a_sno, b_sno -- get all rows for smallest a_sno
FROM data d
WHERE a_sno < b_sno
AND NOT EXISTS (
SELECT 1 FROM data
WHERE b_sno = d.a_sno
AND a_sno < b_sno
)
)
, cte AS (
SELECT grp, b_sno AS sno FROM t
UNION ALL
SELECT c.grp, d.b_sno
FROM cte c
JOIN data d ON d.a_sno = c.sno
AND d.a_sno < d.b_sno -- join to all connected rows
)
SELECT grp, sno FROM cte
UNION -- eliminate duplicates
SELECT grp, a_sno FROM t -- add first rows
ORDER BY grp, sno;
与第一个解决方案不同,我们在这里没有得到最后一行 NULL(由相关子查询引起)。
两者都应该表现得很好 - 特别是对于长链/许多分支。结果如愿:
SQL Fiddle(添加行以展示难度)。
无向图
如果存在升序遍历无法从根到达的局部极小值,上述解决方案将不起作用。在这种情况下考虑