用公共元素分组 'groups'
Grouping 'groups' with common element
我有一个如下所示的 table,其中包含一个 group_id
和一些值。
group_id | value
---------+-------
1 | A
1 | B
2 | C
2 | D
2 | A
3 | E
3 | C
4 | G
4 | H
我想得到的是每个以某种方式连接的组的唯一编号。像这样:
第1组和第2组有一个共同的元素A,第1组和第3组有一个共同的元素C > 所以这实际上是一个大组。
master_id | group_id | value
----------+----------+--------
1 | 1 | A
1 | 1 | B
1 | 2 | C
1 | 2 | D
1 | 2 | A
1 | 3 | E
1 | 3 | C
2 | 4 | G
2 | 4 | H
我怎样才能得到这个master_id
?
Sql 服务器 example:
WITH GetConnected AS (
SELECT DISTINCT g1.group_id sourceGroup, g2.group_id connectedGroup
FROM @groups g1
LEFT JOIN @groups g2
ON g1.value = g2.value
UNION ALL
SELECT g1.group_id sourceGroup, g3.connectedGroup connectedGroup
FROM @groups g1
INNER JOIN @groups g2
ON g1.value = g2.value
AND g1.group_id < g2.group_id
INNER JOIN GetConnected g3
ON g3.sourceGroup = g2.group_id
AND g3.connectedGroup > g2.group_id
), GetGroups AS (
SELECT MIN(sourceGroup) sourceGroup, connectedGroup, DENSE_RANK() OVER (ORDER BY MIN(sourceGroup)) rk
FROM GetConnected
GROUP BY connectedGroup)
SELECT gg.rk master_id, g.group_id, g.value
FROM GetGroups gg
INNER JOIN @groups g
ON gg.connectedGroup = g.group_id
ORDER BY gg.rk, gg.connectedGroup, g.value
如果你考虑postgre,我有example code:
WITH RECURSIVE GetConnected AS (
SELECT DISTINCT g1.group_id sourceGroup, g2.group_id connectedGroup
FROM groups g1
LEFT JOIN groups g2
ON g1.value = g2.value
UNION
SELECT g1.group_id sourceGroup, g3.connectedGroup connectedGroup
FROM groups g1
LEFT JOIN groups g2
ON g1.value = g2.value
INNER JOIN GetConnected g3
ON g3.sourceGroup = g2.group_id
), GetGroups AS (
SELECT MIN(sourceGroup) sourceGroup, connectedGroup, DENSE_RANK() OVER (ORDER BY MIN(sourceGroup)) rk
FROM GetConnected
GROUP BY connectedGroup)
SELECT gg.rk master_id, g.group_id, g.value
FROM GetGroups gg
INNER JOIN groups g
ON gg.connectedGroup = g.group_id
ORDER BY gg.rk, gg.connectedGroup, g.value
计算master group是一个graph-walking问题,这意味着一个递归的CTE。我会通过以下方式解决这个问题:
- 根据值在组之间生成边。
- 在不访问之前的组的情况下遍历边缘。
那么主组的计算就是每个组的访问组中的最小值。
在 SQL 中,这看起来像:
with edges as (
select distinct t1.group_id as group_id_1, t2.group_id as group_id_2
from t t1 join
t t2
on t1.value = t2.value
),
cte as (
select e.group_id_1, e.group_id_2, convert(varchar(max), concat(',', group_id_1, ',', group_id_2)) as visited, 1 as lev
from edges e
union all
select cte.group_id_1, e.group_id_2,
concat(visited, e.group_id_2, ','), lev + 1
from cte join
edges e
on e.group_id_1 = cte.group_id_2
where cte.visited not like concat('%,', e.group_id_2, ',%') and lev < 5
)
select group_id_1, dense_rank() over (order by min(group_id_2)) as master_group
from cte
group by group_id_1;
Here 是一个 db<>fiddle.
我有一个如下所示的 table,其中包含一个 group_id
和一些值。
group_id | value
---------+-------
1 | A
1 | B
2 | C
2 | D
2 | A
3 | E
3 | C
4 | G
4 | H
我想得到的是每个以某种方式连接的组的唯一编号。像这样:
第1组和第2组有一个共同的元素A,第1组和第3组有一个共同的元素C > 所以这实际上是一个大组。
master_id | group_id | value
----------+----------+--------
1 | 1 | A
1 | 1 | B
1 | 2 | C
1 | 2 | D
1 | 2 | A
1 | 3 | E
1 | 3 | C
2 | 4 | G
2 | 4 | H
我怎样才能得到这个master_id
?
Sql 服务器 example:
WITH GetConnected AS (
SELECT DISTINCT g1.group_id sourceGroup, g2.group_id connectedGroup
FROM @groups g1
LEFT JOIN @groups g2
ON g1.value = g2.value
UNION ALL
SELECT g1.group_id sourceGroup, g3.connectedGroup connectedGroup
FROM @groups g1
INNER JOIN @groups g2
ON g1.value = g2.value
AND g1.group_id < g2.group_id
INNER JOIN GetConnected g3
ON g3.sourceGroup = g2.group_id
AND g3.connectedGroup > g2.group_id
), GetGroups AS (
SELECT MIN(sourceGroup) sourceGroup, connectedGroup, DENSE_RANK() OVER (ORDER BY MIN(sourceGroup)) rk
FROM GetConnected
GROUP BY connectedGroup)
SELECT gg.rk master_id, g.group_id, g.value
FROM GetGroups gg
INNER JOIN @groups g
ON gg.connectedGroup = g.group_id
ORDER BY gg.rk, gg.connectedGroup, g.value
如果你考虑postgre,我有example code:
WITH RECURSIVE GetConnected AS (
SELECT DISTINCT g1.group_id sourceGroup, g2.group_id connectedGroup
FROM groups g1
LEFT JOIN groups g2
ON g1.value = g2.value
UNION
SELECT g1.group_id sourceGroup, g3.connectedGroup connectedGroup
FROM groups g1
LEFT JOIN groups g2
ON g1.value = g2.value
INNER JOIN GetConnected g3
ON g3.sourceGroup = g2.group_id
), GetGroups AS (
SELECT MIN(sourceGroup) sourceGroup, connectedGroup, DENSE_RANK() OVER (ORDER BY MIN(sourceGroup)) rk
FROM GetConnected
GROUP BY connectedGroup)
SELECT gg.rk master_id, g.group_id, g.value
FROM GetGroups gg
INNER JOIN groups g
ON gg.connectedGroup = g.group_id
ORDER BY gg.rk, gg.connectedGroup, g.value
计算master group是一个graph-walking问题,这意味着一个递归的CTE。我会通过以下方式解决这个问题:
- 根据值在组之间生成边。
- 在不访问之前的组的情况下遍历边缘。
那么主组的计算就是每个组的访问组中的最小值。
在 SQL 中,这看起来像:
with edges as (
select distinct t1.group_id as group_id_1, t2.group_id as group_id_2
from t t1 join
t t2
on t1.value = t2.value
),
cte as (
select e.group_id_1, e.group_id_2, convert(varchar(max), concat(',', group_id_1, ',', group_id_2)) as visited, 1 as lev
from edges e
union all
select cte.group_id_1, e.group_id_2,
concat(visited, e.group_id_2, ','), lev + 1
from cte join
edges e
on e.group_id_1 = cte.group_id_2
where cte.visited not like concat('%,', e.group_id_2, ',%') and lev < 5
)
select group_id_1, dense_rank() over (order by min(group_id_2)) as master_group
from cte
group by group_id_1;
Here 是一个 db<>fiddle.