SQL Server:为相关记录创建分组

SQL Server Create Grouping For Related Records

我遇到了一个有趣的场景:需要为彼此相关的记录分配一个任意的 FamilyId。

这是我们目前正在使用的结构:

DataId  OriginalDataId
3       1
4       1
5       1
6       1
3       2
4       2
5       2
6       2
7       10
8       10
9       10
11      15

我们试图做的是向所有彼此之间有关系的 DataId 添加一个 FamilyId 列。

在这个例子中,Id 3、4、5、6 与 1 有关系。但 3、4、5、6 也与 2 有关系。所以 1、2、3、4、5、6 都应该被认为属于同一个 FamilyId。

7、8、9 只和 10 有关系,因此它们被放到一个单独的 FamilyId 里。11 和 15 同理。

我期望的结果如下:

DataId  FamilyId
1       1
2       1
3       1
4       1
5       1
6       1
7       2
8       2
9       2
10      2
11      3
15      3

示例数据、结构和查询:

-- Staging table of relationship pairs: each row links one DataId to one
-- OriginalDataId that it is related to.
Declare @Results_Stage Table
(
    DataId          BigInt Not Null,
    OriginalDataId  BigInt Null
);

-- Sample pairs from the question. The target columns are named explicitly so
-- the insert does not depend on the physical column order of the table.
Insert Into @Results_Stage (DataId, OriginalDataId)
Values (3, 1), (4, 1), (5, 1), (6, 1),
       (3, 2), (4, 2), (5, 2), (6, 2),
       (7, 10), (8, 10), (9, 10),
       (11, 15);


-- Attempted numbering. Both branches number the rows of each DataId partition
-- by ascending OriginalDataId, so the value restarts at 1 for every DataId;
-- it is a per-DataId sequence, not an identifier shared by related rows.
-- Union then removes duplicate (id, number) pairs across the two branches.
Select  R.DataId,
        Row_Number() Over (Partition By R.DataId Order By R.OriginalDataId) As FamilyId
From    @Results_Stage As R
Union
Select  S.OriginalDataId,
        Row_Number() Over (Partition By S.DataId Order By S.OriginalDataId)
From    @Results_Stage As S

我很肯定我的尝试远非正确,但老实说我不确定该从哪里入手——甚至不确定这在 SQL Server 中是否可行。

有没有人知道如何解决这个问题,或者至少,有什么能为我指明正确的方向?

编辑:下面是我到目前为止提出的一个查询,用于找出应与给定 DataId 属于同一 FamilyId 的其他 DataId 记录:

-- Id whose directly related ids should be listed.
Declare @DataId BigInt = 1;

-- Children: start from every row that mentions @DataId in either column, then
-- collect the DataId of every row whose OriginalDataId equals that row's
-- DataId or OriginalDataId. This is a single expansion step, not a recursive
-- walk of the relationships.
With Children As
(
    Select  Distinct Related.DataId
    From    @Results_Stage As Seed
    Left Join @Results_Stage As Related
            On Related.OriginalDataId In (Seed.DataId, Seed.OriginalDataId)
    Where   @DataId In (Seed.DataId, Seed.OriginalDataId)
)
-- Result: the OriginalDataId values of those children, together with the
-- children themselves. Union removes duplicates across the two branches.
Select  Distinct Parent.OriginalDataId
From    Children As Child
Left Join @Results_Stage As Parent
        On Parent.DataId = Child.DataId
Union
Select  Child.DataId
From    Children As Child

以下查询使用了 FOR XML PATH:

-- For each OriginalDataId, build a comma-separated list of the DataId values
-- related to it. STUFF(..., 1, 2, '') strips the leading ', ' separator.
-- The inner ORDER BY fixes the order of the list: without it the order of the
-- concatenated values is undefined, so two OriginalDataId values with the same
-- members could produce different strings.
SELECT R.OriginalDataId,
          STUFF((
             SELECT ', ' + CAST(S.DataId AS VARCHAR(MAX))
             FROM #Results_Stage AS S
             WHERE S.OriginalDataId = R.OriginalDataId
             ORDER BY S.DataId
             FOR XML PATH(''),TYPE).value('(./text())[1]','VARCHAR(MAX)')
          ,1,2,'') AS GroupValues
   FROM #Results_Stage AS R
   GROUP BY R.OriginalDataId

可用于生成此输出:

OriginalDataId  GroupValues
===========================
1               3, 4, 5, 6
2               3, 4, 5, 6
10              7, 8, 9
15              11

使用上面的结果集,我们可以很容易地识别每个组,从而得到可以应用 DENSE_RANK() 的东西:

-- GroupedData: one row per OriginalDataId with the comma-separated list of its
-- related DataId values. The inner ORDER BY makes the list order deterministic,
-- which matters because Families compares these strings.
;WITH GroupedData AS (
   SELECT R.OriginalDataId,
          STUFF((
             SELECT ', ' + CAST(S.DataId AS VARCHAR(MAX))
             FROM #Results_Stage AS S
             WHERE S.OriginalDataId = R.OriginalDataId
             ORDER BY S.DataId
             FOR XML PATH(''),TYPE).value('(./text())[1]','VARCHAR(MAX)')
          ,1,2,'') AS GroupValues
   FROM #Results_Stage AS R
   GROUP BY R.OriginalDataId
-- Families: OriginalDataId values with an identical member list share a
-- FamilyId. Ranking follows the string order of GroupValues.
-- NOTE(review): lists that overlap without being identical get different
-- FamilyId values, so a DataId shared between them is returned once per family.
), Families AS (
   SELECT OriginalDataId, DENSE_RANK() OVER (ORDER BY GroupValues) AS FamilyId
   FROM GroupedData
)
-- The OriginalDataId values themselves ...
SELECT OriginalDataId AS DataId, FamilyId
FROM Families

UNION

-- ... plus every DataId, tagged with the family of its OriginalDataId.
-- UNION (not UNION ALL) removes the duplicates produced when a DataId is
-- related to several OriginalDataId values of the same family.
SELECT R.DataId, F.FamilyId
FROM #Results_Stage AS R
INNER JOIN Families AS F ON R.OriginalDataId = F.OriginalDataId

-- DataId is the tie-breaker so the row order within a family is deterministic.
ORDER BY FamilyId, DataId

上面的输出是:

  DataId    FamilyId
   ===================
    11      1
    15      1
    1       2
    2       2
    3       2
    4       2
    5       2
    6       2
    7       3
    8       3
    9       3
    10      3

看看这个……它看起来不太优雅,但能解决问题 :)

-- Assigns a FamilyId to every id that appears in @T, where two ids belong to
-- the same family when they are connected through any chain of
-- (DataId, OriginalDataId) pairs.
--
-- Method: every id starts with itself as its group label; each pass lowers an
-- id's label to the smallest label among its direct neighbours, and the loop
-- stops when a pass changes nothing. Labels only ever decrease, so the loop
-- terminates, and at that point every id in a connected group carries the
-- smallest id of the group. DENSE_RANK over that label yields 1, 2, 3, ...
DECLARE @T TABLE (DataId INT, OriginalDataId INT);

INSERT INTO @T (DataId, OriginalDataId)
VALUES (3, 1), (4, 1), (5, 1), (6, 1),
       (3, 2), (4, 2), (5, 2), (6, 2),
       (7, 10), (8, 10), (9, 10),
       (11, 15);

-- Show the input pairs.
SELECT DataId, OriginalDataId FROM @T;

-- Relationships in both directions, so a label can travel either way.
-- Pairs with a NULL on either side describe no relationship and are skipped.
DECLARE @Edge TABLE (FromId INT NOT NULL, ToId INT NOT NULL);

INSERT INTO @Edge (FromId, ToId)
SELECT DataId, OriginalDataId
FROM   @T
WHERE  DataId IS NOT NULL AND OriginalDataId IS NOT NULL
UNION
SELECT OriginalDataId, DataId
FROM   @T
WHERE  DataId IS NOT NULL AND OriginalDataId IS NOT NULL;

-- One row per distinct id from either column, initially labelled with itself.
DECLARE @Label TABLE (Id INT NOT NULL PRIMARY KEY, GroupId INT NOT NULL);

INSERT INTO @Label (Id, GroupId)
SELECT DataId, DataId
FROM   @T
WHERE  DataId IS NOT NULL
UNION
SELECT OriginalDataId, OriginalDataId
FROM   @T
WHERE  OriginalDataId IS NOT NULL;

DECLARE @Changed INT = 1;

WHILE @Changed > 0
BEGIN
    -- Lower each id's label to the smallest label held by a direct neighbour.
    UPDATE l
    SET    GroupId = n.MinGroupId
    FROM   @Label AS l
    INNER JOIN (
        SELECT e.FromId, MIN(nl.GroupId) AS MinGroupId
        FROM   @Edge AS e
        INNER JOIN @Label AS nl ON nl.Id = e.ToId
        GROUP BY e.FromId
    ) AS n ON n.FromId = l.Id
    WHERE  n.MinGroupId < l.GroupId;

    -- Must be read immediately after the UPDATE.
    SET @Changed = @@ROWCOUNT;
END;

-- Families are numbered in ascending order of their smallest member id.
SELECT Id AS DataId,
       DENSE_RANK() OVER (ORDER BY GroupId) AS FamilyId
FROM   @Label
ORDER BY DataId;


-- OUTPUT
DataId  FamilyId
1       1
2       1
3       1
4       1
5       1
6       1
7       2
8       2
9       2
10      2
11      3
15      3