复杂递归SQL产生层次数据
Complex recursive SQL to produce hierarchical data
我正在尝试评估商店访客对 COVID-19 传播的影响。
这是一个简单的场景:
- 访客 A 走进商店并在时间 = 0 时遇到了员工 1。
- VisitorA 然后在时间 = 1 时遇到了 Employee2。
- 访客 B 走进商店并在时间 = 1 时遇到了员工 1。
- VisitorB 然后在时间 = 2 时遇到了 Employee3。
- 访客 A 离开商店。
当我收集所有访客数据以及他们随时间遇到的人时,数据集看起来像这样:
Table visitorByEmployee
:
| VisitorID | EmployeeID | Contact |
+-----------+------------+-------------------+
| 100 | X123 | 3/11/2020 1:00 |
| 100 | X124 | 3/11/2020 1:10 |
| 101 | X123 | 3/12/2020 1:11 |
| 101 | X125 | 3/11/2020 1:20 |
| 102 | X126 | 3/12/2020 10:00 |
| 102 | X124 | 3/12/2020 10:00 |
| 103 | X123 | 3/12/2020 11:00 |
| 104 | X124 | 3/12/2020 12:00 |
| 104 | X126 | 3/12/2020 12:00 |
| 105 | X126 | 3/12/2020 12:00 |
我想根据这些数据建立一个层次结构,最终可以表示如下:
每棵树代表访客对病毒传播的影响:
100
--> X123
--> 101
--> X125
--> 103
--> X124
--> 104
102
--> X126
--> 104
--> 105
--> X124
--> 104
--> X126
我试图通过首先找到根节点(不受之前访问者影响的根访问者 and/or 他们看到的员工)来做到这一点。这些是 100 和 102。
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY EmployeeID ORDER BY Contact) AS SeenOrder
INTO
#SeenOrder
FROM
visitorByEmployee
SELECT *
INTO #RootVisitors
FROM #SeenOrder
WHERE SeenOrder = 1
从 #RootVisitors
和 #SeenOrder
,我想构建一个 table 可以告诉我影响的层次结构,并且可能会导致这样的结果:
| InitVisitorID | HLevel | EmployeeID | VisitorID |
+---------------+------------+-------------------+-------------+
| 100 | 0 | X123 | 100 |
| 100 | 0 | X124 | 100 |
| 100 | 1 | X123 | 101 |
| 100 | 1 | X123 | 103 |
| 100 | 1 | X124 | 104 |
| 100 | 2 | X125 | 101 |
| 102 | 0 | X126 | 102 |
| 102 | 0 | X124 | 102 |
| 102 | 1 | X126 | 104 |
| 102 | 1 | X126 | 105 |
| 102 | 1 | X124 | 104 |
| 102 | 2 | X126 | 104 |
这是否可以使用递归 CTE 来完成?我试图这样做,但由于层次结构从访客到员工再到访客再到员工,我很难创建递归 CTE。
更新
这是我正在研究的递归 CTE。它还没有用,但我正在分享的方法是:
; WITH exposure_tree AS (
/* == Anchor with the root visitors == */
/* == You can think of this: The Employees who were exposed by the Visior == */
SELECT re.VisitorID InitVisitor,
1 as Level,
CASE WHEN 1%2=1 THEN 'Visitor' ELSE 'Employee' END ExposerType,
re.VisitorID Exposer,
re.EmployeeID Exposee,
re.SeenOrder,
re.InitialContact
FROM #SeenOrder re
WHERE re.SeenOrder = 1
/* == Recursive Part #1 ==
Get the visitors who were exposed next by the exposed employees
*/
UNION ALL
SELECT et.VisitorID InitVisitor,
Level + 1,
CASE WHEN (Level+1)%2=1 THEN 'Visitor' ELSE 'Employee' END ExposerType,
re.EmployeeID,
re.VisitorID, -- These are switched from the anchor.
re.SeenOrder,
re.InitialContact
FROM #SeenOrder re
JOIN exposure_tree et ON et.Exposee = re.EmployeeID AND re.SeenOrder > 1 AND re.InitialContact > et.InitialContact
UNION ALL
/* == Recursive Part #2 ==
Get the next employees who were exposed the second level exposed visitors
*/
SELECT et.VisitorID InitVisitor,
Level + 2,
CASE WHEN (Level+2)%2=1 THEN 'Visitor' ELSE 'Employee' END ExposerType,
re.VisitorID,
re.EmployeeID,
re.SeenOrder,
re.InitialContact
FROM #ROOT_EXPOSURES re
JOIN exposure_tree et ON re.VisitorID = et.Exposer and re.SeenOrder > 1 AND re.InitialContact > et.InitialContact
)
select top 1000 * from exposure_tree ORDER BY InitVisitor, Level
您仍然可以使用这些表编写递归 CTE。不过,编码变得很棘手。
这是 CTE。您可能需要调整它以获得您想要的。为了简单起见,我更改了列名:
with
c as (
select 'v' as type, vid as id, contact, 0 as lvl, cast(concat('/', vid, '/') as varchar(255)) as path
from (select *, row_number() over(partition by vid order by contact) as rn from v) x where rn = 1
union all
select
case when type = 'v' then 'e' else 'v' end, -- type
case when type = 'v' then v.eid else v.vid end, -- id
v.contact,
c.lvl + 1,
cast(concat(c.path, case when type = 'v' then v.eid else v.vid end, '/') as varchar(255))
from c
join v on c.lvl <= 10 and v.contact >= c.contact and (c.type = 'v' and v.vid = c.id or c.type = 'e' and v.eid = c.id)
and c.path not like concat('%', case when type = 'v' then v.eid else v.vid end, '%')
)
select * from c order by path
结果:
type id contact lvl path
---- ---- --------------------- --- -----------------------
v 100 2020-03-11 01:00:00.0 0 /100/
e X123 2020-03-11 01:00:00.0 1 /100/X123/
v 101 2020-03-12 01:11:00.0 2 /100/X123/101/
v 103 2020-03-12 11:00:00.0 2 /100/X123/103/
e X124 2020-03-11 01:10:00.0 1 /100/X124/
v 102 2020-03-12 10:00:00.0 2 /100/X124/102/
e X126 2020-03-12 10:00:00.0 3 /100/X124/102/X126/
v 104 2020-03-12 12:00:00.0 4 /100/X124/102/X126/104/
v 105 2020-03-12 12:00:00.0 4 /100/X124/102/X126/105/
v 104 2020-03-12 12:00:00.0 2 /100/X124/104/
e X126 2020-03-12 12:00:00.0 3 /100/X124/104/X126/
v 105 2020-03-12 12:00:00.0 4 /100/X124/104/X126/105/
v 101 2020-03-11 01:20:00.0 0 /101/
e X123 2020-03-12 01:11:00.0 1 /101/X123/
v 103 2020-03-12 11:00:00.0 2 /101/X123/103/
e X125 2020-03-11 01:20:00.0 1 /101/X125/
v 102 2020-03-12 10:00:00.0 0 /102/
e X124 2020-03-12 10:00:00.0 1 /102/X124/
v 104 2020-03-12 12:00:00.0 2 /102/X124/104/
e X126 2020-03-12 12:00:00.0 3 /102/X124/104/X126/
v 105 2020-03-12 12:00:00.0 4 /102/X124/104/X126/105/
e X126 2020-03-12 10:00:00.0 1 /102/X126/
v 104 2020-03-12 12:00:00.0 2 /102/X126/104/
e X124 2020-03-12 12:00:00.0 3 /102/X126/104/X124/
v 105 2020-03-12 12:00:00.0 2 /102/X126/105/
v 103 2020-03-12 11:00:00.0 0 /103/
e X123 2020-03-12 11:00:00.0 1 /103/X123/
v 104 2020-03-12 12:00:00.0 0 /104/
e X124 2020-03-12 12:00:00.0 1 /104/X124/
e X126 2020-03-12 12:00:00.0 1 /104/X126/
v 105 2020-03-12 12:00:00.0 2 /104/X126/105/
v 105 2020-03-12 12:00:00.0 0 /105/
e X126 2020-03-12 12:00:00.0 1 /105/X126/
v 104 2020-03-12 12:00:00.0 2 /105/X126/104/
e X124 2020-03-12 12:00:00.0 3 /105/X126/104/X124/
供参考,这是我用来测试的数据脚本,如果你需要创建一个SQL Fiddle到运行它:
create table v (
vid varchar(6),
eid varchar(6),
contact datetime
);
insert into v (vid, eid, contact) values ('100', 'X123', '2020-03-11 01:00:00');
insert into v (vid, eid, contact) values ('100', 'X124', '2020-03-11 01:10:00');
insert into v (vid, eid, contact) values ('101', 'X123', '2020-03-12 01:11:00');
insert into v (vid, eid, contact) values ('101', 'X125', '2020-03-11 01:20:00');
insert into v (vid, eid, contact) values ('102', 'X126', '2020-03-12 10:00:00');
insert into v (vid, eid, contact) values ('102', 'X124', '2020-03-12 10:00:00');
insert into v (vid, eid, contact) values ('103', 'X123', '2020-03-12 11:00:00');
insert into v (vid, eid, contact) values ('104', 'X124', '2020-03-12 12:00:00');
insert into v (vid, eid, contact) values ('104', 'X126', '2020-03-12 12:00:00');
insert into v (vid, eid, contact) values ('105', 'X126', '2020-03-12 12:00:00');
我正在尝试评估商店访客对 COVID-19 传播的影响。
这是一个简单的场景:
- 访客 A 走进商店并在时间 = 0 时遇到了员工 1。
- VisitorA 然后在时间 = 1 时遇到了 Employee2。
- 访客 B 走进商店并在时间 = 1 时遇到了员工 1。
- VisitorB 然后在时间 = 2 时遇到了 Employee3。
- 访客 A 离开商店。
当我收集所有访客数据以及他们随时间遇到的人时,数据集看起来像这样:
Table visitorByEmployee
:
| VisitorID | EmployeeID | Contact |
+-----------+------------+-------------------+
| 100 | X123 | 3/11/2020 1:00 |
| 100 | X124 | 3/11/2020 1:10 |
| 101 | X123 | 3/12/2020 1:11 |
| 101 | X125 | 3/11/2020 1:20 |
| 102 | X126 | 3/12/2020 10:00 |
| 102 | X124 | 3/12/2020 10:00 |
| 103 | X123 | 3/12/2020 11:00 |
| 104 | X124 | 3/12/2020 12:00 |
| 104 | X126 | 3/12/2020 12:00 |
| 105 | X126 | 3/12/2020 12:00 |
我想根据这些数据建立一个层次结构,最终可以表示如下:
每棵树代表访客对病毒传播的影响:
100
--> X123
--> 101
--> X125
--> 103
--> X124
--> 104
102
--> X126
--> 104
--> 105
--> X124
--> 104
--> X126
我试图通过首先找到根节点(不受之前访问者影响的根访问者 and/or 他们看到的员工)来做到这一点。这些是 100 和 102。
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY EmployeeID ORDER BY Contact) AS SeenOrder
INTO
#SeenOrder
FROM
visitorByEmployee
SELECT *
INTO #RootVisitors
FROM #SeenOrder
WHERE SeenOrder = 1
从 #RootVisitors
和 #SeenOrder
,我想构建一个 table 可以告诉我影响的层次结构,并且可能会导致这样的结果:
| InitVisitorID | HLevel | EmployeeID | VisitorID |
+---------------+------------+-------------------+-------------+
| 100 | 0 | X123 | 100 |
| 100 | 0 | X124 | 100 |
| 100 | 1 | X123 | 101 |
| 100 | 1 | X123 | 103 |
| 100 | 1 | X124 | 104 |
| 100 | 2 | X125 | 101 |
| 102 | 0 | X126 | 102 |
| 102 | 0 | X124 | 102 |
| 102 | 1 | X126 | 104 |
| 102 | 1 | X126 | 105 |
| 102 | 1 | X124 | 104 |
| 102 | 2 | X126 | 104 |
这是否可以使用递归 CTE 来完成?我试图这样做,但由于层次结构从访客到员工再到访客再到员工,我很难创建递归 CTE。
更新 这是我正在研究的递归 CTE。它还没有用,但我正在分享的方法是:
; WITH exposure_tree AS (
/* == Anchor with the root visitors == */
/* == You can think of this: The Employees who were exposed by the Visior == */
SELECT re.VisitorID InitVisitor,
1 as Level,
CASE WHEN 1%2=1 THEN 'Visitor' ELSE 'Employee' END ExposerType,
re.VisitorID Exposer,
re.EmployeeID Exposee,
re.SeenOrder,
re.InitialContact
FROM #SeenOrder re
WHERE re.SeenOrder = 1
/* == Recursive Part #1 ==
Get the visitors who were exposed next by the exposed employees
*/
UNION ALL
SELECT et.VisitorID InitVisitor,
Level + 1,
CASE WHEN (Level+1)%2=1 THEN 'Visitor' ELSE 'Employee' END ExposerType,
re.EmployeeID,
re.VisitorID, -- These are switched from the anchor.
re.SeenOrder,
re.InitialContact
FROM #SeenOrder re
JOIN exposure_tree et ON et.Exposee = re.EmployeeID AND re.SeenOrder > 1 AND re.InitialContact > et.InitialContact
UNION ALL
/* == Recursive Part #2 ==
Get the next employees who were exposed the second level exposed visitors
*/
SELECT et.VisitorID InitVisitor,
Level + 2,
CASE WHEN (Level+2)%2=1 THEN 'Visitor' ELSE 'Employee' END ExposerType,
re.VisitorID,
re.EmployeeID,
re.SeenOrder,
re.InitialContact
FROM #ROOT_EXPOSURES re
JOIN exposure_tree et ON re.VisitorID = et.Exposer and re.SeenOrder > 1 AND re.InitialContact > et.InitialContact
)
select top 1000 * from exposure_tree ORDER BY InitVisitor, Level
您仍然可以使用这些表编写递归 CTE。不过,编码变得很棘手。
这是 CTE。您可能需要调整它以获得您想要的。为了简单起见,我更改了列名:
with
c as (
select 'v' as type, vid as id, contact, 0 as lvl, cast(concat('/', vid, '/') as varchar(255)) as path
from (select *, row_number() over(partition by vid order by contact) as rn from v) x where rn = 1
union all
select
case when type = 'v' then 'e' else 'v' end, -- type
case when type = 'v' then v.eid else v.vid end, -- id
v.contact,
c.lvl + 1,
cast(concat(c.path, case when type = 'v' then v.eid else v.vid end, '/') as varchar(255))
from c
join v on c.lvl <= 10 and v.contact >= c.contact and (c.type = 'v' and v.vid = c.id or c.type = 'e' and v.eid = c.id)
and c.path not like concat('%', case when type = 'v' then v.eid else v.vid end, '%')
)
select * from c order by path
结果:
type id contact lvl path
---- ---- --------------------- --- -----------------------
v 100 2020-03-11 01:00:00.0 0 /100/
e X123 2020-03-11 01:00:00.0 1 /100/X123/
v 101 2020-03-12 01:11:00.0 2 /100/X123/101/
v 103 2020-03-12 11:00:00.0 2 /100/X123/103/
e X124 2020-03-11 01:10:00.0 1 /100/X124/
v 102 2020-03-12 10:00:00.0 2 /100/X124/102/
e X126 2020-03-12 10:00:00.0 3 /100/X124/102/X126/
v 104 2020-03-12 12:00:00.0 4 /100/X124/102/X126/104/
v 105 2020-03-12 12:00:00.0 4 /100/X124/102/X126/105/
v 104 2020-03-12 12:00:00.0 2 /100/X124/104/
e X126 2020-03-12 12:00:00.0 3 /100/X124/104/X126/
v 105 2020-03-12 12:00:00.0 4 /100/X124/104/X126/105/
v 101 2020-03-11 01:20:00.0 0 /101/
e X123 2020-03-12 01:11:00.0 1 /101/X123/
v 103 2020-03-12 11:00:00.0 2 /101/X123/103/
e X125 2020-03-11 01:20:00.0 1 /101/X125/
v 102 2020-03-12 10:00:00.0 0 /102/
e X124 2020-03-12 10:00:00.0 1 /102/X124/
v 104 2020-03-12 12:00:00.0 2 /102/X124/104/
e X126 2020-03-12 12:00:00.0 3 /102/X124/104/X126/
v 105 2020-03-12 12:00:00.0 4 /102/X124/104/X126/105/
e X126 2020-03-12 10:00:00.0 1 /102/X126/
v 104 2020-03-12 12:00:00.0 2 /102/X126/104/
e X124 2020-03-12 12:00:00.0 3 /102/X126/104/X124/
v 105 2020-03-12 12:00:00.0 2 /102/X126/105/
v 103 2020-03-12 11:00:00.0 0 /103/
e X123 2020-03-12 11:00:00.0 1 /103/X123/
v 104 2020-03-12 12:00:00.0 0 /104/
e X124 2020-03-12 12:00:00.0 1 /104/X124/
e X126 2020-03-12 12:00:00.0 1 /104/X126/
v 105 2020-03-12 12:00:00.0 2 /104/X126/105/
v 105 2020-03-12 12:00:00.0 0 /105/
e X126 2020-03-12 12:00:00.0 1 /105/X126/
v 104 2020-03-12 12:00:00.0 2 /105/X126/104/
e X124 2020-03-12 12:00:00.0 3 /105/X126/104/X124/
供参考,这是我用来测试的数据脚本,如果你需要创建一个SQL Fiddle到运行它:
create table v (
vid varchar(6),
eid varchar(6),
contact datetime
);
insert into v (vid, eid, contact) values ('100', 'X123', '2020-03-11 01:00:00');
insert into v (vid, eid, contact) values ('100', 'X124', '2020-03-11 01:10:00');
insert into v (vid, eid, contact) values ('101', 'X123', '2020-03-12 01:11:00');
insert into v (vid, eid, contact) values ('101', 'X125', '2020-03-11 01:20:00');
insert into v (vid, eid, contact) values ('102', 'X126', '2020-03-12 10:00:00');
insert into v (vid, eid, contact) values ('102', 'X124', '2020-03-12 10:00:00');
insert into v (vid, eid, contact) values ('103', 'X123', '2020-03-12 11:00:00');
insert into v (vid, eid, contact) values ('104', 'X124', '2020-03-12 12:00:00');
insert into v (vid, eid, contact) values ('104', 'X126', '2020-03-12 12:00:00');
insert into v (vid, eid, contact) values ('105', 'X126', '2020-03-12 12:00:00');