连接多个带有 ValidFrom/ValidTo 日期的表 (SCD2)
Joining multiple tables with ValidFrom/ValidTo dates (SCD2)
问题: 如何连接 (JOIN) 多个 (3 个以上) 表,这些表都带有 SCD 类型 2 的 validFrom/validTo 日期?
我有以下表格:
-- Table 1: client versions, one row per (clientCode, startDate).
-- Only the start of each version is stored; no end-date column exists.
CREATE TABLE dbo.Clients
(
    clientCode  varchar(10)   NOT NULL,
    startDate   date          NOT NULL,
    [name]      varchar(200)  NOT NULL,
    CONSTRAINT PK_Clients
        PRIMARY KEY CLUSTERED (clientCode, startDate)
);
-- Table 2: project versions, one row per (projectCode, startDate).
-- Only the start of each version is stored; no end-date column exists.
CREATE TABLE dbo.Projects
(
    clientCode   varchar(10)   NOT NULL,  --- Each project belongs to a client.
    projectCode  varchar(10)   NOT NULL,
    startDate    date          NOT NULL,
    [name]       varchar(200)  NOT NULL,
    CONSTRAINT PK_Projects
        PRIMARY KEY CLUSTERED (projectCode, startDate)
);
.. 具有以下虚拟数据:
-- Sample data: five versions each for clients 'A' and 'B'.
-- Dates are written with the ODBC {d '...'} escape literal.
INSERT INTO dbo.Clients
    (clientCode, startDate, [name])
VALUES
    ('A', {d '2010-01-01'}, 'Client A (first)'),
    ('A', {d '2011-04-01'}, 'Client A (second)'),
    ('A', {d '2011-09-01'}, 'Client A (third)'),
    ('A', {d '2012-02-01'}, 'Client A (fourth)'),
    ('A', {d '2014-01-01'}, 'Client A (fifth)'),
    ('B', {d '2010-01-01'}, 'Client B (first)'),
    ('B', {d '2011-02-01'}, 'Client B (second)'),
    ('B', {d '2011-08-01'}, 'Client B (third)'),
    ('B', {d '2011-12-01'}, 'Client B (fourth)'),
    ('B', {d '2012-11-01'}, 'Client B (fifth)');
-- Sample data: projects 1 and 2 belong to client 'A', project 3 to client 'B'.
-- Dates are written with the ODBC {d '...'} escape literal.
INSERT INTO dbo.Projects
    (clientCode, projectCode, startDate, [name])
VALUES
    ('A', '1', {d '2010-01-15'}, 'Project 1, first revision'),
    ('A', '1', {d '2012-04-22'}, 'Project 1, second revision'),
    ('A', '2', {d '2010-02-08'}, 'Project 2, first revision'),
    ('A', '2', {d '2010-09-12'}, 'Project 2, second revision'),
    ('A', '2', {d '2012-08-18'}, 'Project 2, third revision'),
    ('B', '3', {d '2011-04-01'}, 'Project 3, first revision'),
    ('B', '3', {d '2011-12-01'}, 'Project 3, second revision'),
    ('B', '3', {d '2014-02-28'}, 'Project 3, third revision');
使用这两个表,我们生成开始日期和结束日期间隔:
--- Derives a [startDate, endDate) validity interval for every client version
--- and every project version, then pairs each client version with the project
--- versions of the same client whose intervals overlap it. Each output row
--- carries the intersection of the two intervals.
--- Clients:
WITH c (clientCode, [name], startDate, endDate) AS (
    SELECT clientCode, [name], startDate,
           --- A version ends where the next version of the same client starts.
           --- The newest version has no successor, so 2099-12-31 is used as an
           --- open-ended sentinel.
           LEAD(startDate, 1, {d '2099-12-31'}) OVER (
               PARTITION BY clientCode
               ORDER BY startDate) AS endDate
    FROM dbo.Clients),
--- Projects:
p (projectCode, clientCode, [name], startDate, endDate) AS (
    SELECT projectCode, clientCode, [name], startDate,
           --- Same rule as for clients, partitioned per project.
           LEAD(startDate, 1, {d '2099-12-31'}) OVER (
               PARTITION BY projectCode
               ORDER BY startDate) AS endDate
    FROM dbo.Projects)
SELECT c.clientCode, c.[name] AS clientName,
       p.projectCode, p.[name] AS projectName,
       --- Start date is the later of (c.startDate, p.startDate).
       --- If the LEFT JOIN found no project, p.startDate is NULL, the comparison
       --- is not true, and the ELSE branch yields c.startDate.
       (CASE WHEN c.startDate<p.startDate THEN p.startDate ELSE c.startDate END) AS startDate,
       --- End date is the earlier of (c.endDate, p.endDate).
       --- p.endDate is the tested side so that a NULL p.endDate (no matching
       --- project) falls through to c.endDate instead of producing NULL.
       (CASE WHEN p.endDate<c.endDate THEN p.endDate ELSE c.endDate END) AS endDate
FROM c
LEFT JOIN p ON
    c.clientCode=p.clientCode AND
    --- Two intervals overlap when each one starts before the other one ends.
    c.startDate<p.endDate AND
    c.endDate>p.startDate
-- IF two new tables were introduced (t3 and t4), would the following JOINS work?
-- NOTE(review): these joins only test overlap against the previous table's
-- interval, not against the intersection computed so far, and the CASE
-- expressions above only consider c and p -- confirm before enabling.
-- LEFT JOIN dbo.Table3 as t3
-- on p.clientCode = t3.clientcode AND
-- p.startdate<t3.enddate AND
-- p.endDate>t3.startdate
-- LEFT JOIN dbo.Table4 as t4
-- on t3.toolId = t4.toolid AND --> toolId is a new key that I need for the join, since t4 does not have clientCode
-- t3.startdate<t4.enddate AND
-- t3.enddate>t4.startdate
ORDER BY c.clientCode, p.projectCode, startDate;
我的问题:在上面查询的底部,我注释掉了LEFT JOINS,当引入更多的SCD2 表时我将不得不这样做。我不确定我所做的注释掉的 LEFT JOINS 是否有效。你看到它有什么问题吗?
添加更多 JOIN 可能会与上述查询中使用的 CASE WHEN 语句冲突:
--- Excerpt of the select list of the client/project query (c = client
--- intervals, p = project intervals); it yields the intersection of the two.
--- Start date is the later of (c.startDate, p.startDate).
--- A NULL p.startDate makes the comparison not true, so c.startDate is returned.
(CASE WHEN c.startDate<p.startDate THEN p.startDate ELSE c.startDate END) AS startDate,
--- End date is the earlier of (c.endDate, p.endDate).
--- NOTE(review): a NULL p.endDate (LEFT JOIN without a matching project) makes
--- the comparison not true, so the ELSE branch returns p.endDate, i.e. NULL.
(CASE WHEN c.endDate<p.endDate THEN c.endDate ELSE p.endDate END) AS endDate
使用这个 CASE WHEN 语句是因为我不想让两个时间间隔引用同一个日期。因此,输出间隔由 (a.startTime, b.startTime) 中的较大者和 (a.endTime, b.endTime) 中的较小者定义。
我在这里看到一个问题,因为此 CASE WHEN 语句仅评估 2 个表的 startDate 和 endDate 间隔,而不是 3、4 个或更多表。
如何解决这个问题?
您是否有兴趣使用 SQL Server 的 geometry 数据类型来表示时间段?我在这里将它应用到您的示例中:
--- Represents every validity period as a horizontal geometry line segment whose
--- X coordinates are the start and end dates encoded as yyyyMMdd numbers, then
--- uses STIntersection to find the period shared by a client version and a
--- project version of the same client.
--- Reads dbo.Clients / dbo.Projects, the tables created by the DDL in this script.
--- Clients:
WITH c (clientCode, [name], Perd) AS (
    SELECT clientCode, [name],
           --- Segment from startDate to the next version's startDate;
           --- 2099-12-31 closes the newest version.
           Perd=geometry::STGeomFromText('LINESTRING (' + format(startdate,'yyyyMMdd')+' 0, '+
               format(LEAD(startDate, 1, {d '2099-12-31'}) OVER (
                   PARTITION BY clientCode
                   ORDER BY startDate) , 'yyyyMMdd') +' 0)', 0)
    FROM dbo.Clients),
--- Projects:
p (projectCode, clientCode, [name], Perd) AS (
    SELECT projectCode, clientCode, [name],
           Perd=geometry::STGeomFromText('LINESTRING (' + format(startdate,'yyyyMMdd')+' 0, '+
               format(LEAD(startDate, 1, {d '2099-12-31'}) OVER (
                   PARTITION BY projectCode
                   ORDER BY startDate) , 'yyyyMMdd') +' 0)', 0)
    FROM dbo.Projects)
SELECT c.clientCode, c.[name] AS clientName,
       p.projectCode, p.[name] AS projectName,
       --- The smaller X is the start and the larger X is the end, so the result
       --- does not depend on the direction of the intersection segment.
       startDate=try_cast(format(CASE WHEN x.fromX<x.toX THEN x.fromX ELSE x.toX END, '########') as date),
       endDate=try_cast(format(CASE WHEN x.fromX<x.toX THEN x.toX ELSE x.fromX END, '########') as date)
FROM
    c
    inner join
    p on
        c.clientCode=p.clientCode
    --- Compute the intersection once and reuse it for the filter and both dates.
    cross apply (SELECT c.Perd.STIntersection(p.Perd) AS shared) AS i
    cross apply (SELECT i.shared.STStartPoint().STX AS fromX,
                        i.shared.STEndPoint().STX AS toX) AS x
--- Periods that merely touch at one date intersect in a point of length 0 and
--- are excluded.
WHERE i.shared.STLength()>0
ORDER BY c.clientCode, startDate;
这样更容易嵌套为子查询,再与另一个时态表 (temporal table) 连接。
不过,我认为对于非常大的数据集,这种方法不会很快。
问题: 如何连接 (JOIN) 多个 (3 个以上) 表,这些表都带有 SCD 类型 2 的 validFrom/validTo 日期?
我有以下表格:
-- Table 1: client versions, one row per (clientCode, startDate).
-- Only the start of each version is stored; no end-date column exists.
CREATE TABLE dbo.Clients
(
    clientCode  varchar(10)   NOT NULL,
    startDate   date          NOT NULL,
    [name]      varchar(200)  NOT NULL,
    CONSTRAINT PK_Clients
        PRIMARY KEY CLUSTERED (clientCode, startDate)
);
-- Table 2: project versions, one row per (projectCode, startDate).
-- Only the start of each version is stored; no end-date column exists.
CREATE TABLE dbo.Projects
(
    clientCode   varchar(10)   NOT NULL,  --- Each project belongs to a client.
    projectCode  varchar(10)   NOT NULL,
    startDate    date          NOT NULL,
    [name]       varchar(200)  NOT NULL,
    CONSTRAINT PK_Projects
        PRIMARY KEY CLUSTERED (projectCode, startDate)
);
.. 具有以下虚拟数据:
-- Sample data: five versions each for clients 'A' and 'B'.
-- Dates are written with the ODBC {d '...'} escape literal.
INSERT INTO dbo.Clients
    (clientCode, startDate, [name])
VALUES
    ('A', {d '2010-01-01'}, 'Client A (first)'),
    ('A', {d '2011-04-01'}, 'Client A (second)'),
    ('A', {d '2011-09-01'}, 'Client A (third)'),
    ('A', {d '2012-02-01'}, 'Client A (fourth)'),
    ('A', {d '2014-01-01'}, 'Client A (fifth)'),
    ('B', {d '2010-01-01'}, 'Client B (first)'),
    ('B', {d '2011-02-01'}, 'Client B (second)'),
    ('B', {d '2011-08-01'}, 'Client B (third)'),
    ('B', {d '2011-12-01'}, 'Client B (fourth)'),
    ('B', {d '2012-11-01'}, 'Client B (fifth)');
-- Sample data: projects 1 and 2 belong to client 'A', project 3 to client 'B'.
-- Dates are written with the ODBC {d '...'} escape literal.
INSERT INTO dbo.Projects
    (clientCode, projectCode, startDate, [name])
VALUES
    ('A', '1', {d '2010-01-15'}, 'Project 1, first revision'),
    ('A', '1', {d '2012-04-22'}, 'Project 1, second revision'),
    ('A', '2', {d '2010-02-08'}, 'Project 2, first revision'),
    ('A', '2', {d '2010-09-12'}, 'Project 2, second revision'),
    ('A', '2', {d '2012-08-18'}, 'Project 2, third revision'),
    ('B', '3', {d '2011-04-01'}, 'Project 3, first revision'),
    ('B', '3', {d '2011-12-01'}, 'Project 3, second revision'),
    ('B', '3', {d '2014-02-28'}, 'Project 3, third revision');
使用这两个表,我们生成开始日期和结束日期间隔:
--- Derives a [startDate, endDate) validity interval for every client version
--- and every project version, then pairs each client version with the project
--- versions of the same client whose intervals overlap it. Each output row
--- carries the intersection of the two intervals.
--- Clients:
WITH c (clientCode, [name], startDate, endDate) AS (
    SELECT clientCode, [name], startDate,
           --- A version ends where the next version of the same client starts.
           --- The newest version has no successor, so 2099-12-31 is used as an
           --- open-ended sentinel.
           LEAD(startDate, 1, {d '2099-12-31'}) OVER (
               PARTITION BY clientCode
               ORDER BY startDate) AS endDate
    FROM dbo.Clients),
--- Projects:
p (projectCode, clientCode, [name], startDate, endDate) AS (
    SELECT projectCode, clientCode, [name], startDate,
           --- Same rule as for clients, partitioned per project.
           LEAD(startDate, 1, {d '2099-12-31'}) OVER (
               PARTITION BY projectCode
               ORDER BY startDate) AS endDate
    FROM dbo.Projects)
SELECT c.clientCode, c.[name] AS clientName,
       p.projectCode, p.[name] AS projectName,
       --- Start date is the later of (c.startDate, p.startDate).
       --- If the LEFT JOIN found no project, p.startDate is NULL, the comparison
       --- is not true, and the ELSE branch yields c.startDate.
       (CASE WHEN c.startDate<p.startDate THEN p.startDate ELSE c.startDate END) AS startDate,
       --- End date is the earlier of (c.endDate, p.endDate).
       --- p.endDate is the tested side so that a NULL p.endDate (no matching
       --- project) falls through to c.endDate instead of producing NULL.
       (CASE WHEN p.endDate<c.endDate THEN p.endDate ELSE c.endDate END) AS endDate
FROM c
LEFT JOIN p ON
    c.clientCode=p.clientCode AND
    --- Two intervals overlap when each one starts before the other one ends.
    c.startDate<p.endDate AND
    c.endDate>p.startDate
-- IF two new tables were introduced (t3 and t4), would the following JOINS work?
-- NOTE(review): these joins only test overlap against the previous table's
-- interval, not against the intersection computed so far, and the CASE
-- expressions above only consider c and p -- confirm before enabling.
-- LEFT JOIN dbo.Table3 as t3
-- on p.clientCode = t3.clientcode AND
-- p.startdate<t3.enddate AND
-- p.endDate>t3.startdate
-- LEFT JOIN dbo.Table4 as t4
-- on t3.toolId = t4.toolid AND --> toolId is a new key that I need for the join, since t4 does not have clientCode
-- t3.startdate<t4.enddate AND
-- t3.enddate>t4.startdate
ORDER BY c.clientCode, p.projectCode, startDate;
我的问题:在上面查询的底部,我注释掉了LEFT JOINS,当引入更多的SCD2 表时我将不得不这样做。我不确定我所做的注释掉的 LEFT JOINS 是否有效。你看到它有什么问题吗?
添加更多 JOIN 可能会与上述查询中使用的 CASE WHEN 语句冲突:
--- Excerpt of the select list of the client/project query (c = client
--- intervals, p = project intervals); it yields the intersection of the two.
--- Start date is the later of (c.startDate, p.startDate).
--- A NULL p.startDate makes the comparison not true, so c.startDate is returned.
(CASE WHEN c.startDate<p.startDate THEN p.startDate ELSE c.startDate END) AS startDate,
--- End date is the earlier of (c.endDate, p.endDate).
--- NOTE(review): a NULL p.endDate (LEFT JOIN without a matching project) makes
--- the comparison not true, so the ELSE branch returns p.endDate, i.e. NULL.
(CASE WHEN c.endDate<p.endDate THEN c.endDate ELSE p.endDate END) AS endDate
使用这个 CASE WHEN 语句是因为我不想让两个时间间隔引用同一个日期。因此,输出间隔由 (a.startTime, b.startTime) 中的较大者和 (a.endTime, b.endTime) 中的较小者定义。
我在这里看到一个问题,因为此 CASE WHEN 语句仅评估 2 个表的 startDate 和 endDate 间隔,而不是 3、4 个或更多表。
如何解决这个问题?
您是否有兴趣使用 SQL Server 的 geometry 数据类型来表示时间段?我在这里将它应用到您的示例中:
--- Represents every validity period as a horizontal geometry line segment whose
--- X coordinates are the start and end dates encoded as yyyyMMdd numbers, then
--- uses STIntersection to find the period shared by a client version and a
--- project version of the same client.
--- Reads dbo.Clients / dbo.Projects, the tables created by the DDL in this script.
--- Clients:
WITH c (clientCode, [name], Perd) AS (
    SELECT clientCode, [name],
           --- Segment from startDate to the next version's startDate;
           --- 2099-12-31 closes the newest version.
           Perd=geometry::STGeomFromText('LINESTRING (' + format(startdate,'yyyyMMdd')+' 0, '+
               format(LEAD(startDate, 1, {d '2099-12-31'}) OVER (
                   PARTITION BY clientCode
                   ORDER BY startDate) , 'yyyyMMdd') +' 0)', 0)
    FROM dbo.Clients),
--- Projects:
p (projectCode, clientCode, [name], Perd) AS (
    SELECT projectCode, clientCode, [name],
           Perd=geometry::STGeomFromText('LINESTRING (' + format(startdate,'yyyyMMdd')+' 0, '+
               format(LEAD(startDate, 1, {d '2099-12-31'}) OVER (
                   PARTITION BY projectCode
                   ORDER BY startDate) , 'yyyyMMdd') +' 0)', 0)
    FROM dbo.Projects)
SELECT c.clientCode, c.[name] AS clientName,
       p.projectCode, p.[name] AS projectName,
       --- The smaller X is the start and the larger X is the end, so the result
       --- does not depend on the direction of the intersection segment.
       startDate=try_cast(format(CASE WHEN x.fromX<x.toX THEN x.fromX ELSE x.toX END, '########') as date),
       endDate=try_cast(format(CASE WHEN x.fromX<x.toX THEN x.toX ELSE x.fromX END, '########') as date)
FROM
    c
    inner join
    p on
        c.clientCode=p.clientCode
    --- Compute the intersection once and reuse it for the filter and both dates.
    cross apply (SELECT c.Perd.STIntersection(p.Perd) AS shared) AS i
    cross apply (SELECT i.shared.STStartPoint().STX AS fromX,
                        i.shared.STEndPoint().STX AS toX) AS x
--- Periods that merely touch at one date intersect in a point of length 0 and
--- are excluded.
WHERE i.shared.STLength()>0
ORDER BY c.clientCode, startDate;
这样更容易嵌套为子查询,再与另一个时态表 (temporal table) 连接。
不过,我认为对于非常大的数据集,这种方法不会很快。