SQL Server - 使用 EffectiveFrom 和 EffectiveTo 日期更新数据 - Upsert 优化
SQL Server - Updating data with effectiveFrom and EffectiveTo dates - Upsert optimisation
我们需要创建一个保持时间有效性的表(即对于给定的键——本例中为下表中的 Md5——不会存在重叠的时间段)。用户需要能够自行设置 EffectiveFrom
和 EffectiveTo
日期,因此时态表(temporal table)并不适用,因为它们似乎只允许使用系统生成的日期。用例是:批量数据会连同指定的有效日期范围一起上传,并且需要与现有数据合并,以确保不出现时间重叠。
Table 定义:
-- Rebuilds dbo.IngestedData: one row per Md5 key and effective date range.
-- WARNING: destructive - an existing dbo.IngestedData table (and its data) is dropped first.
-- Every statement is schema-qualified so the DROP/CREATE always act on the same
-- object that the OBJECT_ID existence check looked at.
IF OBJECT_ID('dbo.IngestedData', 'U') IS NOT NULL
    DROP TABLE dbo.IngestedData;

CREATE TABLE dbo.IngestedData
(
    ID            INT IDENTITY(1,1) NOT NULL,   -- surrogate key
    Md5           VARCHAR(15)       NOT NULL,   -- key that owns the date ranges
    EffectiveFrom DATE              NOT NULL,
    EffectiveTo   DATE              NOT NULL,
    UpdateUser    VARCHAR(50),
    JsonData      VARCHAR(MAX),
    -- Named explicitly (instead of a system-generated name) so it can be referenced in migrations.
    CONSTRAINT PK_IngestedData PRIMARY KEY (ID),
    -- A range must end strictly after it starts.
    CONSTRAINT CK_IngestedData_Start_End CHECK (EffectiveFrom < EffectiveTo),
    -- A key cannot have two ranges starting on the same day.
    CONSTRAINT UK_IngestedData_Md5_Start_End UNIQUE (Md5, EffectiveFrom)
);

-- NOTE(review): the UNIQUE constraint on (Md5, EffectiveFrom) is already backed by an
-- index whose leading column is Md5, so this index is probably redundant - confirm
-- against real query plans before dropping it.
CREATE NONCLUSTERED INDEX AK_IngestedData_Md5
    ON dbo.IngestedData (Md5);

CREATE NONCLUSTERED INDEX AK_IngestedData_EffectiveFrom
    ON dbo.IngestedData (EffectiveFrom);

CREATE NONCLUSTERED INDEX AK_IngestedData_EffectiveTo
    ON dbo.IngestedData (EffectiveTo);
我编写了一个更新插入过程,它适用于单行更新,如下所示:
更新过程:
-- Inserts one new effective-dated row for @Md5 and reshapes any existing rows of the
-- same key that overlap the new range:
--   * rows fully covered by the new range are deleted;
--   * rows starting before the new range keep their left part (ending the day before @EffectiveFrom);
--   * rows ending after the new range keep their right part (starting the day after @EffectiveTo).
-- Parameters:
--   @Md5            key of the row
--   @EffectiveFrom  first day of the new range
--   @EffectiveTo    last day of the new range
--   @UpdateUser     stored in UpdateUser of the new row
--   @JsonData       stored in JsonData of the new row
-- Errors: any failure rolls the transaction back and is re-thrown to the caller.
-- NOTE(review): the date parameters are DATETIME while the table columns are DATE;
--   a time-of-day part takes part in the overlap comparison before being dropped on
--   insert - callers should presumably pass whole days only; confirm.
-- NOTE(review): a remnant that would be exactly one day long gets
--   EffectiveFrom = EffectiveTo, which the table's CK_IngestedData_Start_End check
--   (EffectiveFrom < EffectiveTo) rejects; the whole upsert then fails and is rolled back.
CREATE PROCEDURE dbo.usp_UpsertIngestedDataRow
    @Md5           VARCHAR(20),
    @EffectiveFrom DateTime,
    @EffectiveTo   DateTime,
    @UpdateUser    VARCHAR(50),
    @JsonData      VARCHAR(MAX)
AS
BEGIN
    SET NOCOUNT ON;
    -- Any run-time error dooms the transaction so it can never be left half-applied.
    SET XACT_ABORT ON;

    BEGIN TRY
        BEGIN TRANSACTION;

        -- Collect every row to write ('I') or remove ('D') in one pass.
        -- Existing rows that are re-inserted in truncated form carry their original ID
        -- so that the DELETE below removes the untruncated version.
        WITH OverlappingRows AS
        (
            SELECT x.ID, x.Md5, x.EffectiveFrom, x.EffectiveTo, x.UpdateUser, x.JsonData
            FROM dbo.IngestedData AS x
            WHERE x.Md5 = @Md5
              AND x.EffectiveFrom < @EffectiveTo
              AND x.EffectiveTo > @EffectiveFrom
        )
        SELECT
            Data.ID,
            Data.Md5,
            Data.EffectiveFrom,
            Data.EffectiveTo,
            Data.UpdateUser,
            Data.JsonData,
            Data.[Action]
        INTO #Temp
        FROM
        (
            -- The new row itself (no existing ID).
            SELECT
                CAST(NULL AS INT) AS ID,
                @Md5              AS Md5,
                @EffectiveFrom    AS EffectiveFrom,
                @EffectiveTo      AS EffectiveTo,
                @UpdateUser       AS UpdateUser,
                @JsonData         AS JsonData,
                'I'               AS [Action]

            UNION ALL

            -- Left remnant of rows that start before the new range.
            SELECT o.ID, o.Md5, o.EffectiveFrom, DATEADD(DAY, -1, @EffectiveFrom),
                   o.UpdateUser, o.JsonData, 'I'
            FROM OverlappingRows AS o
            WHERE o.EffectiveFrom < @EffectiveFrom

            UNION ALL

            -- Right remnant of rows that end after the new range.
            SELECT o.ID, o.Md5, DATEADD(DAY, 1, @EffectiveTo), o.EffectiveTo,
                   o.UpdateUser, o.JsonData, 'I'
            FROM OverlappingRows AS o
            WHERE o.EffectiveTo > @EffectiveTo

            UNION ALL

            -- Rows completely covered by the new range: delete only.
            SELECT o.ID, o.Md5, o.EffectiveFrom, o.EffectiveTo,
                   o.UpdateUser, o.JsonData, 'D'
            FROM OverlappingRows AS o
            WHERE @EffectiveFrom <= o.EffectiveFrom
              AND @EffectiveTo >= o.EffectiveTo
        ) AS Data;

        -- Remove every existing row that is being replaced or truncated.
        DELETE FROM dbo.IngestedData
        WHERE ID IN (SELECT t.ID FROM #Temp AS t WHERE t.ID IS NOT NULL);

        -- Write the new row plus the truncated remnants.
        INSERT INTO dbo.IngestedData (Md5, EffectiveFrom, EffectiveTo, UpdateUser, JsonData)
        SELECT t.Md5, t.EffectiveFrom, t.EffectiveTo, t.UpdateUser, t.JsonData
        FROM #Temp AS t
        WHERE t.[Action] = 'I';

        -- #Temp is local to this procedure and is dropped automatically when it returns.
        COMMIT TRANSACTION;
    END TRY
    BEGIN CATCH
        -- Only roll back when a transaction is still open; an unconditional ROLLBACK
        -- would raise its own error and hide the original one.
        IF @@TRANCOUNT > 0
            ROLLBACK TRANSACTION;
        THROW;
    END CATCH
END
GO
单个调用性能良好,即使 table 填充了 10,000,000 条记录,调用也需要大约 7 毫秒。问题是进行大量更新。通过游标对 35,000 条记录执行上述存储过程大约需要 5 分钟。
我尝试重写程序以获取一个 table 变量,这将允许 DML 使用集合操作,但在逻辑中迷失了方向。任何人都可以帮助将上述逻辑转换为遵循此模式的基于集合的更新:
新存储过程:
-- Signature sketch of the desired set-based upsert; only the header is shown in this
-- listing (no body follows BEGIN).
--   @UpdateUser  VARCHAR(15) value supplied by the caller (presumably the user to record on written rows)
--   @NewRows     read-only table-valued parameter of type DataIngestionRecord
CREATE PROCEDURE usp_BulkUpsertIngestedData
@UpdateUser VARCHAR(15),
@NewRows DataIngestionRecord READONLY
AS
BEGIN
类型定义
-- Table type used as the table-valued parameter of the bulk upsert procedures:
-- one record per key (Md5) with its effective date range and a JsonData value.
CREATE TYPE DataIngestionRecord AS TABLE
(
    Md5           VARCHAR(15)  NOT NULL,
    EffectiveFrom DATE         NOT NULL,
    EffectiveTo   DATE         NOT NULL,
    JsonData      VARCHAR(MAX)
);
在过程中尝试禁用然后重建索引和删除 CTE 后,我发现在使用逐行更新时,性能实际上根本没有提高。
我换了一种思路:规定在任何一次更新中,每个唯一的 Md5 只能对应一个新的时间范围,从而限制了更新插入的用例。这简化了将存储过程转换为基于集合的操作所需的逻辑(并且符合我们的要求)。
我确实采纳了@Tanner 的建议并从存储过程中删除了链式 CTE。最终的存储过程为:
-- Set-based upsert of effective-dated rows.
-- For every uploaded row the existing rows of the same Md5 that overlap it are reshaped:
--   * rows fully covered by the new range are deleted;
--   * rows starting before the new range are cut to end the day before it starts;
--   * rows starting inside the new range but ending after it are cut to start the day after it ends;
--   * rows spanning the whole new range are split: the left part is kept in place and
--     the right part is inserted as a new row.
-- Parameters:
--   @NewRows     uploaded rows; at most one row per Md5 is supported
--   @UpdateUser  stored in UpdateUser of every uploaded row
-- Errors:
--   50002 when @NewRows holds more than one row for the same Md5 (nothing is written);
--   any failure while writing rolls the whole batch back and is re-thrown.
-- NOTE(review): a truncated remnant that would be exactly one day long gets
--   EffectiveFrom = EffectiveTo, which the table's CK_IngestedData_Start_End check
--   (EffectiveFrom < EffectiveTo) rejects; the batch then fails and is rolled back.
CREATE PROCEDURE dbo.usp_UpsertIngestedDataSet
    @NewRows    DataIngestionRecord READONLY,
    @UpdateUser VARCHAR(15)
AS
BEGIN
    SET NOCOUNT ON;
    -- Any run-time error dooms the transaction, so a failing statement can never be
    -- followed by a COMMIT of the statements around it.
    SET XACT_ABORT ON;

    --Ensure that there are not multiple temporal regions in the update data for a given key
    IF EXISTS (SELECT 1 FROM @NewRows GROUP BY Md5 HAVING COUNT(*) > 1)
    BEGIN
        DECLARE @Err VARCHAR(MAX);

        -- Build the sorted, CR-separated list of offending keys. FOR XML PATH is used
        -- because "SELECT @var = @var + col ... ORDER BY" is not guaranteed to visit every row.
        SELECT @Err = STUFF(
            (
                SELECT CHAR(13) + d.Md5
                FROM (SELECT Md5 FROM @NewRows GROUP BY Md5 HAVING COUNT(*) > 1) AS d
                ORDER BY d.Md5
                FOR XML PATH(''), TYPE
            ).value('.', 'VARCHAR(MAX)'), 1, 1, '');

        SET @Err = 'The following Md5 values have multiple temporal ranges in the uploaded data which is not supported: ' + char(13) + @Err;
        THROW 50002, @Err, 1;
    END

    --Determine all overlapping rows from the existing data set.
    -- The validation above guarantees one new row per Md5, so each existing row matches
    -- at most one new range; that range is carried along as NewFrom/NewTo.
    SELECT
        ex.ID,
        ex.Md5,
        ex.EffectiveFrom,
        ex.EffectiveTo,
        ex.UpdateUser,
        ex.JsonData,
        nr.EffectiveFrom AS NewFrom,
        nr.EffectiveTo   AS NewTo
    INTO #OverlappingRecords
    FROM dbo.IngestedData AS ex
    INNER JOIN @NewRows AS nr
        ON ex.Md5 = nr.Md5
       AND ex.EffectiveFrom < nr.EffectiveTo
       AND ex.EffectiveTo > nr.EffectiveFrom;

    BEGIN TRY
        BEGIN TRANSACTION;

        --Delete all overwritten regions (i.e. existing temporal ranges that are completely replaced by a new range)
        DELETE ex
        FROM dbo.IngestedData AS ex
        INNER JOIN #OverlappingRecords AS ol
            ON ol.ID = ex.ID
        WHERE ol.NewFrom <= ol.EffectiveFrom
          AND ol.NewTo >= ol.EffectiveTo;

        -- Truncate rows that start before the new range (this includes the left part of
        -- rows that span the whole new range).
        UPDATE ex
        SET EffectiveTo = DATEADD(DAY, -1, ol.NewFrom)
        FROM dbo.IngestedData AS ex
        INNER JOIN #OverlappingRecords AS ol
            ON ol.ID = ex.ID
        WHERE ol.EffectiveFrom < ol.NewFrom;

        -- Truncate rows that start on or after the new range's first day and end after it.
        -- ">=" matters: a row starting on the same day as the new range must move out of
        -- the way, otherwise the insert below violates UNIQUE (Md5, EffectiveFrom).
        UPDATE ex
        SET EffectiveFrom = DATEADD(DAY, 1, ol.NewTo)
        FROM dbo.IngestedData AS ex
        INNER JOIN #OverlappingRecords AS ol
            ON ol.ID = ex.ID
        WHERE ol.EffectiveFrom >= ol.NewFrom
          AND ol.EffectiveTo > ol.NewTo;

        --Insert New Data (both from uploaded data and from existing region splits).
        -- Runs after the truncations so the new rows never collide with a row that is
        -- about to be moved. The two branches cannot produce the same row, hence UNION ALL.
        INSERT INTO dbo.IngestedData (Md5, EffectiveFrom, EffectiveTo, UpdateUser, JsonData)
        SELECT nr.Md5, nr.EffectiveFrom, nr.EffectiveTo, @UpdateUser, nr.JsonData
        FROM @NewRows AS nr
        UNION ALL
        -- Right-hand remainder of existing rows that span the whole new range; it keeps
        -- the original row's UpdateUser and JsonData.
        SELECT ol.Md5, DATEADD(DAY, 1, ol.NewTo), ol.EffectiveTo, ol.UpdateUser, ol.JsonData
        FROM #OverlappingRecords AS ol
        WHERE ol.EffectiveFrom < ol.NewFrom
          AND ol.EffectiveTo > ol.NewTo;

        COMMIT TRANSACTION;
    END TRY
    BEGIN CATCH
        -- Undo the partial batch (if a transaction is still open) and surface the original error.
        IF @@TRANCOUNT > 0
            ROLLBACK TRANSACTION;
        THROW;
    END CATCH
END
GO
将此代码转换为基于集合的逻辑带来了显著的差别:此版本现在只需 7370 毫秒即可完成针对 1,000,000 行数据的 20,000 次更新。
我们需要创建一个保持时间有效性的表(即对于给定的键——本例中为下表中的 Md5——不会存在重叠的时间段)。用户需要能够自行设置 EffectiveFrom
和 EffectiveTo
日期,因此时态表(temporal table)并不适用,因为它们似乎只允许使用系统生成的日期。用例是:批量数据会连同指定的有效日期范围一起上传,并且需要与现有数据合并,以确保不出现时间重叠。
Table 定义:
-- Rebuilds dbo.IngestedData: one row per Md5 key and effective date range.
-- WARNING: destructive - an existing dbo.IngestedData table (and its data) is dropped first.
-- Every statement is schema-qualified so the DROP/CREATE always act on the same
-- object that the OBJECT_ID existence check looked at.
IF OBJECT_ID('dbo.IngestedData', 'U') IS NOT NULL
    DROP TABLE dbo.IngestedData;

CREATE TABLE dbo.IngestedData
(
    ID            INT IDENTITY(1,1) NOT NULL,   -- surrogate key
    Md5           VARCHAR(15)       NOT NULL,   -- key that owns the date ranges
    EffectiveFrom DATE              NOT NULL,
    EffectiveTo   DATE              NOT NULL,
    UpdateUser    VARCHAR(50),
    JsonData      VARCHAR(MAX),
    -- Named explicitly (instead of a system-generated name) so it can be referenced in migrations.
    CONSTRAINT PK_IngestedData PRIMARY KEY (ID),
    -- A range must end strictly after it starts.
    CONSTRAINT CK_IngestedData_Start_End CHECK (EffectiveFrom < EffectiveTo),
    -- A key cannot have two ranges starting on the same day.
    CONSTRAINT UK_IngestedData_Md5_Start_End UNIQUE (Md5, EffectiveFrom)
);

-- NOTE(review): the UNIQUE constraint on (Md5, EffectiveFrom) is already backed by an
-- index whose leading column is Md5, so this index is probably redundant - confirm
-- against real query plans before dropping it.
CREATE NONCLUSTERED INDEX AK_IngestedData_Md5
    ON dbo.IngestedData (Md5);

CREATE NONCLUSTERED INDEX AK_IngestedData_EffectiveFrom
    ON dbo.IngestedData (EffectiveFrom);

CREATE NONCLUSTERED INDEX AK_IngestedData_EffectiveTo
    ON dbo.IngestedData (EffectiveTo);
我编写了一个更新插入过程,它适用于单行更新,如下所示:
更新过程:
-- Inserts one new effective-dated row for @Md5 and reshapes any existing rows of the
-- same key that overlap the new range:
--   * rows fully covered by the new range are deleted;
--   * rows starting before the new range keep their left part (ending the day before @EffectiveFrom);
--   * rows ending after the new range keep their right part (starting the day after @EffectiveTo).
-- Parameters:
--   @Md5            key of the row
--   @EffectiveFrom  first day of the new range
--   @EffectiveTo    last day of the new range
--   @UpdateUser     stored in UpdateUser of the new row
--   @JsonData       stored in JsonData of the new row
-- Errors: any failure rolls the transaction back and is re-thrown to the caller.
-- NOTE(review): the date parameters are DATETIME while the table columns are DATE;
--   a time-of-day part takes part in the overlap comparison before being dropped on
--   insert - callers should presumably pass whole days only; confirm.
-- NOTE(review): a remnant that would be exactly one day long gets
--   EffectiveFrom = EffectiveTo, which the table's CK_IngestedData_Start_End check
--   (EffectiveFrom < EffectiveTo) rejects; the whole upsert then fails and is rolled back.
CREATE PROCEDURE dbo.usp_UpsertIngestedDataRow
    @Md5           VARCHAR(20),
    @EffectiveFrom DateTime,
    @EffectiveTo   DateTime,
    @UpdateUser    VARCHAR(50),
    @JsonData      VARCHAR(MAX)
AS
BEGIN
    SET NOCOUNT ON;
    -- Any run-time error dooms the transaction so it can never be left half-applied.
    SET XACT_ABORT ON;

    BEGIN TRY
        BEGIN TRANSACTION;

        -- Collect every row to write ('I') or remove ('D') in one pass.
        -- Existing rows that are re-inserted in truncated form carry their original ID
        -- so that the DELETE below removes the untruncated version.
        WITH OverlappingRows AS
        (
            SELECT x.ID, x.Md5, x.EffectiveFrom, x.EffectiveTo, x.UpdateUser, x.JsonData
            FROM dbo.IngestedData AS x
            WHERE x.Md5 = @Md5
              AND x.EffectiveFrom < @EffectiveTo
              AND x.EffectiveTo > @EffectiveFrom
        )
        SELECT
            Data.ID,
            Data.Md5,
            Data.EffectiveFrom,
            Data.EffectiveTo,
            Data.UpdateUser,
            Data.JsonData,
            Data.[Action]
        INTO #Temp
        FROM
        (
            -- The new row itself (no existing ID).
            SELECT
                CAST(NULL AS INT) AS ID,
                @Md5              AS Md5,
                @EffectiveFrom    AS EffectiveFrom,
                @EffectiveTo      AS EffectiveTo,
                @UpdateUser       AS UpdateUser,
                @JsonData         AS JsonData,
                'I'               AS [Action]

            UNION ALL

            -- Left remnant of rows that start before the new range.
            SELECT o.ID, o.Md5, o.EffectiveFrom, DATEADD(DAY, -1, @EffectiveFrom),
                   o.UpdateUser, o.JsonData, 'I'
            FROM OverlappingRows AS o
            WHERE o.EffectiveFrom < @EffectiveFrom

            UNION ALL

            -- Right remnant of rows that end after the new range.
            SELECT o.ID, o.Md5, DATEADD(DAY, 1, @EffectiveTo), o.EffectiveTo,
                   o.UpdateUser, o.JsonData, 'I'
            FROM OverlappingRows AS o
            WHERE o.EffectiveTo > @EffectiveTo

            UNION ALL

            -- Rows completely covered by the new range: delete only.
            SELECT o.ID, o.Md5, o.EffectiveFrom, o.EffectiveTo,
                   o.UpdateUser, o.JsonData, 'D'
            FROM OverlappingRows AS o
            WHERE @EffectiveFrom <= o.EffectiveFrom
              AND @EffectiveTo >= o.EffectiveTo
        ) AS Data;

        -- Remove every existing row that is being replaced or truncated.
        DELETE FROM dbo.IngestedData
        WHERE ID IN (SELECT t.ID FROM #Temp AS t WHERE t.ID IS NOT NULL);

        -- Write the new row plus the truncated remnants.
        INSERT INTO dbo.IngestedData (Md5, EffectiveFrom, EffectiveTo, UpdateUser, JsonData)
        SELECT t.Md5, t.EffectiveFrom, t.EffectiveTo, t.UpdateUser, t.JsonData
        FROM #Temp AS t
        WHERE t.[Action] = 'I';

        -- #Temp is local to this procedure and is dropped automatically when it returns.
        COMMIT TRANSACTION;
    END TRY
    BEGIN CATCH
        -- Only roll back when a transaction is still open; an unconditional ROLLBACK
        -- would raise its own error and hide the original one.
        IF @@TRANCOUNT > 0
            ROLLBACK TRANSACTION;
        THROW;
    END CATCH
END
GO
单个调用性能良好,即使 table 填充了 10,000,000 条记录,调用也需要大约 7 毫秒。问题是进行大量更新。通过游标对 35,000 条记录执行上述存储过程大约需要 5 分钟。
我尝试重写程序以获取一个 table 变量,这将允许 DML 使用集合操作,但在逻辑中迷失了方向。任何人都可以帮助将上述逻辑转换为遵循此模式的基于集合的更新:
新存储过程:
-- Signature sketch of the desired set-based upsert; only the header is shown in this
-- listing (no body follows BEGIN).
--   @UpdateUser  VARCHAR(15) value supplied by the caller (presumably the user to record on written rows)
--   @NewRows     read-only table-valued parameter of type DataIngestionRecord
CREATE PROCEDURE usp_BulkUpsertIngestedData
@UpdateUser VARCHAR(15),
@NewRows DataIngestionRecord READONLY
AS
BEGIN
类型定义
-- Table type used as the table-valued parameter of the bulk upsert procedures:
-- one record per key (Md5) with its effective date range and a JsonData value.
CREATE TYPE DataIngestionRecord AS TABLE
(
    Md5           VARCHAR(15)  NOT NULL,
    EffectiveFrom DATE         NOT NULL,
    EffectiveTo   DATE         NOT NULL,
    JsonData      VARCHAR(MAX)
);
在过程中尝试禁用然后重建索引和删除 CTE 后,我发现在使用逐行更新时,性能实际上根本没有提高。
我换了一种思路:规定在任何一次更新中,每个唯一的 Md5 只能对应一个新的时间范围,从而限制了更新插入的用例。这简化了将存储过程转换为基于集合的操作所需的逻辑(并且符合我们的要求)。
我确实采纳了@Tanner 的建议并从存储过程中删除了链式 CTE。最终的存储过程为:
-- Set-based upsert of effective-dated rows.
-- For every uploaded row the existing rows of the same Md5 that overlap it are reshaped:
--   * rows fully covered by the new range are deleted;
--   * rows starting before the new range are cut to end the day before it starts;
--   * rows starting inside the new range but ending after it are cut to start the day after it ends;
--   * rows spanning the whole new range are split: the left part is kept in place and
--     the right part is inserted as a new row.
-- Parameters:
--   @NewRows     uploaded rows; at most one row per Md5 is supported
--   @UpdateUser  stored in UpdateUser of every uploaded row
-- Errors:
--   50002 when @NewRows holds more than one row for the same Md5 (nothing is written);
--   any failure while writing rolls the whole batch back and is re-thrown.
-- NOTE(review): a truncated remnant that would be exactly one day long gets
--   EffectiveFrom = EffectiveTo, which the table's CK_IngestedData_Start_End check
--   (EffectiveFrom < EffectiveTo) rejects; the batch then fails and is rolled back.
CREATE PROCEDURE dbo.usp_UpsertIngestedDataSet
    @NewRows    DataIngestionRecord READONLY,
    @UpdateUser VARCHAR(15)
AS
BEGIN
    SET NOCOUNT ON;
    -- Any run-time error dooms the transaction, so a failing statement can never be
    -- followed by a COMMIT of the statements around it.
    SET XACT_ABORT ON;

    --Ensure that there are not multiple temporal regions in the update data for a given key
    IF EXISTS (SELECT 1 FROM @NewRows GROUP BY Md5 HAVING COUNT(*) > 1)
    BEGIN
        DECLARE @Err VARCHAR(MAX);

        -- Build the sorted, CR-separated list of offending keys. FOR XML PATH is used
        -- because "SELECT @var = @var + col ... ORDER BY" is not guaranteed to visit every row.
        SELECT @Err = STUFF(
            (
                SELECT CHAR(13) + d.Md5
                FROM (SELECT Md5 FROM @NewRows GROUP BY Md5 HAVING COUNT(*) > 1) AS d
                ORDER BY d.Md5
                FOR XML PATH(''), TYPE
            ).value('.', 'VARCHAR(MAX)'), 1, 1, '');

        SET @Err = 'The following Md5 values have multiple temporal ranges in the uploaded data which is not supported: ' + char(13) + @Err;
        THROW 50002, @Err, 1;
    END

    --Determine all overlapping rows from the existing data set.
    -- The validation above guarantees one new row per Md5, so each existing row matches
    -- at most one new range; that range is carried along as NewFrom/NewTo.
    SELECT
        ex.ID,
        ex.Md5,
        ex.EffectiveFrom,
        ex.EffectiveTo,
        ex.UpdateUser,
        ex.JsonData,
        nr.EffectiveFrom AS NewFrom,
        nr.EffectiveTo   AS NewTo
    INTO #OverlappingRecords
    FROM dbo.IngestedData AS ex
    INNER JOIN @NewRows AS nr
        ON ex.Md5 = nr.Md5
       AND ex.EffectiveFrom < nr.EffectiveTo
       AND ex.EffectiveTo > nr.EffectiveFrom;

    BEGIN TRY
        BEGIN TRANSACTION;

        --Delete all overwritten regions (i.e. existing temporal ranges that are completely replaced by a new range)
        DELETE ex
        FROM dbo.IngestedData AS ex
        INNER JOIN #OverlappingRecords AS ol
            ON ol.ID = ex.ID
        WHERE ol.NewFrom <= ol.EffectiveFrom
          AND ol.NewTo >= ol.EffectiveTo;

        -- Truncate rows that start before the new range (this includes the left part of
        -- rows that span the whole new range).
        UPDATE ex
        SET EffectiveTo = DATEADD(DAY, -1, ol.NewFrom)
        FROM dbo.IngestedData AS ex
        INNER JOIN #OverlappingRecords AS ol
            ON ol.ID = ex.ID
        WHERE ol.EffectiveFrom < ol.NewFrom;

        -- Truncate rows that start on or after the new range's first day and end after it.
        -- ">=" matters: a row starting on the same day as the new range must move out of
        -- the way, otherwise the insert below violates UNIQUE (Md5, EffectiveFrom).
        UPDATE ex
        SET EffectiveFrom = DATEADD(DAY, 1, ol.NewTo)
        FROM dbo.IngestedData AS ex
        INNER JOIN #OverlappingRecords AS ol
            ON ol.ID = ex.ID
        WHERE ol.EffectiveFrom >= ol.NewFrom
          AND ol.EffectiveTo > ol.NewTo;

        --Insert New Data (both from uploaded data and from existing region splits).
        -- Runs after the truncations so the new rows never collide with a row that is
        -- about to be moved. The two branches cannot produce the same row, hence UNION ALL.
        INSERT INTO dbo.IngestedData (Md5, EffectiveFrom, EffectiveTo, UpdateUser, JsonData)
        SELECT nr.Md5, nr.EffectiveFrom, nr.EffectiveTo, @UpdateUser, nr.JsonData
        FROM @NewRows AS nr
        UNION ALL
        -- Right-hand remainder of existing rows that span the whole new range; it keeps
        -- the original row's UpdateUser and JsonData.
        SELECT ol.Md5, DATEADD(DAY, 1, ol.NewTo), ol.EffectiveTo, ol.UpdateUser, ol.JsonData
        FROM #OverlappingRecords AS ol
        WHERE ol.EffectiveFrom < ol.NewFrom
          AND ol.EffectiveTo > ol.NewTo;

        COMMIT TRANSACTION;
    END TRY
    BEGIN CATCH
        -- Undo the partial batch (if a transaction is still open) and surface the original error.
        IF @@TRANCOUNT > 0
            ROLLBACK TRANSACTION;
        THROW;
    END CATCH
END
GO
将此代码转换为基于集合的逻辑带来了显著的差别:此版本现在只需 7370 毫秒即可完成针对 1,000,000 行数据的 20,000 次更新。