加速 SQL Server 中用于获取聚合数据的 CROSS APPLY 查询
Speeding up SQL Server cross apply to get aggregated data
在 SQL Server 中,我正在编写一个查询:对每一行,既要包含该行之前两小时窗口内的聚合数据,也要包含该行之后一小时窗口内的聚合数据。怎样才能让这个查询运行得更快?
这些行的时间戳精确到毫秒,并且间隔不均匀。这张表中有超过 5000 万行,而查询似乎永远无法完成。表上已经建了不少索引,但似乎没什么帮助。我也考虑过使用窗口函数,但不确定在行分布不均匀的情况下能否实现滑动窗口。此外,对于向后(未来)一小时的窗口,我也不确定该如何用 SQL 窗口函数来表达。
Box 是一个字符串,有 10 个唯一值。
Process 是一个字符串,有 30 个唯一值。
平均 duration_ms 是 200 毫秒。
错误占数据的不到 0.1%。
5000 万行描述了一年的数据。
-- Question's original query: for every error-free row, attach aggregates over
-- the rows that precede it (DATEDIFF(minute) <= 120) and the rows that follow
-- it (DATEDIFF(hour) <= 1).
-- NOTE: start_time of the inner tables is wrapped in DATEDIFF, so these
-- predicates cannot be answered with an index seek on start_time.
SELECT
    cur.start_time,
    cur.end_time,
    cur.box,
    cur.process,
    DATEDIFF(ms, cur.start_time, cur.end_time) AS duration_ms,
    DATEPART(dw, cur.start_time)               AS day_of_week,
    DATEPART(hour, cur.start_time)             AS hour_of_day,
    prior_stats.avg_ms,
    prior_stats.num_process_total,
    prior_stats.num_process_unique,
    prior_stats.num_box_unique,
    next_stats.percent_over_thresh
FROM metrics_table AS cur
CROSS APPLY (
    -- Aggregates over error-free rows that started before the current row.
    SELECT
        AVG(CAST(DATEDIFF(ms, prev.start_time, prev.end_time) AS numeric)) AS avg_ms,
        COUNT(1)                     AS num_process_total,
        COUNT(DISTINCT prev.process) AS num_process_unique,
        COUNT(DISTINCT prev.box)     AS num_box_unique
    FROM metrics_table AS prev
    WHERE prev.error_code = 0
      AND prev.start_time < cur.start_time
      AND DATEDIFF(minute, prev.start_time, cur.start_time) <= 120
) AS prior_stats
CROSS APPLY (
    -- Fraction of error-free rows after the current row that ran longer than 1000 ms.
    SELECT
        AVG(CASE
                WHEN DATEDIFF(ms, nxt.start_time, nxt.end_time) > 1000 THEN 1.0
                ELSE 0.0
            END) AS percent_over_thresh
    FROM metrics_table AS nxt
    WHERE nxt.error_code = 0
      AND nxt.start_time > cur.start_time
      AND DATEDIFF(hour, cur.start_time, nxt.start_time) <= 1
) AS next_stats
WHERE cur.error_code = 0
编辑
版本:SQLAzure 12.0
添加执行计划:
以下应该是朝着正确方向迈出的一步...
注意:c2.start_time 和 c4.start_time 不再被包裹在 DATEDIFF 函数中,因此这些谓词是 SARGable 的(可以利用索引查找)...
-- SARGable rewrite of the windowed aggregation.
-- For each error-free row c1 this returns:
--   * c3: aggregates over error-free rows that started in the two hours
--         before c1 (lower bound inclusive, c1 itself excluded);
--   * c5: share of error-free rows that started in the hour after c1
--         (c1 itself excluded, upper bound inclusive) and ran > 1000 ms.
-- The inner start_time columns are compared directly against a DATEADD()
-- expression on c1.start_time, so an index on start_time can be used for a
-- range seek. Both bounds of each window are required: with only one bound
-- the subquery would aggregate everything outside the window instead.
-- NOTE: DATEDIFF counts datepart boundaries crossed, so the original
-- DATEDIFF(...) <= N filters were slightly wider than an exact elapsed-time
-- window; the bounds below use exact elapsed time.
SELECT
    c1.start_time,
    c1.end_time,
    c1.box,
    c1.process,
    DATEDIFF(ms, c1.start_time, c1.end_time) AS duration_ms,
    DATEPART(dw, c1.start_time) AS day_of_week,
    DATEPART(HOUR, c1.start_time) AS hour_of_day,
    c3.avg_ms,
    c3.num_process_total,
    c3.num_process_unique,
    c3.num_box_unique,
    c5.percent_over_thresh
FROM
    dbo.metrics_table c1
    CROSS APPLY (
        SELECT
            AVG(CAST(DATEDIFF(ms, c2.start_time, c2.end_time) AS NUMERIC)) AS avg_ms,
            COUNT(1) AS num_process_total,
            COUNT(DISTINCT c2.process) AS num_process_unique,
            COUNT(DISTINCT c2.box) AS num_box_unique
        FROM
            dbo.metrics_table c2
        WHERE
            -- two-hour look-back window: [c1.start_time - 120 min, c1.start_time)
            c2.start_time >= DATEADD(MINUTE, -120, c1.start_time)
            AND c2.start_time < c1.start_time
            AND c2.error_code = 0
    ) c3
    CROSS APPLY (
        SELECT
            AVG(CASE WHEN DATEDIFF(ms, c4.start_time, c4.end_time) > 1000 THEN 1.0 ELSE 0.0 END
            ) AS percent_over_thresh
        FROM
            dbo.metrics_table c4
        WHERE
            -- one-hour look-ahead window: (c1.start_time, c1.start_time + 1 h]
            c4.start_time > c1.start_time
            AND c4.start_time <= DATEADD(HOUR, 1, c1.start_time)
            AND c4.error_code = 0
    ) c5
WHERE
    c1.error_code = 0;
当然,如果没有合适的索引可用,把查询改写成 SARGable 也不会带来任何好处。下面这个索引应该适用于对 metrics_table 的全部 3 处引用...(请先查看当前已有的索引,您可能并不需要创建新索引)
-- Filtered, covering index for the error-free rows only (error_code = 0).
-- Key: (error_code, start_time); end_time, box and process are carried as
-- included columns so queries reading those columns need no lookup.
CREATE NONCLUSTERED INDEX ixf_metricstable_errorcode_starttime
    ON dbo.metrics_table (error_code, start_time)
    INCLUDE (end_time, box, process)
    WHERE error_code = 0;
我改用 BETWEEN,并在我的简单测试环境中获得了不错的性能。我还使用了列存储(columnstore)索引,因为 5000 万条记录已经属于数据仓库级别的数据量:
-- Test-bed version of the metrics table: one row per process execution.
CREATE TABLE dbo.metrics_table
(
    rowId       INT IDENTITY(1, 1),     -- surrogate row number
    start_time  DATETIME    NOT NULL,
    end_time    DATETIME    NOT NULL,
    box         VARCHAR(10) NOT NULL,
    process     VARCHAR(10) NOT NULL,
    error_code  INT         NOT NULL    -- the queries treat 0 as "no error"
);
-- Add records: 3334 base rows x 10 boxes x 30 processes = 1,000,200 rows.
-- rn drives the timestamps: the day offset cycles through rn % 365 and the
-- duration is rn % 409 ms. error_code is 1 only where rn is a multiple of
-- 3000, and 0 everywhere else.
;WITH cte AS (
    SELECT TOP 3334 ROW_NUMBER() OVER ( ORDER BY ( SELECT 1 ) ) rn
    FROM sys.columns c1
        CROSS JOIN sys.columns c2
        CROSS JOIN sys.columns c3
)
INSERT INTO dbo.metrics_table ( start_time, end_time, box, process, error_code )
SELECT
    DATEADD( ms, c.rn, DATEADD( day, c.rn % 365, '1 Jan 2017' ) ) AS start_time,
    DATEADD( ms, c.rn % 409, DATEADD( ms, c.rn, DATEADD( day, c.rn % 365, '1 Jan 2017' ) ) ) AS end_time,
    'box' + CAST( boxes.box AS VARCHAR(10) ) box,
    -- Use the processes derived table here (not boxes) so that 30 distinct
    -- process names are generated instead of 30 copies of each box row.
    'process' + CAST( processes.process AS VARCHAR(10) ) process,
    ABS( CAST( c.rn % 3000 AS BIT ) -1 ) error_code
FROM cte c
    -- ORDER BY makes TOP deterministic: boxes 1..10, processes 1..30.
    CROSS JOIN ( SELECT TOP 10 rn FROM cte ORDER BY rn ) AS boxes(box)
    CROSS JOIN ( SELECT TOP 30 rn FROM cte ORDER BY rn ) AS processes(process);
-- Step 1: build an ordinary (rowstore) clustered index so the data is ordered
-- by time first.
-- Alternative key order that was considered: ( box, process, start_time, end_time ).
CREATE CLUSTERED INDEX cci_metrics_table
    ON dbo.metrics_table ( start_time, end_time, box, process );

-- Step 2: convert to columnstore. DROP_EXISTING = ON replaces the index of the
-- same name created in step 1; the build is limited to one thread (MAXDOP = 1).
CREATE CLUSTERED COLUMNSTORE INDEX cci_metrics_table
    ON dbo.metrics_table
    WITH ( MAXDOP = 1, DROP_EXISTING = ON );
-- Start from a clean slate if the temp table is left over from an earlier run.
IF OBJECT_ID( 'tempdb..#tmp1' ) IS NOT NULL
    DROP TABLE #tmp1;

-- For every error-free row: aggregates over the two hours before it and the
-- one hour after it, materialised into #tmp1.
SELECT
    cur.start_time,
    cur.end_time,
    cur.box,
    cur.process,
    DATEDIFF( ms, cur.start_time, cur.end_time ) AS duration_ms,
    DATEPART( dw, cur.start_time )               AS day_of_week,
    DATEPART( hour, cur.start_time )             AS hour_of_day,
    before_win.xavg,
    before_win.num_process_total,
    before_win.num_process_unique,
    before_win.num_box_unique,
    after_win.percent_over_thresh
INTO #tmp1
FROM dbo.metrics_table AS cur
    CROSS APPLY
    (
        -- look-back window: [cur.start_time - 120 min, cur.start_time)
        SELECT
            COUNT(1)                                                               AS num_process_total,
            AVG( CAST( DATEDIFF( ms, prev.start_time, prev.end_time ) AS NUMERIC ) ) AS xavg,
            COUNT( DISTINCT prev.process )                                         AS num_process_unique,
            COUNT( DISTINCT prev.box )                                             AS num_box_unique
        FROM dbo.metrics_table AS prev
        WHERE prev.error_code = 0
          AND prev.start_time >= DATEADD( minute, -120, cur.start_time )
          AND prev.start_time <  cur.start_time
    ) AS before_win
    CROSS APPLY
    (
        -- look-ahead window: (cur.start_time, cur.start_time + 60 min]
        SELECT
            AVG( CASE
                     WHEN DATEDIFF( ms, nxt.start_time, nxt.end_time ) > 1000 THEN 1.0
                     ELSE 0.0
                 END ) AS percent_over_thresh
        FROM dbo.metrics_table AS nxt
        WHERE nxt.error_code = 0
          AND nxt.start_time >  cur.start_time
          AND nxt.start_time <= DATEADD( minute, 60, cur.start_time )
    ) AS after_win
WHERE cur.error_code = 0
在 SQL Server 中,我正在编写一个查询:对每一行,既要包含该行之前两小时窗口内的聚合数据,也要包含该行之后一小时窗口内的聚合数据。怎样才能让这个查询运行得更快?
这些行的时间戳精确到毫秒,并且间隔不均匀。这张表中有超过 5000 万行,而查询似乎永远无法完成。表上已经建了不少索引,但似乎没什么帮助。我也考虑过使用窗口函数,但不确定在行分布不均匀的情况下能否实现滑动窗口。此外,对于向后(未来)一小时的窗口,我也不确定该如何用 SQL 窗口函数来表达。
Box 是一个字符串,有 10 个唯一值。 Process 是一个字符串,有 30 个唯一值。 平均 duration_ms 是 200 毫秒。 错误占数据的不到 0.1%。 5000 万行描述了一年的数据。
-- Question's original query: for every error-free row, attach aggregates over
-- the rows that precede it (DATEDIFF(minute) <= 120) and the rows that follow
-- it (DATEDIFF(hour) <= 1).
-- NOTE: start_time of the inner tables is wrapped in DATEDIFF, so these
-- predicates cannot be answered with an index seek on start_time.
SELECT
    cur.start_time,
    cur.end_time,
    cur.box,
    cur.process,
    DATEDIFF(ms, cur.start_time, cur.end_time) AS duration_ms,
    DATEPART(dw, cur.start_time)               AS day_of_week,
    DATEPART(hour, cur.start_time)             AS hour_of_day,
    prior_stats.avg_ms,
    prior_stats.num_process_total,
    prior_stats.num_process_unique,
    prior_stats.num_box_unique,
    next_stats.percent_over_thresh
FROM metrics_table AS cur
CROSS APPLY (
    -- Aggregates over error-free rows that started before the current row.
    SELECT
        AVG(CAST(DATEDIFF(ms, prev.start_time, prev.end_time) AS numeric)) AS avg_ms,
        COUNT(1)                     AS num_process_total,
        COUNT(DISTINCT prev.process) AS num_process_unique,
        COUNT(DISTINCT prev.box)     AS num_box_unique
    FROM metrics_table AS prev
    WHERE prev.error_code = 0
      AND prev.start_time < cur.start_time
      AND DATEDIFF(minute, prev.start_time, cur.start_time) <= 120
) AS prior_stats
CROSS APPLY (
    -- Fraction of error-free rows after the current row that ran longer than 1000 ms.
    SELECT
        AVG(CASE
                WHEN DATEDIFF(ms, nxt.start_time, nxt.end_time) > 1000 THEN 1.0
                ELSE 0.0
            END) AS percent_over_thresh
    FROM metrics_table AS nxt
    WHERE nxt.error_code = 0
      AND nxt.start_time > cur.start_time
      AND DATEDIFF(hour, cur.start_time, nxt.start_time) <= 1
) AS next_stats
WHERE cur.error_code = 0
编辑
版本:SQLAzure 12.0
添加执行计划:
以下应该是朝着正确方向迈出的一步... 注意:c2.start_time 和 c4.start_time 不再被包裹在 DATEDIFF 函数中,因此这些谓词是 SARGable 的(可以利用索引查找)...
-- SARGable rewrite of the windowed aggregation.
-- For each error-free row c1 this returns:
--   * c3: aggregates over error-free rows that started in the two hours
--         before c1 (lower bound inclusive, c1 itself excluded);
--   * c5: share of error-free rows that started in the hour after c1
--         (c1 itself excluded, upper bound inclusive) and ran > 1000 ms.
-- The inner start_time columns are compared directly against a DATEADD()
-- expression on c1.start_time, so an index on start_time can be used for a
-- range seek. Both bounds of each window are required: with only one bound
-- the subquery would aggregate everything outside the window instead.
-- NOTE: DATEDIFF counts datepart boundaries crossed, so the original
-- DATEDIFF(...) <= N filters were slightly wider than an exact elapsed-time
-- window; the bounds below use exact elapsed time.
SELECT
    c1.start_time,
    c1.end_time,
    c1.box,
    c1.process,
    DATEDIFF(ms, c1.start_time, c1.end_time) AS duration_ms,
    DATEPART(dw, c1.start_time) AS day_of_week,
    DATEPART(HOUR, c1.start_time) AS hour_of_day,
    c3.avg_ms,
    c3.num_process_total,
    c3.num_process_unique,
    c3.num_box_unique,
    c5.percent_over_thresh
FROM
    dbo.metrics_table c1
    CROSS APPLY (
        SELECT
            AVG(CAST(DATEDIFF(ms, c2.start_time, c2.end_time) AS NUMERIC)) AS avg_ms,
            COUNT(1) AS num_process_total,
            COUNT(DISTINCT c2.process) AS num_process_unique,
            COUNT(DISTINCT c2.box) AS num_box_unique
        FROM
            dbo.metrics_table c2
        WHERE
            -- two-hour look-back window: [c1.start_time - 120 min, c1.start_time)
            c2.start_time >= DATEADD(MINUTE, -120, c1.start_time)
            AND c2.start_time < c1.start_time
            AND c2.error_code = 0
    ) c3
    CROSS APPLY (
        SELECT
            AVG(CASE WHEN DATEDIFF(ms, c4.start_time, c4.end_time) > 1000 THEN 1.0 ELSE 0.0 END
            ) AS percent_over_thresh
        FROM
            dbo.metrics_table c4
        WHERE
            -- one-hour look-ahead window: (c1.start_time, c1.start_time + 1 h]
            c4.start_time > c1.start_time
            AND c4.start_time <= DATEADD(HOUR, 1, c1.start_time)
            AND c4.error_code = 0
    ) c5
WHERE
    c1.error_code = 0;
当然,如果没有合适的索引可用,把查询改写成 SARGable 也不会带来任何好处。下面这个索引应该适用于对 metrics_table 的全部 3 处引用...(请先查看当前已有的索引,您可能并不需要创建新索引)
-- Filtered, covering index for the error-free rows only (error_code = 0).
-- Key: (error_code, start_time); end_time, box and process are carried as
-- included columns so queries reading those columns need no lookup.
CREATE NONCLUSTERED INDEX ixf_metricstable_errorcode_starttime
    ON dbo.metrics_table (error_code, start_time)
    INCLUDE (end_time, box, process)
    WHERE error_code = 0;
我改用 BETWEEN,并在我的简单测试环境中获得了不错的性能。我还使用了列存储(columnstore)索引,因为 5000 万条记录已经属于数据仓库级别的数据量:
-- Test-bed version of the metrics table: one row per process execution.
CREATE TABLE dbo.metrics_table
(
    rowId       INT IDENTITY(1, 1),     -- surrogate row number
    start_time  DATETIME    NOT NULL,
    end_time    DATETIME    NOT NULL,
    box         VARCHAR(10) NOT NULL,
    process     VARCHAR(10) NOT NULL,
    error_code  INT         NOT NULL    -- the queries treat 0 as "no error"
);
-- Add records: 3334 base rows x 10 boxes x 30 processes = 1,000,200 rows.
-- rn drives the timestamps: the day offset cycles through rn % 365 and the
-- duration is rn % 409 ms. error_code is 1 only where rn is a multiple of
-- 3000, and 0 everywhere else.
;WITH cte AS (
    SELECT TOP 3334 ROW_NUMBER() OVER ( ORDER BY ( SELECT 1 ) ) rn
    FROM sys.columns c1
        CROSS JOIN sys.columns c2
        CROSS JOIN sys.columns c3
)
INSERT INTO dbo.metrics_table ( start_time, end_time, box, process, error_code )
SELECT
    DATEADD( ms, c.rn, DATEADD( day, c.rn % 365, '1 Jan 2017' ) ) AS start_time,
    DATEADD( ms, c.rn % 409, DATEADD( ms, c.rn, DATEADD( day, c.rn % 365, '1 Jan 2017' ) ) ) AS end_time,
    'box' + CAST( boxes.box AS VARCHAR(10) ) box,
    -- Use the processes derived table here (not boxes) so that 30 distinct
    -- process names are generated instead of 30 copies of each box row.
    'process' + CAST( processes.process AS VARCHAR(10) ) process,
    ABS( CAST( c.rn % 3000 AS BIT ) -1 ) error_code
FROM cte c
    -- ORDER BY makes TOP deterministic: boxes 1..10, processes 1..30.
    CROSS JOIN ( SELECT TOP 10 rn FROM cte ORDER BY rn ) AS boxes(box)
    CROSS JOIN ( SELECT TOP 30 rn FROM cte ORDER BY rn ) AS processes(process);
-- Step 1: build an ordinary (rowstore) clustered index so the data is ordered
-- by time first.
-- Alternative key order that was considered: ( box, process, start_time, end_time ).
CREATE CLUSTERED INDEX cci_metrics_table
    ON dbo.metrics_table ( start_time, end_time, box, process );

-- Step 2: convert to columnstore. DROP_EXISTING = ON replaces the index of the
-- same name created in step 1; the build is limited to one thread (MAXDOP = 1).
CREATE CLUSTERED COLUMNSTORE INDEX cci_metrics_table
    ON dbo.metrics_table
    WITH ( MAXDOP = 1, DROP_EXISTING = ON );
-- Start from a clean slate if the temp table is left over from an earlier run.
IF OBJECT_ID( 'tempdb..#tmp1' ) IS NOT NULL
    DROP TABLE #tmp1;

-- For every error-free row: aggregates over the two hours before it and the
-- one hour after it, materialised into #tmp1.
SELECT
    cur.start_time,
    cur.end_time,
    cur.box,
    cur.process,
    DATEDIFF( ms, cur.start_time, cur.end_time ) AS duration_ms,
    DATEPART( dw, cur.start_time )               AS day_of_week,
    DATEPART( hour, cur.start_time )             AS hour_of_day,
    before_win.xavg,
    before_win.num_process_total,
    before_win.num_process_unique,
    before_win.num_box_unique,
    after_win.percent_over_thresh
INTO #tmp1
FROM dbo.metrics_table AS cur
    CROSS APPLY
    (
        -- look-back window: [cur.start_time - 120 min, cur.start_time)
        SELECT
            COUNT(1)                                                               AS num_process_total,
            AVG( CAST( DATEDIFF( ms, prev.start_time, prev.end_time ) AS NUMERIC ) ) AS xavg,
            COUNT( DISTINCT prev.process )                                         AS num_process_unique,
            COUNT( DISTINCT prev.box )                                             AS num_box_unique
        FROM dbo.metrics_table AS prev
        WHERE prev.error_code = 0
          AND prev.start_time >= DATEADD( minute, -120, cur.start_time )
          AND prev.start_time <  cur.start_time
    ) AS before_win
    CROSS APPLY
    (
        -- look-ahead window: (cur.start_time, cur.start_time + 60 min]
        SELECT
            AVG( CASE
                     WHEN DATEDIFF( ms, nxt.start_time, nxt.end_time ) > 1000 THEN 1.0
                     ELSE 0.0
                 END ) AS percent_over_thresh
        FROM dbo.metrics_table AS nxt
        WHERE nxt.error_code = 0
          AND nxt.start_time >  cur.start_time
          AND nxt.start_time <= DATEADD( minute, 60, cur.start_time )
    ) AS after_win
WHERE cur.error_code = 0