T-SQL:统计失败次数,直到第一次成功
T-SQL: Count number of failures until first success
我有一个数据库,其中包含带有时间戳的事件:
row eventName taskName timestamp userName
1 fail ABC 10.5 John
2 fail ABC 18.0 John
3 fail ABC 19.0 Mike
4 fail XYZ 21.0 John
5 fail XYZ 23.0 Mike
6 success ABC 25.0 John
7 fail ABC 26.0 John
8 success ABC 28.0 John
我想计算每个用户在第一次成功之前的失败次数(和平均值,但这超出了这个问题)。
在上面的示例中,John 尝试了任务 ABC 2 次(第 1 行和第 2 行)直到成功(第 6 行)。随后的失败和成功可以忽略。
我想我可以通过计算 "ABC" 且 "fail" 的行数来实现这一点,这些行的时间戳必须早于 "ABC" 且 "success" 的所有行中最早的时间戳,并按用户名分组。我如何在 T-SQL 中表达它?具体来说,是在 Vertica 中。
这似乎与这里的情况非常相似:
但是当我尝试如下调整那个问题中的代码时,我想我弄错了,因为我得到的计数总是比预期的要大。
-- Asker's attempt, kept verbatim: count the 'fail' events that precede the
-- first 'success' of each (userName, taskName) pair.
-- Successes: earliest 'success' timestamp per (userName, taskName).
WITH
Successes
AS
(
SELECT
events.userName
,events.taskName
,MIN(events.timestamp) AS FirstSuccessTime
FROM events
WHERE events.eventName = 'success'
GROUP BY events.userName, events.taskName
)
-- The output columns come from the joined events row, so a Successes row
-- with no matching failure yields NULL userName/taskName and a count of 0.
SELECT
events.userName
,events.taskName
,COUNT(events.eventName) AS FailuresUntilFirstSuccess
FROM
Successes
LEFT JOIN events
-- NOTE(review): the join matches on taskName only. There is no
-- events.userName = Successes.userName condition, so every user's earlier
-- failures on the task are paired with each user's first success. This is
-- why the counts come out larger than expected.
ON events.taskName = Successes.taskName
AND events.timestamp < Successes.FirstSuccessTime
AND events.eventName = 'fail'
GROUP BY events.userName, events.taskName
;
在 SQL Server 或 Vertica 中,可以使用窗口函数统计每一行之前(含当前行)的成功次数,然后将其用于聚合:
-- Per user, count the 'fail' events that occur before that user's first 'success'.
-- success_cnt is a running total of 'success' events per user in [row] order, so it
-- stays 0 until the first success is reached.
-- The literals match the values stored in the data ('fail' / 'success'); the earlier
-- 'failure' / 'Success' spellings never matched, so the count was always 0.
select username,
       sum(case when success_cnt = 0 and eventName = 'fail' then 1 else 0 end) as numfailures_to_first_success
from (select e.*,
             sum(case when e.eventName = 'success' then 1 else 0 end) over (partition by e.username order by e.row) as success_cnt
      from events e
     ) e
group by username;
注意:如果您希望针对每个任务执行此操作,请将其包含在 partition by
和 group by
中。
Updated and Corrected - Missed userName
可能带有交叉应用和 WITH TIES 子句。
第一个子查询(TOP 1 WITH TIES)只返回按任务名称和用户名分区后的第一条 "success" 记录。在本例中,只会返回第 6 行。
例子
-- For each (taskName, userName) pair, take its first 'success' row and count
-- the 'fail' rows of the same pair that have a smaller [row].
SELECT firstSuccess.*
     , priorFails.*
FROM (
    -- ROW_NUMBER() is 1 for the first success of every partition, so
    -- TOP 1 WITH TIES keeps exactly one row per (taskName, userName).
    SELECT TOP 1 WITH TIES *
    FROM YourTable
    WHERE [eventName] = 'success'
    ORDER BY ROW_NUMBER() OVER (PARTITION BY taskName, userName ORDER BY [row])
) AS firstSuccess
CROSS APPLY (
    SELECT COUNT(*) AS Cnt
    FROM YourTable AS f
    WHERE f.taskName = firstSuccess.taskName
      AND f.userName = firstSuccess.userName
      AND f.[row] < firstSuccess.[row]
      AND f.eventName = 'fail'
) AS priorFails
Returns
使用 CTEs
的另一种解决方案。这个想法是先接收带有 success
的最小行。之后我们可以使用简单的条件过滤掉其他行,比如 row < min
per user:
-- Sample data: one row per event; [row] gives the order in which events happened.
DECLARE @events TABLE([row] INT, eventName VARCHAR(50), taskName VARCHAR(50), userName VARCHAR(50));

INSERT INTO @events ([row], eventName, taskName, userName)
VALUES (1,'fail','ABC','John')
      ,(2,'fail','ABC','John')
      ,(3,'fail','ABC','Mike')
      ,(4,'fail','XYZ','John')
      ,(5,'fail','XYZ','Mike')
      ,(6,'success','ABC','John')
      ,(7,'fail','ABC','John')
      ,(8,'success','ABC','John')
      ,(9,'success','ABC','Mike');

SELECT * FROM @events;

-- cte: the lowest [row] holding a 'success' for each (userName, taskName).
WITH
cte
AS (
    SELECT userName
          ,taskName
          ,MIN([row]) AS [min]
    FROM @events
    WHERE eventName = 'success'
    GROUP BY userName, taskName)
-- Count the failures that precede the first success of the SAME user and task.
SELECT e.userName
      ,e.taskName
      ,COUNT(1) AS attempts
FROM @events e
JOIN cte c
  ON  e.userName = c.userName
  -- Without the taskName match, a success on one task also caps (and counts)
  -- the rows of the user's other tasks.
  AND e.taskName = c.taskName
WHERE e.[row] < c.[min]
  -- Only failed attempts are counted, even if other event types are present.
  AND e.eventName = 'fail'
GROUP BY e.userName, e.taskName;
解决方案
基于架构,此查询将为您提供所需的信息:
-- For every (event_name, task_name, username) partition of successes, return
-- the earliest success together with cnt, the number of same-user, same-task
-- 'fail' events whose ts is earlier than that success.
WITH FailedEvents AS
(
    -- all failed attempts
    SELECT *
    FROM Event
    WHERE event_name = 'fail'
),
SuccessWithCount AS
(
    -- each success event plus the count of matching failures before it
    SELECT *
    FROM Event AS S
    OUTER APPLY
    (
        SELECT COUNT(*) AS cnt
        FROM FailedEvents AS FE
        WHERE FE.ts < S.ts
          AND FE.username = S.username
          AND FE.task_name = S.task_name
    ) AS PriorFailures
    WHERE S.event_name = 'success'
)
SELECT *
FROM
(
    -- o = 1 marks the earliest success inside each partition
    SELECT SWC.*,
           ROW_NUMBER() OVER (PARTITION BY event_name, task_name, username ORDER BY ts) AS o
    FROM SuccessWithCount AS SWC
) AS Ranked
WHERE Ranked.o = 1
使用您的数据进行测试:
id event_name task_name timestamp username cnt
-- ---------- ---------- ---------- --------- ---
6 success ABC 25 John 2
但是,我更进一步,为迈克添加了另一个 'success' 行
-- Extra test row: a 'success' for Mike on task XYZ at 29.0
-- (values presumably map to event_name, task_name, ts, username; confirm against the Event schema).
insert Event select 'success', 'XYZ', 29.0, 'Mike';
并得到
id event_name task_name timestamp username cnt
-- ---------- ---------- ---------- --------- ---
6 success ABC 25 John 2
9 success XYZ 29 Mike 1
符合预期。
说明
第一个 CTE 产生了失败事件的集合。第二个 CTE 并不是递归的:它以成功事件的集合为基础,通过 OUTER APPLY 为每一条成功记录计算在它之前(同一用户、同一任务名称)的失败集合的计数(基数)。
最后,我们使用 row_number
对 event_name
、task_name
和 username
进行分区,因此给定分区的第一次成功将被标记为'1'。然后我们过滤掉所有 row_number
不等于 '1' 的行。
也有可能的解决方案:
-- Return every row that comes before the first 'success' of its own
-- (userName, taskName) pair, plus the number of such rows in that pair.
-- Pairs with no 'success' are dropped: the subquery yields NULL and the
-- comparison is not true.
SELECT
    *,
    COUNT(*) OVER (PARTITION BY ev.userName, ev.taskName)
FROM tbl AS ev
WHERE ev.row < (
          SELECT MIN(firstOk.row)
          FROM tbl AS firstOk
          WHERE firstOk.eventName = 'success'
            AND firstOk.taskName = ev.taskName
            AND firstOk.userName = ev.userName
      );
与 Artem 类似的解决方案,但略有不同。子查询将为成功的每个用户和任务找到最低的行值,并将基于该值过滤行。
这里可能有更简单的方法,但我稍后会尝试查看更多内容。
测试数据设置
-- Recreate the #taskevents temp table and load the sample events.
IF OBJECT_ID(N'tempdb..#taskevents', N'U') IS NOT NULL
BEGIN
    DROP TABLE #taskevents;
END
GO

CREATE TABLE #taskevents (
    eventName VARCHAR(10),
    taskName  VARCHAR(10),
    ts        DECIMAL(3,1),
    userName  VARCHAR(10)
);

-- ts values are supplied as strings and rely on implicit conversion to DECIMAL(3,1).
INSERT INTO #taskevents (eventName, taskName, ts, userName)
VALUES
    ('fail',    'ABC', '10.5', 'John'),
    ('fail',    'ABC', '10.6', 'John'),
    ('fail',    'ABC', '18.0', 'John'),
    ('fail',    'ABC', '22.0', 'John'),
    ('fail',    'ABC', '22.5', 'John'),
    ('success', 'ABC', '25.0', 'John'),
    ('fail',    'ABC', '26.0', 'John'),
    ('success', 'ABC', '28.0', 'John'),
    ('fail',    'XYZ', '10.7', 'John'),
    ('fail',    'XYZ', '21.0', 'John'),
    ('fail',    'ABC', '19.0', 'Mike'),
    ('fail',    'XYZ', '23.0', 'Mike'),
    ('success', 'XYZ', '28.5', 'Mike'),
    ('success', 'QVC', '42.0', 'Mike');
查询时间
-- Average number of failures that precede each success, per (userName, taskName).
-- For every success, failures are counted from the previous success of the same
-- user and task (inclusive) up to, but not including, the success itself.
SELECT s3.userName, s3.taskName,
       -- COUNT(*) is an INT and AVG over INT truncates to an integer in SQL Server,
       -- so cast first to keep the fractional part of the average.
       AVG(CAST(s3.failCount AS DECIMAL(10,2))) AS avgFailCount
FROM (
    SELECT s1.userName, s1.taskName, s1.ts, s1.PreviousTS, s2.failCount
    FROM (
        -- Every success with the timestamp of the previous success of the same
        -- user and task (NULL for the first success).
        SELECT t1.userName, t1.taskName, t1.ts,
               LAG(t1.ts) OVER (PARTITION BY t1.userName, t1.taskName ORDER BY t1.ts) AS PreviousTS
        FROM #taskevents t1
        WHERE t1.eventName = 'success'
    ) s1
    OUTER APPLY (
        -- An aggregate without GROUP BY always returns exactly one row, so
        -- failCount is 0 (never NULL) when no failure falls in the window.
        SELECT COUNT(*) AS failCount
        FROM #taskevents t2
        WHERE t2.eventName = 'fail'
          AND t2.userName = s1.userName
          AND t2.taskName = s1.taskName
          AND t2.ts < s1.ts
          AND ( t2.ts >= s1.PreviousTS OR s1.PreviousTS IS NULL )
    ) s2
) s3
GROUP BY s3.userName, s3.taskName;
这就是每个用户、每个任务在每次成功之前的平均失败次数。
我有一个数据库,其中包含带有时间戳的事件:
row eventName taskName timestamp userName
1 fail ABC 10.5 John
2 fail ABC 18.0 John
3 fail ABC 19.0 Mike
4 fail XYZ 21.0 John
5 fail XYZ 23.0 Mike
6 success ABC 25.0 John
7 fail ABC 26.0 John
8 success ABC 28.0 John
我想计算每个用户在第一次成功之前的失败次数(和平均值,但这超出了这个问题)。
在上面的示例中,John 尝试了任务 ABC 2 次(第 1 行和第 2 行)直到成功(第 6 行)。随后的失败和成功可以忽略。
我想我可以通过计算 "ABC" 且 "fail" 的行数来实现这一点,这些行的时间戳必须早于 "ABC" 且 "success" 的所有行中最早的时间戳,并按用户名分组。我如何在 T-SQL 中表达它?具体来说,是在 Vertica 中。
这似乎与这里的情况非常相似:
但是当我尝试如下调整那个问题中的代码时,我想我弄错了,因为我得到的计数总是比预期的要大。
-- Asker's attempt, kept verbatim: count the 'fail' events that precede the
-- first 'success' of each (userName, taskName) pair.
-- Successes: earliest 'success' timestamp per (userName, taskName).
WITH
Successes
AS
(
SELECT
events.userName
,events.taskName
,MIN(events.timestamp) AS FirstSuccessTime
FROM events
WHERE events.eventName = 'success'
GROUP BY events.userName, events.taskName
)
-- The output columns come from the joined events row, so a Successes row
-- with no matching failure yields NULL userName/taskName and a count of 0.
SELECT
events.userName
,events.taskName
,COUNT(events.eventName) AS FailuresUntilFirstSuccess
FROM
Successes
LEFT JOIN events
-- NOTE(review): the join matches on taskName only. There is no
-- events.userName = Successes.userName condition, so every user's earlier
-- failures on the task are paired with each user's first success. This is
-- why the counts come out larger than expected.
ON events.taskName = Successes.taskName
AND events.timestamp < Successes.FirstSuccessTime
AND events.eventName = 'fail'
GROUP BY events.userName, events.taskName
;
在 SQL Server 或 Vertica 中,可以使用窗口函数统计每一行之前(含当前行)的成功次数,然后将其用于聚合:
-- Per user, count the 'fail' events that occur before that user's first 'success'.
-- success_cnt is a running total of 'success' events per user in [row] order, so it
-- stays 0 until the first success is reached.
-- The literals match the values stored in the data ('fail' / 'success'); the earlier
-- 'failure' / 'Success' spellings never matched, so the count was always 0.
select username,
       sum(case when success_cnt = 0 and eventName = 'fail' then 1 else 0 end) as numfailures_to_first_success
from (select e.*,
             sum(case when e.eventName = 'success' then 1 else 0 end) over (partition by e.username order by e.row) as success_cnt
      from events e
     ) e
group by username;
注意:如果您希望针对每个任务执行此操作,请将其包含在 partition by
和 group by
中。
Updated and Corrected - Missed userName
可能带有交叉应用和 WITH TIES 子句。
第一个子查询(TOP 1 WITH TIES)只返回按任务名称和用户名分区后的第一条 "success" 记录。在本例中,只会返回第 6 行。
例子
-- For each (taskName, userName) pair, take its first 'success' row and count
-- the 'fail' rows of the same pair that have a smaller [row].
SELECT firstSuccess.*
     , priorFails.*
FROM (
    -- ROW_NUMBER() is 1 for the first success of every partition, so
    -- TOP 1 WITH TIES keeps exactly one row per (taskName, userName).
    SELECT TOP 1 WITH TIES *
    FROM YourTable
    WHERE [eventName] = 'success'
    ORDER BY ROW_NUMBER() OVER (PARTITION BY taskName, userName ORDER BY [row])
) AS firstSuccess
CROSS APPLY (
    SELECT COUNT(*) AS Cnt
    FROM YourTable AS f
    WHERE f.taskName = firstSuccess.taskName
      AND f.userName = firstSuccess.userName
      AND f.[row] < firstSuccess.[row]
      AND f.eventName = 'fail'
) AS priorFails
Returns
使用 CTEs
的另一种解决方案。这个想法是先接收带有 success
的最小行。之后我们可以使用简单的条件过滤掉其他行,比如 row < min
per user:
-- Sample data: one row per event; [row] gives the order in which events happened.
DECLARE @events TABLE([row] INT, eventName VARCHAR(50), taskName VARCHAR(50), userName VARCHAR(50));

INSERT INTO @events ([row], eventName, taskName, userName)
VALUES (1,'fail','ABC','John')
      ,(2,'fail','ABC','John')
      ,(3,'fail','ABC','Mike')
      ,(4,'fail','XYZ','John')
      ,(5,'fail','XYZ','Mike')
      ,(6,'success','ABC','John')
      ,(7,'fail','ABC','John')
      ,(8,'success','ABC','John')
      ,(9,'success','ABC','Mike');

SELECT * FROM @events;

-- cte: the lowest [row] holding a 'success' for each (userName, taskName).
WITH
cte
AS (
    SELECT userName
          ,taskName
          ,MIN([row]) AS [min]
    FROM @events
    WHERE eventName = 'success'
    GROUP BY userName, taskName)
-- Count the failures that precede the first success of the SAME user and task.
SELECT e.userName
      ,e.taskName
      ,COUNT(1) AS attempts
FROM @events e
JOIN cte c
  ON  e.userName = c.userName
  -- Without the taskName match, a success on one task also caps (and counts)
  -- the rows of the user's other tasks.
  AND e.taskName = c.taskName
WHERE e.[row] < c.[min]
  -- Only failed attempts are counted, even if other event types are present.
  AND e.eventName = 'fail'
GROUP BY e.userName, e.taskName;
解决方案
基于架构,此查询将为您提供所需的信息:
-- For every (event_name, task_name, username) partition of successes, return
-- the earliest success together with cnt, the number of same-user, same-task
-- 'fail' events whose ts is earlier than that success.
WITH FailedEvents AS
(
    -- all failed attempts
    SELECT *
    FROM Event
    WHERE event_name = 'fail'
),
SuccessWithCount AS
(
    -- each success event plus the count of matching failures before it
    SELECT *
    FROM Event AS S
    OUTER APPLY
    (
        SELECT COUNT(*) AS cnt
        FROM FailedEvents AS FE
        WHERE FE.ts < S.ts
          AND FE.username = S.username
          AND FE.task_name = S.task_name
    ) AS PriorFailures
    WHERE S.event_name = 'success'
)
SELECT *
FROM
(
    -- o = 1 marks the earliest success inside each partition
    SELECT SWC.*,
           ROW_NUMBER() OVER (PARTITION BY event_name, task_name, username ORDER BY ts) AS o
    FROM SuccessWithCount AS SWC
) AS Ranked
WHERE Ranked.o = 1
使用您的数据进行测试:
id event_name task_name timestamp username cnt
-- ---------- ---------- ---------- --------- ---
6 success ABC 25 John 2
但是,我更进一步,为迈克添加了另一个 'success' 行
-- Extra test row: a 'success' for Mike on task XYZ at 29.0
-- (values presumably map to event_name, task_name, ts, username; confirm against the Event schema).
insert Event select 'success', 'XYZ', 29.0, 'Mike';
并得到
id event_name task_name timestamp username cnt
-- ---------- ---------- ---------- --------- ---
6 success ABC 25 John 2
9 success XYZ 29 Mike 1
符合预期。
说明
第一个 CTE 产生了失败事件的集合。第二个 CTE 并不是递归的:它以成功事件的集合为基础,通过 OUTER APPLY 为每一条成功记录计算在它之前(同一用户、同一任务名称)的失败集合的计数(基数)。
最后,我们使用 row_number
对 event_name
、task_name
和 username
进行分区,因此给定分区的第一次成功将被标记为'1'。然后我们过滤掉所有 row_number
不等于 '1' 的行。
也有可能的解决方案:
-- Return every row that comes before the first 'success' of its own
-- (userName, taskName) pair, plus the number of such rows in that pair.
-- Pairs with no 'success' are dropped: the subquery yields NULL and the
-- comparison is not true.
SELECT
    *,
    COUNT(*) OVER (PARTITION BY ev.userName, ev.taskName)
FROM tbl AS ev
WHERE ev.row < (
          SELECT MIN(firstOk.row)
          FROM tbl AS firstOk
          WHERE firstOk.eventName = 'success'
            AND firstOk.taskName = ev.taskName
            AND firstOk.userName = ev.userName
      );
与 Artem 类似的解决方案,但略有不同。子查询将为成功的每个用户和任务找到最低的行值,并将基于该值过滤行。
这里可能有更简单的方法,但我稍后会尝试查看更多内容。
测试数据设置
-- Recreate the #taskevents temp table and load the sample events.
IF OBJECT_ID(N'tempdb..#taskevents', N'U') IS NOT NULL
BEGIN
    DROP TABLE #taskevents;
END
GO

CREATE TABLE #taskevents (
    eventName VARCHAR(10),
    taskName  VARCHAR(10),
    ts        DECIMAL(3,1),
    userName  VARCHAR(10)
);

-- ts values are supplied as strings and rely on implicit conversion to DECIMAL(3,1).
INSERT INTO #taskevents (eventName, taskName, ts, userName)
VALUES
    ('fail',    'ABC', '10.5', 'John'),
    ('fail',    'ABC', '10.6', 'John'),
    ('fail',    'ABC', '18.0', 'John'),
    ('fail',    'ABC', '22.0', 'John'),
    ('fail',    'ABC', '22.5', 'John'),
    ('success', 'ABC', '25.0', 'John'),
    ('fail',    'ABC', '26.0', 'John'),
    ('success', 'ABC', '28.0', 'John'),
    ('fail',    'XYZ', '10.7', 'John'),
    ('fail',    'XYZ', '21.0', 'John'),
    ('fail',    'ABC', '19.0', 'Mike'),
    ('fail',    'XYZ', '23.0', 'Mike'),
    ('success', 'XYZ', '28.5', 'Mike'),
    ('success', 'QVC', '42.0', 'Mike');
查询时间
-- Average number of failures that precede each success, per (userName, taskName).
-- For every success, failures are counted from the previous success of the same
-- user and task (inclusive) up to, but not including, the success itself.
SELECT s3.userName, s3.taskName,
       -- COUNT(*) is an INT and AVG over INT truncates to an integer in SQL Server,
       -- so cast first to keep the fractional part of the average.
       AVG(CAST(s3.failCount AS DECIMAL(10,2))) AS avgFailCount
FROM (
    SELECT s1.userName, s1.taskName, s1.ts, s1.PreviousTS, s2.failCount
    FROM (
        -- Every success with the timestamp of the previous success of the same
        -- user and task (NULL for the first success).
        SELECT t1.userName, t1.taskName, t1.ts,
               LAG(t1.ts) OVER (PARTITION BY t1.userName, t1.taskName ORDER BY t1.ts) AS PreviousTS
        FROM #taskevents t1
        WHERE t1.eventName = 'success'
    ) s1
    OUTER APPLY (
        -- An aggregate without GROUP BY always returns exactly one row, so
        -- failCount is 0 (never NULL) when no failure falls in the window.
        SELECT COUNT(*) AS failCount
        FROM #taskevents t2
        WHERE t2.eventName = 'fail'
          AND t2.userName = s1.userName
          AND t2.taskName = s1.taskName
          AND t2.ts < s1.ts
          AND ( t2.ts >= s1.PreviousTS OR s1.PreviousTS IS NULL )
    ) s2
) s3
GROUP BY s3.userName, s3.taskName;
这就是每个用户、每个任务在每次成功之前的平均失败次数。