在考虑重叠活动的情况下计算用户 "busy duration"
Calculating users' "busy duration" taking into account overlapping activities
我一直在尝试解决这个问题,但到目前为止还没有成功。我正在使用 Oracle。
我有一组数据如下所示:
| USER | ACTIVITY | START_TIME | END_TIME | DURATION |
|--------|------------|-----------------|-----------------|----------|
| jsmith | Front Desk | 2020-08-24 8:00 | 2020-08-24 9:30 | 90 |
| jsmith | Phones | 2020-08-24 8:15 | 2020-08-24 8:45 | 30 |
| jsmith | Phones | 2020-08-24 9:45 | 2020-08-24 9:50 | 5 |
| bjones | Phones | 2020-08-24 9:00 | 2020-08-24 9:10 | 10 |
| bjones | Front Desk | 2020-08-24 9:05 | 2020-08-24 9:15 | 10 |
| bjones | Phones | 2020-08-24 9:15 | 2020-08-24 9:45 | 30 |
以上输出可以通过以下查询生成:
-- Lists the raw activity rows of two users whose activities start on
-- 2020-08-24 (the sample data shown in the table above).
--
-- "USER" is quoted: in Oracle an unquoted USER is the built-in function that
-- returns the session user name, so it would never reference a table column.
-- NOTE(review): assumes the column was created as "USER" (upper case) - confirm.
--
-- The date filter is a half-open range on DATE literals: it does not depend on
-- the session NLS settings for implicit string-to-date conversion, and a row
-- starting exactly at 2020-08-25 00:00:00 is not attributed to the 24th.
SELECT
    "USER",
    ACTIVITY,
    START_TIME,
    END_TIME,
    DURATION
FROM USER_ACTIVITIES
WHERE "USER" IN ('jsmith', 'bjones')
  AND START_TIME >= DATE '2020-08-24'
  AND START_TIME <  DATE '2020-08-25'
ORDER BY "USER", START_TIME, END_TIME
;
我需要计算每个用户的总“忙碌”时间,同时考虑到一些活动相互重叠。使用现有查询,我将得到每个用户的总持续时间,jsmith 为 125,bjones 为 50,但是由于某些活动重叠,这并不能反映用户忙碌的总时间。
我正在寻找的输出是用户每天的总忙碌持续时间:
| USER | DATE | DURATION |
|--------|------------|----------|
| jsmith | 2020-08-24 | 95 |
| bjones | 2020-08-24 | 45 |
如有任何帮助,我们将不胜感激。
您可以先把每个用户的时间跨度按分钟展开(逆透视),然后使用 EXISTS
只保留落在某个活动区间内的分钟,从而排除未被任何活动覆盖的间隔(在这个例子中我没有考虑跨天的情况;如果需要,可以在 CONNECT BY 的上限中再加上 EXTRACT( day FROM max_end_time - min_start_time )*24*60
)
-- Busy minutes per user: expand each user's overall time span into one row per
-- minute and count the minutes that are covered by at least one activity.
--
-- Limitations visible in the query:
--   * the span length uses only the HOUR and MINUTE fields of the interval,
--     so a span of 24 hours or more is truncated;
--   * EXTRACT on the difference assumes start_time/end_time are TIMESTAMPs
--     (the difference must be an INTERVAL DAY TO SECOND);
--   * one-minute slots are aligned to each user's earliest start_time.
WITH user_span AS
(
    -- Earliest start and latest end per user: the window that has to be scanned.
    SELECT "user",
           MIN(start_time) AS min_start_time,
           MAX(end_time)   AS max_end_time
    FROM user_activities
    GROUP BY "user"
), span_minutes AS
(
    -- Row generator: LEVEL = 1..N yields the END of each one-minute slot,
    -- i.e. the slot (minute_end - 1 minute, minute_end].
    -- PRIOR SYS_GUID() IS NOT NULL is the usual idiom that lets CONNECT BY act
    -- as a per-row generator; PRIOR "user" = "user" keeps every generated
    -- chain within a single user.
    SELECT "user",
           min_start_time + NUMTODSINTERVAL(level, 'minute') AS minute_end
    FROM user_span
    CONNECT BY level <= EXTRACT( hour FROM max_end_time - min_start_time ) * 60
                      + EXTRACT( minute FROM max_end_time - min_start_time )
           AND PRIOR SYS_GUID() IS NOT NULL
           AND PRIOR "user" = "user"
)
SELECT sm."user", COUNT(*) AS "Duration"
FROM span_minutes sm
WHERE EXISTS ( SELECT 1
               FROM user_activities ua
               WHERE ua."user" = sm."user"
                 -- Half-open test on the slot end. A closed BETWEEN would also
                 -- count the slot that ENDS exactly where an activity starts
                 -- (e.g. 09:44-09:45 for an activity starting at 09:45),
                 -- adding one extra minute after every gap.
                 AND sm.minute_end >  ua.start_time
                 AND sm.minute_end <= ua.end_time )
GROUP BY sm."user"
我会用 gaps-and-islands 技术而不是递归来解决这个问题:
-- Total busy minutes per user using the gaps-and-islands technique:
-- overlapping activities of one user are merged into "islands", each island
-- contributes (latest end - earliest start), and islands are summed per user.
--
-- NOTE(review): the final "* 24 * 60" assumes start_time/end_time are DATEs,
-- whose difference is a number of days - confirm the column types.
with flagged as (
    select
        ua.usr,
        ua.start_time,
        ua.end_time,
        -- Latest end among ALL earlier rows of the user, not only the previous
        -- one. Comparing with just the previous row's end splits an island
        -- whenever a short activity is nested inside a longer earlier one,
        -- and the nested time is then counted twice.
        max(ua.end_time) over (
            partition by ua.usr
            order by ua.start_time, ua.end_time
            rows between unbounded preceding and 1 preceding
        ) as prev_max_end_time
    from user_activities ua
),
islands as (
    select
        usr,
        start_time,
        end_time,
        -- Running count of island starts. A row starts a new island when it
        -- begins after everything seen so far has ended (or is the first row,
        -- where prev_max_end_time is NULL). The default RANGE frame is kept on
        -- purpose so rows sharing a start_time always get the same grp.
        sum(case when start_time <= prev_max_end_time then 0 else 1 end) over (
            partition by usr
            order by start_time
        ) as grp
    from flagged
),
island_spans as (
    -- One row per island with its length in days.
    select
        usr,
        max(end_time) - min(start_time) as duration
    from islands
    group by usr, grp
)
select
    usr,
    sum(duration) * 24 * 60 as duration
from island_spans
group by usr
思路是用窗口求和(window sum)把同一用户且时间段相互重叠的记录划分到同一组。然后,您可以计算每个“岛”的结束时间与开始时间之差,最后按用户进行汇总。
还有许多可行的解决方案,这是另一个:使用 CTE,先用 LEAD 函数计算“干净”的结束时间(如果下一条记录的开始时间早于结束时间,则取下一条记录的开始时间),然后按用户分组求和:
-- Busy minutes per user on inline sample data.
-- Each row is cut off at the start of the user's next row, so every stretch of
-- time is counted once. The end that gets cut is the RUNNING MAXIMUM end time
-- of the user, not the row's own end: cutting the row's own end loses the tail
-- of a long activity that fully contains a shorter later one (jsmith's
-- 08:00-09:30 would be reduced to 08:00-08:15 and the total would be 50).
WITH sampledata (username, activity, start_time, end_time)
AS
(
    SELECT 'jsmith', 'Front Desk', '2020-08-24 8:00', '2020-08-24 9:30' FROM DUAL UNION ALL
    SELECT 'jsmith', 'Phones',     '2020-08-24 8:15', '2020-08-24 8:45' FROM DUAL UNION ALL
    SELECT 'jsmith', 'Phones',     '2020-08-24 9:45', '2020-08-24 9:50' FROM DUAL UNION ALL
    SELECT 'bjones', 'Phones',     '2020-08-24 9:00', '2020-08-24 9:10' FROM DUAL UNION ALL
    SELECT 'bjones', 'Front Desk', '2020-08-24 9:05', '2020-08-24 9:15' FROM DUAL UNION ALL
    SELECT 'bjones', 'Phones',     '2020-08-24 9:15', '2020-08-24 9:45' FROM DUAL
), clean_sampledata (username, activity, start_time, end_time)
AS
(
    -- Convert the text values to DATE so they can be compared and subtracted.
    SELECT
        username,
        activity,
        TO_DATE(start_time, 'YYYY-MM-DD HH24:MI'),
        TO_DATE(end_time, 'YYYY-MM-DD HH24:MI')
    FROM sampledata
), running_end (username, activity, start_time, running_end_time)
AS
(
    -- Latest end time reached by this row or any earlier row of the same user.
    SELECT
        username,
        activity,
        start_time,
        MAX(end_time) OVER (PARTITION BY username
                            ORDER BY start_time, end_time
                            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
    FROM clean_sampledata
), clear_overlapped (username, activity, start_time, clean_end_time)
AS
(
    -- Clean end = next row's start if the user is still busy at that point,
    -- otherwise the running end. LEAST returns NULL for the last row of a user
    -- (no next row), so COALESCE falls back to the running end.
    SELECT
        username,
        activity,
        start_time,
        COALESCE(
            LEAST(
                LEAD(start_time) OVER (PARTITION BY username
                                       ORDER BY start_time, running_end_time),
                running_end_time
            ),
            running_end_time
        )
    FROM running_end
), cleaned_minutes_per_username (username, mins)
AS
(
    -- DATE subtraction yields days; 1440 minutes per day.
    SELECT
        username,
        ROUND((clean_end_time - start_time) * 1440)
    FROM clear_overlapped
)
SELECT
    username,
    SUM(mins)
FROM cleaned_minutes_per_username
GROUP BY username
ORDER BY username ;
-- Result with the sample data:
-- bjones 45
-- jsmith 95
以下代码至少需要 12c:
-- Busy minutes per user with MATCH_RECOGNIZE, on inline sample data.
-- Each match is one "island" of overlapping or touching activities; the outer
-- query converts every island's length to minutes and sums them per user.
WITH user_activities( "user", activity, start_time, end_time ) AS
(
    SELECT 'jsmith', 'Front Desk', timestamp'2020-08-24 08:00:00', timestamp'2020-08-24 09:30:00' FROM dual UNION ALL
    SELECT 'jsmith', 'Phones'    , timestamp'2020-08-24 08:15:00', timestamp'2020-08-24 08:45:00' FROM dual UNION ALL
    SELECT 'jsmith', 'Phones'    , timestamp'2020-08-24 09:45:00', timestamp'2020-08-24 09:50:00' FROM dual UNION ALL
    SELECT 'bjones', 'Phones'    , timestamp'2020-08-24 09:00:00', timestamp'2020-08-24 09:10:00' FROM dual UNION ALL
    SELECT 'bjones', 'Front Desk', timestamp'2020-08-24 09:05:00', timestamp'2020-08-24 09:15:00' FROM dual UNION ALL
    SELECT 'bjones', 'Phones'    , timestamp'2020-08-24 09:15:00', timestamp'2020-08-24 09:45:00' FROM dual
)
select "user", sum(durations) as durations
from
(
    -- end_time - start_time is an INTERVAL DAY TO SECOND. The DAY field is
    -- included so that an island lasting 24 hours or more is not undercounted;
    -- seconds are ignored, as the data is minute-based.
    select "user",
           extract(day from (end_time - start_time)) * 24 * 60
         + extract(hour from (end_time - start_time)) * 60
         + extract(minute from (end_time - start_time)) as durations
    from user_activities
    match_recognize
    (
        partition by "user"
        order by start_time, end_time
        -- One output row per island: its first start and its latest end.
        measures first(start_time) start_time, max(end_time) as end_time
        -- Zero or more rows that still reach the next row (a), followed by
        -- the row that closes the island (b; undefined, so it matches any row).
        pattern (a* b)
        -- max(end_time) is the running maximum over the rows matched so far,
        -- so a nested short activity does not end the island early.
        define a as max(end_time) >= next(start_time)
    )
)
group by "user";
如果您对 match_recognize 感兴趣,这应该可以解决您的问题。
输出:
我一直在尝试解决这个问题,但到目前为止还没有成功。我正在使用 Oracle。
我有一组数据如下所示:
| USER | ACTIVITY | START_TIME | END_TIME | DURATION |
|--------|------------|-----------------|-----------------|----------|
| jsmith | Front Desk | 2020-08-24 8:00 | 2020-08-24 9:30 | 90 |
| jsmith | Phones | 2020-08-24 8:15 | 2020-08-24 8:45 | 30 |
| jsmith | Phones | 2020-08-24 9:45 | 2020-08-24 9:50 | 5 |
| bjones | Phones | 2020-08-24 9:00 | 2020-08-24 9:10 | 10 |
| bjones | Front Desk | 2020-08-24 9:05 | 2020-08-24 9:15 | 10 |
| bjones | Phones | 2020-08-24 9:15 | 2020-08-24 9:45 | 30 |
以上输出可以通过以下查询生成:
-- Lists the raw activity rows of two users whose activities start on
-- 2020-08-24 (the sample data shown in the table above).
--
-- "USER" is quoted: in Oracle an unquoted USER is the built-in function that
-- returns the session user name, so it would never reference a table column.
-- NOTE(review): assumes the column was created as "USER" (upper case) - confirm.
--
-- The date filter is a half-open range on DATE literals: it does not depend on
-- the session NLS settings for implicit string-to-date conversion, and a row
-- starting exactly at 2020-08-25 00:00:00 is not attributed to the 24th.
SELECT
    "USER",
    ACTIVITY,
    START_TIME,
    END_TIME,
    DURATION
FROM USER_ACTIVITIES
WHERE "USER" IN ('jsmith', 'bjones')
  AND START_TIME >= DATE '2020-08-24'
  AND START_TIME <  DATE '2020-08-25'
ORDER BY "USER", START_TIME, END_TIME
;
我需要计算每个用户的总“忙碌”时间,同时考虑到一些活动相互重叠。使用现有查询,我将得到每个用户的总持续时间,jsmith 为 125,bjones 为 50,但是由于某些活动重叠,这并不能反映用户忙碌的总时间。
我正在寻找的输出是用户每天的总忙碌持续时间:
| USER | DATE | DURATION |
|--------|------------|----------|
| jsmith | 2020-08-24 | 95 |
| bjones | 2020-08-24 | 45 |
如有任何帮助,我们将不胜感激。
您可以先把每个用户的时间跨度按分钟展开(逆透视),然后使用 EXISTS
只保留落在某个活动区间内的分钟,从而排除未被任何活动覆盖的间隔(在这个例子中我没有考虑跨天的情况;如果需要,可以在 CONNECT BY 的上限中再加上 EXTRACT( day FROM max_end_time - min_start_time )*24*60
)
-- Busy minutes per user: expand each user's overall time span into one row per
-- minute and count the minutes that are covered by at least one activity.
--
-- Limitations visible in the query:
--   * the span length uses only the HOUR and MINUTE fields of the interval,
--     so a span of 24 hours or more is truncated;
--   * EXTRACT on the difference assumes start_time/end_time are TIMESTAMPs
--     (the difference must be an INTERVAL DAY TO SECOND);
--   * one-minute slots are aligned to each user's earliest start_time.
WITH user_span AS
(
    -- Earliest start and latest end per user: the window that has to be scanned.
    SELECT "user",
           MIN(start_time) AS min_start_time,
           MAX(end_time)   AS max_end_time
    FROM user_activities
    GROUP BY "user"
), span_minutes AS
(
    -- Row generator: LEVEL = 1..N yields the END of each one-minute slot,
    -- i.e. the slot (minute_end - 1 minute, minute_end].
    -- PRIOR SYS_GUID() IS NOT NULL is the usual idiom that lets CONNECT BY act
    -- as a per-row generator; PRIOR "user" = "user" keeps every generated
    -- chain within a single user.
    SELECT "user",
           min_start_time + NUMTODSINTERVAL(level, 'minute') AS minute_end
    FROM user_span
    CONNECT BY level <= EXTRACT( hour FROM max_end_time - min_start_time ) * 60
                      + EXTRACT( minute FROM max_end_time - min_start_time )
           AND PRIOR SYS_GUID() IS NOT NULL
           AND PRIOR "user" = "user"
)
SELECT sm."user", COUNT(*) AS "Duration"
FROM span_minutes sm
WHERE EXISTS ( SELECT 1
               FROM user_activities ua
               WHERE ua."user" = sm."user"
                 -- Half-open test on the slot end. A closed BETWEEN would also
                 -- count the slot that ENDS exactly where an activity starts
                 -- (e.g. 09:44-09:45 for an activity starting at 09:45),
                 -- adding one extra minute after every gap.
                 AND sm.minute_end >  ua.start_time
                 AND sm.minute_end <= ua.end_time )
GROUP BY sm."user"
我会用 gaps-and-islands 技术而不是递归来解决这个问题:
-- Total busy minutes per user using the gaps-and-islands technique:
-- overlapping activities of one user are merged into "islands", each island
-- contributes (latest end - earliest start), and islands are summed per user.
--
-- NOTE(review): the final "* 24 * 60" assumes start_time/end_time are DATEs,
-- whose difference is a number of days - confirm the column types.
with flagged as (
    select
        ua.usr,
        ua.start_time,
        ua.end_time,
        -- Latest end among ALL earlier rows of the user, not only the previous
        -- one. Comparing with just the previous row's end splits an island
        -- whenever a short activity is nested inside a longer earlier one,
        -- and the nested time is then counted twice.
        max(ua.end_time) over (
            partition by ua.usr
            order by ua.start_time, ua.end_time
            rows between unbounded preceding and 1 preceding
        ) as prev_max_end_time
    from user_activities ua
),
islands as (
    select
        usr,
        start_time,
        end_time,
        -- Running count of island starts. A row starts a new island when it
        -- begins after everything seen so far has ended (or is the first row,
        -- where prev_max_end_time is NULL). The default RANGE frame is kept on
        -- purpose so rows sharing a start_time always get the same grp.
        sum(case when start_time <= prev_max_end_time then 0 else 1 end) over (
            partition by usr
            order by start_time
        ) as grp
    from flagged
),
island_spans as (
    -- One row per island with its length in days.
    select
        usr,
        max(end_time) - min(start_time) as duration
    from islands
    group by usr, grp
)
select
    usr,
    sum(duration) * 24 * 60 as duration
from island_spans
group by usr
思路是用窗口求和(window sum)把同一用户且时间段相互重叠的记录划分到同一组。然后,您可以计算每个“岛”的结束时间与开始时间之差,最后按用户进行汇总。
还有许多可行的解决方案,这是另一个:使用 CTE,先用 LEAD 函数计算“干净”的结束时间(如果下一条记录的开始时间早于结束时间,则取下一条记录的开始时间),然后按用户分组求和:
-- Busy minutes per user on inline sample data.
-- Each row is cut off at the start of the user's next row, so every stretch of
-- time is counted once. The end that gets cut is the RUNNING MAXIMUM end time
-- of the user, not the row's own end: cutting the row's own end loses the tail
-- of a long activity that fully contains a shorter later one (jsmith's
-- 08:00-09:30 would be reduced to 08:00-08:15 and the total would be 50).
WITH sampledata (username, activity, start_time, end_time)
AS
(
    SELECT 'jsmith', 'Front Desk', '2020-08-24 8:00', '2020-08-24 9:30' FROM DUAL UNION ALL
    SELECT 'jsmith', 'Phones',     '2020-08-24 8:15', '2020-08-24 8:45' FROM DUAL UNION ALL
    SELECT 'jsmith', 'Phones',     '2020-08-24 9:45', '2020-08-24 9:50' FROM DUAL UNION ALL
    SELECT 'bjones', 'Phones',     '2020-08-24 9:00', '2020-08-24 9:10' FROM DUAL UNION ALL
    SELECT 'bjones', 'Front Desk', '2020-08-24 9:05', '2020-08-24 9:15' FROM DUAL UNION ALL
    SELECT 'bjones', 'Phones',     '2020-08-24 9:15', '2020-08-24 9:45' FROM DUAL
), clean_sampledata (username, activity, start_time, end_time)
AS
(
    -- Convert the text values to DATE so they can be compared and subtracted.
    SELECT
        username,
        activity,
        TO_DATE(start_time, 'YYYY-MM-DD HH24:MI'),
        TO_DATE(end_time, 'YYYY-MM-DD HH24:MI')
    FROM sampledata
), running_end (username, activity, start_time, running_end_time)
AS
(
    -- Latest end time reached by this row or any earlier row of the same user.
    SELECT
        username,
        activity,
        start_time,
        MAX(end_time) OVER (PARTITION BY username
                            ORDER BY start_time, end_time
                            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
    FROM clean_sampledata
), clear_overlapped (username, activity, start_time, clean_end_time)
AS
(
    -- Clean end = next row's start if the user is still busy at that point,
    -- otherwise the running end. LEAST returns NULL for the last row of a user
    -- (no next row), so COALESCE falls back to the running end.
    SELECT
        username,
        activity,
        start_time,
        COALESCE(
            LEAST(
                LEAD(start_time) OVER (PARTITION BY username
                                       ORDER BY start_time, running_end_time),
                running_end_time
            ),
            running_end_time
        )
    FROM running_end
), cleaned_minutes_per_username (username, mins)
AS
(
    -- DATE subtraction yields days; 1440 minutes per day.
    SELECT
        username,
        ROUND((clean_end_time - start_time) * 1440)
    FROM clear_overlapped
)
SELECT
    username,
    SUM(mins)
FROM cleaned_minutes_per_username
GROUP BY username
ORDER BY username ;
-- Result with the sample data:
-- bjones 45
-- jsmith 95
以下代码至少需要 12c:
-- Busy minutes per user with MATCH_RECOGNIZE, on inline sample data.
-- Each match is one "island" of overlapping or touching activities; the outer
-- query converts every island's length to minutes and sums them per user.
WITH user_activities( "user", activity, start_time, end_time ) AS
(
    SELECT 'jsmith', 'Front Desk', timestamp'2020-08-24 08:00:00', timestamp'2020-08-24 09:30:00' FROM dual UNION ALL
    SELECT 'jsmith', 'Phones'    , timestamp'2020-08-24 08:15:00', timestamp'2020-08-24 08:45:00' FROM dual UNION ALL
    SELECT 'jsmith', 'Phones'    , timestamp'2020-08-24 09:45:00', timestamp'2020-08-24 09:50:00' FROM dual UNION ALL
    SELECT 'bjones', 'Phones'    , timestamp'2020-08-24 09:00:00', timestamp'2020-08-24 09:10:00' FROM dual UNION ALL
    SELECT 'bjones', 'Front Desk', timestamp'2020-08-24 09:05:00', timestamp'2020-08-24 09:15:00' FROM dual UNION ALL
    SELECT 'bjones', 'Phones'    , timestamp'2020-08-24 09:15:00', timestamp'2020-08-24 09:45:00' FROM dual
)
select "user", sum(durations) as durations
from
(
    -- end_time - start_time is an INTERVAL DAY TO SECOND. The DAY field is
    -- included so that an island lasting 24 hours or more is not undercounted;
    -- seconds are ignored, as the data is minute-based.
    select "user",
           extract(day from (end_time - start_time)) * 24 * 60
         + extract(hour from (end_time - start_time)) * 60
         + extract(minute from (end_time - start_time)) as durations
    from user_activities
    match_recognize
    (
        partition by "user"
        order by start_time, end_time
        -- One output row per island: its first start and its latest end.
        measures first(start_time) start_time, max(end_time) as end_time
        -- Zero or more rows that still reach the next row (a), followed by
        -- the row that closes the island (b; undefined, so it matches any row).
        pattern (a* b)
        -- max(end_time) is the running maximum over the rows matched so far,
        -- so a nested short activity does not end the island early.
        define a as max(end_time) >= next(start_time)
    )
)
group by "user";
如果您对 match_recognize 感兴趣,这应该可以解决您的问题。
输出: