MySQL 选择特定中间项的嵌套聚合查询
MySQL Nested aggregation query with selection of specific intermediate items
这是我的 activities 表。
activities
+----+---------+----------+-----------------+
| id | user_id | activity | log_time |
+----+---------+----------+-----------------+
| 6 | 1 | start | 12 Oct, 1000hrs |
| 2 | 1 | task | 12 Oct, 1010hrs |
| 7 | 1 | task | 12 Oct, 1040hrs |
| 3 | 1 | start | 12 Oct, 1600hrs |
| 1 | 1 | task | 12 Oct, 1610hrs |
| 9 | 1 | start | 14 Oct, 0800hrs |
| 10 | 1 | start | 16 Oct, 0900hrs |
| 4 | 1 | task | 16 Oct, 0910hrs |
| 8 | 2 | start | 12 Oct, 1000hrs |
| 5 | 2 | task | 12 Oct, 1020hrs |
+----+---------+----------+-----------------+
我需要用户在所有会话中花费的总时间。每个会话在一天内发生,包括一个 'start' 和多个 'tasks'(在下一个会话以 'start' 启动之前)。会话持续时间=最后一个任务-开始[时间戳差异]
output
+---------+------------+------------------------------------------------+
| user_id | total_time | This is explanation (not a column) |
+---------+------------+------------------------------------------------+
| 1 | 60 | 12_Oct[40+10] + 14_Oct[0] + 16_Oct[10] = 60min |
| 2 | 20 | 12_Oct[20] = 20min |
+---------+------------+------------------------------------------------+
我不知道如何获取会话中的最后一个任务。我已经尝试了基本的聚合和连接查询 - 但它不起作用。
作为一种思路,我认为我真正需要的是以某种方式得到下表中的最后一列(session_group),然后就可以按它分组聚合,计算每组中最大与最小时间戳之间的差值。
+----+---------+----------+-----------------+---------------+
| id | user_id | activity | log_time | session_group |
+----+---------+----------+-----------------+---------------+
| 6 | 1 | start | 12 Oct, 1000hrs | 1 |
| 2 | 1 | task | 12 Oct, 1010hrs | 1 |
| 7 | 1 | task | 12 Oct, 1040hrs | 1 |
| 3 | 1 | start | 12 Oct, 1600hrs | 2 |
| 1 | 1 | task | 12 Oct, 1610hrs | 2 |
| 9 | 1 | start | 14 Oct, 0800hrs | 3 |
| 10 | 1 | start | 16 Oct, 0900hrs | 4 |
| 4 | 1 | task | 16 Oct, 0910hrs | 4 |
| 8 | 2 | start | 12 Oct, 1000hrs | 5 |
| 5 | 2 | task | 12 Oct, 1020hrs | 5 |
+----+---------+----------+-----------------+---------------+
请告诉我是否可以通过 sql (MySQL) 获得所需的输出以及如何实现?或者是否有必要通过 Javascript 循环遍历数据?
下面是该表的 MySQL 建表语句和插入语句:
-- Activity log used by the queries in this thread: one row per logged event.
CREATE TABLE activities (
    id       INT         NOT NULL PRIMARY KEY,    -- row identifier
    user_id  INT         NULL,                    -- user the event belongs to
    activity VARCHAR(45),                         -- event type; the sample data uses 'start' and 'task'
    log_time DATETIME    NOT NULL DEFAULT NOW()   -- when the event was logged
) ENGINE = InnoDB;
-- Sample data matching the activities table shown in the question.
-- Row id 9 is a 'start' event: a session with no tasks that contributes
-- 0 minutes (the "14_Oct[0]" term of the expected output).
INSERT INTO activities
    (id, user_id, activity, log_time)
VALUES
    (6,  1, 'start', '2021-10-12 10:00:00'),
    (2,  1, 'task',  '2021-10-12 10:10:00'),
    (7,  1, 'task',  '2021-10-12 10:40:00'),
    (3,  1, 'start', '2021-10-12 16:00:00'),
    (1,  1, 'task',  '2021-10-12 16:10:00'),
    (9,  1, 'start', '2021-10-14 08:00:00'),
    (10, 1, 'start', '2021-10-16 09:00:00'),
    (4,  1, 'task',  '2021-10-16 09:10:00'),
    (8,  2, 'start', '2021-10-12 10:00:00'),
    (5,  2, 'task',  '2021-10-12 10:20:00');
这可能有效。找出所有的开始-任务对,取最大的分钟差值,然后对每个用户的分钟数求和。
-- Total minutes per user without window functions.
-- Each 'start' row is paired with every later same-day 'task' row of the same
-- user that has no other 'start' in between; the largest gap per start is the
-- session length, and the session lengths are summed per user.
SELECT
    per_start.user_id,
    SUM(per_start.minutes) AS minutes
FROM (
    SELECT
        s.user_id,
        s.id,
        MAX(TIMESTAMPDIFF(MINUTE, s.log_time, t.log_time)) AS minutes
    FROM activities AS s
    INNER JOIN activities AS t
        ON t.user_id = s.user_id
       AND t.log_time > s.log_time
    WHERE s.activity = 'start'
      AND t.activity = 'task'
      AND DATE(t.log_time) = DATE(s.log_time)
      AND NOT EXISTS (
          -- a row with the same activity as s (another 'start')
          -- logged strictly between s and t
          SELECT 1
          FROM activities AS between_row
          WHERE between_row.user_id = s.user_id
            AND between_row.activity = s.activity
            AND between_row.log_time > s.log_time
            AND between_row.log_time < t.log_time
      )
    GROUP BY s.user_id, s.id
) AS per_start
GROUP BY per_start.user_id;
或者使用窗口函数:
-- Total minutes per user, built from the gap between each 'task' row and the
-- row logged immediately before it for the same user.
WITH combo AS (
    SELECT
        user_id,
        activity,
        log_time,
        -- timestamp of the user's previous row; NULL on the user's first row
        LAG(log_time) OVER (PARTITION BY user_id ORDER BY log_time) AS last_log_time
    FROM activities
)
SELECT
    user_id,
    SUM(TIMESTAMPDIFF(MINUTE, last_log_time, log_time)) AS total_time
FROM combo
WHERE activity = 'task'
  -- count only gaps inside one calendar day; a NULL last_log_time also fails this test
  AND DATE(log_time) = DATE(last_log_time)
GROUP BY user_id;
您可以使用 SUM() 窗口函数为每个会话分配一个编号,然后再进行聚合:
-- Total minutes per user via a running count of 'start' rows.
WITH numbered AS (
    -- grp = number of 'start' rows seen so far for this user on this day
    SELECT
        user_id,
        log_time,
        SUM(activity = 'start') OVER (PARTITION BY user_id, DATE(log_time) ORDER BY log_time) AS grp
    FROM activities
),
session_spans AS (
    -- one row per (user, day, grp): minutes between its first and last row
    SELECT
        user_id,
        TIMESTAMPDIFF(MINUTE, MIN(log_time), MAX(log_time)) AS span_minutes
    FROM numbered
    WHERE grp > 0    -- skip rows logged before the day's first 'start'
    GROUP BY user_id, DATE(log_time), grp
)
SELECT
    user_id,
    SUM(span_minutes) AS total_time
FROM session_spans
GROUP BY user_id;
参见demo。
架构和插入语句:
-- Activity log used by the query below: one row per logged event.
CREATE TABLE activities (
    id       INT         NOT NULL PRIMARY KEY,    -- row identifier
    user_id  INT         NULL,                    -- user the event belongs to
    activity VARCHAR(45),                         -- event type; the sample data uses 'start' and 'task'
    log_time DATETIME    NOT NULL DEFAULT NOW()   -- when the event was logged
) ENGINE = InnoDB;
-- Sample rows: four sessions for user 1 and one session for user 2.
INSERT INTO activities
    (id, user_id, activity, log_time)
VALUES
    -- user 1, 12 Oct, morning session
    (6,  1, 'start', '2021-10-12 10:00:00'),
    (2,  1, 'task',  '2021-10-12 10:10:00'),
    (7,  1, 'task',  '2021-10-12 10:40:00'),
    -- user 1, 12 Oct, afternoon session
    (3,  1, 'start', '2021-10-12 16:00:00'),
    (1,  1, 'task',  '2021-10-12 16:10:00'),
    -- user 1, 14 Oct, session with no tasks
    (9,  1, 'start', '2021-10-14 08:00:00'),
    -- user 1, 16 Oct
    (10, 1, 'start', '2021-10-16 09:00:00'),
    (4,  1, 'task',  '2021-10-16 09:10:00'),
    -- user 2, 12 Oct
    (8,  2, 'start', '2021-10-12 10:00:00'),
    (5,  2, 'task',  '2021-10-12 10:20:00');
查询:
-- Total minutes per user.
-- Rows are numbered by a running count of 'start' rows per user AND per day,
-- so a session never extends across days, and rows logged before the day's
-- first 'start' (session_no = 0) are not treated as a session.
WITH session_spans AS (
    SELECT
        user_id,
        session_day,
        session_no,
        -- session length = last row minus first row (the 'start') in minutes
        TIMESTAMPDIFF(MINUTE, MIN(log_time), MAX(log_time)) AS time_diff
    FROM (
        SELECT
            user_id,
            log_time,
            DATE(log_time) AS session_day,
            SUM(CASE WHEN activity = 'start' THEN 1 ELSE 0 END)
                OVER (PARTITION BY user_id, DATE(log_time) ORDER BY log_time) AS session_no
        FROM activities
    ) AS numbered
    WHERE session_no > 0
    GROUP BY user_id, session_day, session_no
)
SELECT
    user_id,
    SUM(time_diff) AS total_time
FROM session_spans
GROUP BY user_id;
输出:
user_id | total_time |
---|---|
1 | 60 |
2 | 20 |
db<>fiddle here
您可以按每个 user 和 day 分区,使用 LAG() 窗口函数计算每个 task 活动行与其前一行之间相差的分钟数,例如:
-- Total minutes per user: for every non-'start' row, add the minutes elapsed
-- since the previous row of the same user on the same day.
SELECT
    user_id,
    -- COALESCE makes a row with no earlier same-day row contribute 0 minutes
    SUM(TIMESTAMPDIFF(MINUTE, COALESCE(lg, log_time), log_time)) AS total_time
FROM (
    SELECT
        a.user_id,
        a.activity,
        a.log_time,
        LAG(a.log_time) OVER (PARTITION BY a.user_id, DATE(a.log_time)
                              ORDER BY a.log_time) AS lg
    FROM activities AS a
) AS aa
WHERE activity <> 'start'
GROUP BY user_id;
这是我的 activities 表。
activities
+----+---------+----------+-----------------+
| id | user_id | activity | log_time |
+----+---------+----------+-----------------+
| 6 | 1 | start | 12 Oct, 1000hrs |
| 2 | 1 | task | 12 Oct, 1010hrs |
| 7 | 1 | task | 12 Oct, 1040hrs |
| 3 | 1 | start | 12 Oct, 1600hrs |
| 1 | 1 | task | 12 Oct, 1610hrs |
| 9 | 1 | start | 14 Oct, 0800hrs |
| 10 | 1 | start | 16 Oct, 0900hrs |
| 4 | 1 | task | 16 Oct, 0910hrs |
| 8 | 2 | start | 12 Oct, 1000hrs |
| 5 | 2 | task | 12 Oct, 1020hrs |
+----+---------+----------+-----------------+
我需要用户在所有会话中花费的总时间。每个会话在一天内发生,包括一个 'start' 和多个 'tasks'(在下一个会话以 'start' 启动之前)。会话持续时间=最后一个任务-开始[时间戳差异]
output
+---------+------------+------------------------------------------------+
| user_id | total_time | This is explanation (not a column) |
+---------+------------+------------------------------------------------+
| 1 | 60 | 12_Oct[40+10] + 14_Oct[0] + 16_Oct[10] = 60min |
| 2 | 20 | 12_Oct[20] = 20min |
+---------+------------+------------------------------------------------+
我不知道如何获取会话中的最后一个任务。我已经尝试了基本的聚合和连接查询 - 但它不起作用。
作为一种思路,我认为我真正需要的是以某种方式得到下表中的最后一列(session_group),然后就可以按它分组聚合,计算每组中最大与最小时间戳之间的差值。
+----+---------+----------+-----------------+---------------+
| id | user_id | activity | log_time | session_group |
+----+---------+----------+-----------------+---------------+
| 6 | 1 | start | 12 Oct, 1000hrs | 1 |
| 2 | 1 | task | 12 Oct, 1010hrs | 1 |
| 7 | 1 | task | 12 Oct, 1040hrs | 1 |
| 3 | 1 | start | 12 Oct, 1600hrs | 2 |
| 1 | 1 | task | 12 Oct, 1610hrs | 2 |
| 9 | 1 | start | 14 Oct, 0800hrs | 3 |
| 10 | 1 | start | 16 Oct, 0900hrs | 4 |
| 4 | 1 | task | 16 Oct, 0910hrs | 4 |
| 8 | 2 | start | 12 Oct, 1000hrs | 5 |
| 5 | 2 | task | 12 Oct, 1020hrs | 5 |
+----+---------+----------+-----------------+---------------+
请告诉我是否可以通过 sql (MySQL) 获得所需的输出以及如何实现?或者是否有必要通过 Javascript 循环遍历数据?
下面是该表的 MySQL 建表语句和插入语句:
-- Activity log used by the queries in this thread: one row per logged event.
CREATE TABLE activities (
    id       INT         NOT NULL PRIMARY KEY,    -- row identifier
    user_id  INT         NULL,                    -- user the event belongs to
    activity VARCHAR(45),                         -- event type; the sample data uses 'start' and 'task'
    log_time DATETIME    NOT NULL DEFAULT NOW()   -- when the event was logged
) ENGINE = InnoDB;
-- Sample data matching the activities table shown in the question.
-- Row id 9 is a 'start' event: a session with no tasks that contributes
-- 0 minutes (the "14_Oct[0]" term of the expected output).
INSERT INTO activities
    (id, user_id, activity, log_time)
VALUES
    (6,  1, 'start', '2021-10-12 10:00:00'),
    (2,  1, 'task',  '2021-10-12 10:10:00'),
    (7,  1, 'task',  '2021-10-12 10:40:00'),
    (3,  1, 'start', '2021-10-12 16:00:00'),
    (1,  1, 'task',  '2021-10-12 16:10:00'),
    (9,  1, 'start', '2021-10-14 08:00:00'),
    (10, 1, 'start', '2021-10-16 09:00:00'),
    (4,  1, 'task',  '2021-10-16 09:10:00'),
    (8,  2, 'start', '2021-10-12 10:00:00'),
    (5,  2, 'task',  '2021-10-12 10:20:00');
这可能有效。找出所有的开始-任务对,取最大的分钟差值,然后对每个用户的分钟数求和。
-- Total minutes per user without window functions.
-- Each 'start' row is paired with every later same-day 'task' row of the same
-- user that has no other 'start' in between; the largest gap per start is the
-- session length, and the session lengths are summed per user.
SELECT
    per_start.user_id,
    SUM(per_start.minutes) AS minutes
FROM (
    SELECT
        s.user_id,
        s.id,
        MAX(TIMESTAMPDIFF(MINUTE, s.log_time, t.log_time)) AS minutes
    FROM activities AS s
    INNER JOIN activities AS t
        ON t.user_id = s.user_id
       AND t.log_time > s.log_time
    WHERE s.activity = 'start'
      AND t.activity = 'task'
      AND DATE(t.log_time) = DATE(s.log_time)
      AND NOT EXISTS (
          -- a row with the same activity as s (another 'start')
          -- logged strictly between s and t
          SELECT 1
          FROM activities AS between_row
          WHERE between_row.user_id = s.user_id
            AND between_row.activity = s.activity
            AND between_row.log_time > s.log_time
            AND between_row.log_time < t.log_time
      )
    GROUP BY s.user_id, s.id
) AS per_start
GROUP BY per_start.user_id;
或者使用窗口函数:
-- Total minutes per user, built from the gap between each 'task' row and the
-- row logged immediately before it for the same user.
WITH combo AS (
    SELECT
        user_id,
        activity,
        log_time,
        -- timestamp of the user's previous row; NULL on the user's first row
        LAG(log_time) OVER (PARTITION BY user_id ORDER BY log_time) AS last_log_time
    FROM activities
)
SELECT
    user_id,
    SUM(TIMESTAMPDIFF(MINUTE, last_log_time, log_time)) AS total_time
FROM combo
WHERE activity = 'task'
  -- count only gaps inside one calendar day; a NULL last_log_time also fails this test
  AND DATE(log_time) = DATE(last_log_time)
GROUP BY user_id;
您可以使用 SUM() 窗口函数为每个会话分配一个编号,然后再进行聚合:
-- Total minutes per user via a running count of 'start' rows.
WITH numbered AS (
    -- grp = number of 'start' rows seen so far for this user on this day
    SELECT
        user_id,
        log_time,
        SUM(activity = 'start') OVER (PARTITION BY user_id, DATE(log_time) ORDER BY log_time) AS grp
    FROM activities
),
session_spans AS (
    -- one row per (user, day, grp): minutes between its first and last row
    SELECT
        user_id,
        TIMESTAMPDIFF(MINUTE, MIN(log_time), MAX(log_time)) AS span_minutes
    FROM numbered
    WHERE grp > 0    -- skip rows logged before the day's first 'start'
    GROUP BY user_id, DATE(log_time), grp
)
SELECT
    user_id,
    SUM(span_minutes) AS total_time
FROM session_spans
GROUP BY user_id;
参见demo。
架构和插入语句:
-- Activity log used by the query below: one row per logged event.
CREATE TABLE activities (
    id       INT         NOT NULL PRIMARY KEY,    -- row identifier
    user_id  INT         NULL,                    -- user the event belongs to
    activity VARCHAR(45),                         -- event type; the sample data uses 'start' and 'task'
    log_time DATETIME    NOT NULL DEFAULT NOW()   -- when the event was logged
) ENGINE = InnoDB;
-- Sample rows: four sessions for user 1 and one session for user 2.
INSERT INTO activities
    (id, user_id, activity, log_time)
VALUES
    -- user 1, 12 Oct, morning session
    (6,  1, 'start', '2021-10-12 10:00:00'),
    (2,  1, 'task',  '2021-10-12 10:10:00'),
    (7,  1, 'task',  '2021-10-12 10:40:00'),
    -- user 1, 12 Oct, afternoon session
    (3,  1, 'start', '2021-10-12 16:00:00'),
    (1,  1, 'task',  '2021-10-12 16:10:00'),
    -- user 1, 14 Oct, session with no tasks
    (9,  1, 'start', '2021-10-14 08:00:00'),
    -- user 1, 16 Oct
    (10, 1, 'start', '2021-10-16 09:00:00'),
    (4,  1, 'task',  '2021-10-16 09:10:00'),
    -- user 2, 12 Oct
    (8,  2, 'start', '2021-10-12 10:00:00'),
    (5,  2, 'task',  '2021-10-12 10:20:00');
查询:
-- Total minutes per user.
-- Rows are numbered by a running count of 'start' rows per user AND per day,
-- so a session never extends across days, and rows logged before the day's
-- first 'start' (session_no = 0) are not treated as a session.
WITH session_spans AS (
    SELECT
        user_id,
        session_day,
        session_no,
        -- session length = last row minus first row (the 'start') in minutes
        TIMESTAMPDIFF(MINUTE, MIN(log_time), MAX(log_time)) AS time_diff
    FROM (
        SELECT
            user_id,
            log_time,
            DATE(log_time) AS session_day,
            SUM(CASE WHEN activity = 'start' THEN 1 ELSE 0 END)
                OVER (PARTITION BY user_id, DATE(log_time) ORDER BY log_time) AS session_no
        FROM activities
    ) AS numbered
    WHERE session_no > 0
    GROUP BY user_id, session_day, session_no
)
SELECT
    user_id,
    SUM(time_diff) AS total_time
FROM session_spans
GROUP BY user_id;
输出:
user_id | total_time |
---|---|
1 | 60 |
2 | 20 |
db<>fiddle here
您可以按每个 user 和 day 分区,使用 LAG() 窗口函数计算每个 task 活动行与其前一行之间相差的分钟数,例如:
-- Total minutes per user: for every non-'start' row, add the minutes elapsed
-- since the previous row of the same user on the same day.
SELECT
    user_id,
    -- COALESCE makes a row with no earlier same-day row contribute 0 minutes
    SUM(TIMESTAMPDIFF(MINUTE, COALESCE(lg, log_time), log_time)) AS total_time
FROM (
    SELECT
        a.user_id,
        a.activity,
        a.log_time,
        LAG(a.log_time) OVER (PARTITION BY a.user_id, DATE(a.log_time)
                              ORDER BY a.log_time) AS lg
    FROM activities AS a
) AS aa
WHERE activity <> 'start'
GROUP BY user_id;