检索连续行之间具有最小时间间隔的 ID
Retrieve IDs with a minimum time gap between consecutive rows
我在 Postgres 9.3 中有以下 event
table:
-- Event log table used by all queries below.
CREATE TABLE event (
    event_id   integer PRIMARY KEY,  -- unique identifier of the event
    user_id    integer,              -- user the event belongs to
    event_type varchar,              -- kind of event, e.g. 'signIn', 'browse', 'convert'
    event_time timestamptz           -- point in time of the event
);
我的目标是检索所有这样的 user_id:其连续事件之间(或其最后一次事件与当前时间之间)存在至少 30 天的间隔。另一个复杂之处在于,我只想要那些间隔出现在用户执行某个 event_type 为 'convert' 的事件之后的用户。怎样才能轻松做到?
event
table 中的一些示例数据可能如下所示:
-- Sample data covering four cases: no 'convert' event (user 1), a large gap
-- between events after 'convert' (user 2), only small gaps (user 3), and a
-- large gap between the last event and "today" (user 4).
-- Every event_id is distinct because event_id is the table's PRIMARY KEY.
INSERT INTO event (event_id, user_id, event_type, event_time)
VALUES
    (10, 1, 'signIn',  '2015-05-05 00:11'),
    (11, 1, 'browse',  '2015-05-05 00:12'),  -- no 'convert' event
    (20, 2, 'signIn',  '2015-06-07 02:35'),
    (21, 2, 'browse',  '2015-06-07 02:35'),
    (22, 2, 'convert', '2015-06-07 02:36'),  -- only 'convert' event
    (23, 2, 'signIn',  '2015-08-10 11:00'),  -- gap of >= 30 days
    (24, 2, 'signIn',  '2015-08-11 11:00'),
    (30, 3, 'convert', '2015-08-07 02:36'),  -- starting with 1st 'convert' event
    (31, 3, 'signIn',  '2015-08-07 02:36'),
    (32, 3, 'convert', '2015-08-08 02:36'),
    (33, 3, 'signIn',  '2015-08-12 11:00'),  -- all gaps below 30 days
    (34, 3, 'browse',  '2015-08-12 11:00'),  -- gap until today (2015-08-20) too small
    (40, 4, 'convert', '2015-05-07 02:36'),
    (41, 4, 'signIn',  '2015-05-12 11:00');  -- gap until today (2015-08-20) >= 30 days
预期结果:
user_id
--------
2
4
一种方法:
-- Users having at least one gap of 30 days or more between consecutive events
-- (or between their last event and now()), counting only events at or after
-- the user's first 'convert' event.
SELECT gaps.user_id
FROM (
    SELECT ev.user_id,
           -- lead() falls back to now() when there is no next row in the partition
           lead(ev.event_time, 1, now())
               OVER (PARTITION BY ev.user_id ORDER BY ev.event_time)
           - ev.event_time AS gap
    FROM (
        -- first 'convert' event per user; users without one drop out of the join
        SELECT user_id,
               min(event_time) AS first_time
        FROM event
        WHERE event_type = 'convert'
        GROUP BY user_id
    ) AS first_convert
    INNER JOIN event AS ev
        ON ev.user_id = first_convert.user_id
    WHERE ev.event_time >= first_convert.first_time
) AS gaps
WHERE gaps.gap >= interval '30 days'
GROUP BY gaps.user_id;
窗口函数 lead() 允许在没有"下一行"(next row)时提供一个默认值,这正好可以方便地满足您的附加要求:"or between their last event and the current time"(或最后一次事件与当前时间之间)。
索引
如果你的 table 很大,你至少应该在 (user_id, event_time)
上有一个索引:
-- Multicolumn index on (user_id, event_time).
CREATE INDEX event_user_time_idx
    ON event (user_id, event_time);
如果您经常这样做并且 event_type
'convert' 很少见,请添加另一个部分索引:
-- Partial index on (user_id, event_time) covering only the 'convert' rows.
CREATE INDEX event_user_time_convert_idx
    ON event (user_id, event_time)
    WHERE event_type = 'convert';
如果每个用户有许多事件,并且 30 天的间隔很常见(并非罕见情况),那么索引就变得更加重要。
试试这个 recursive CTE 以获得更好的性能:
-- Walks each user's events one step at a time, starting at the first 'convert'
-- event, and stops per user once a gap of at least 30 days has been found.
WITH RECURSIVE walk AS (
    (  -- parentheses required
        -- starting point: earliest 'convert' event per user, with a zero gap
        SELECT DISTINCT ON (ev.user_id)
               ev.user_id,
               ev.event_time,
               interval '0 days' AS gap
        FROM event AS ev
        WHERE ev.event_type = 'convert'
        ORDER BY ev.user_id, ev.event_time
    )
    UNION ALL
    SELECT prev.user_id,
           nxt.event_time,
           -- no later event: measure the gap until now() instead
           COALESCE(nxt.event_time, now()) - prev.event_time
    FROM walk AS prev
    LEFT JOIN LATERAL (
        -- the next later event of the same user
        SELECT later.event_time
        FROM event AS later
        WHERE later.user_id = prev.user_id
          AND later.event_time > prev.event_time
        ORDER BY later.event_time
        LIMIT 1
    ) AS nxt ON true  -- LEFT JOIN adds 1 row after the last event to consider the gap till "now"
    WHERE prev.event_time IS NOT NULL       -- stop after the extra "now" row
      AND prev.gap < interval '30 days'     -- stop once a big enough gap was found
)
SELECT user_id,
       event_time,
       gap
FROM walk
WHERE gap >= interval '30 days';
它的开销更大,但可以对每个用户在遇到第一个足够大的间隔时就停止。如果该间隔恰好是最后一次事件与当前时间之间的间隔,那么结果中的 event_time 为 NULL。
新 SQL Fiddle 具有更多揭示两个查询的测试数据。
这些相关答案中的详细解释:
- Optimize GROUP BY query to retrieve latest record per user
- Select first row in each GROUP BY group?
这是另一种方式,可能不像@Erwin 那样整洁,但所有步骤都分开了,因此很容易适应。
- include_today: 添加一个虚拟事件来指示当前日期。
- event_convert:为每个
user_id
计算事件convert
第一次出现的时间(本例中只有user_id = 2222
)
- event_row:为每个事件分配一个唯一的连续id。每个
user_id
从 1 个开始
- 最后一部分把这些结果连接起来,并使用连接条件 A.rnum = B.rnum + 1 将每个事件与其相邻事件配对,从而可以计算两者之间的时间差。
- 结果中还会同时显示构成该 30 days 间隔的两个事件,因此您可以检查这是否是您想要的结果。
.
-- Lists every pair of adjacent events (A earlier, B later) of the same user
-- that are at least 30 days apart, where A is at or after the user's first
-- 'convert' event. A dummy 'today' row per user makes the gap between the
-- last real event and the current date show up as well.
WITH include_today AS (
    -- One dummy row per user marking the current date.
    -- NOTE(review): relies on a users table that is not defined alongside
    -- the event table here -- confirm it exists and has one row per user.
    (SELECT 'xxxx'::text AS event_id,
            user_id,
            'today'      AS event_type,
            current_date AS event_time
     FROM users)
    UNION
    -- event_id is cast to text so both branches agree on the column type;
    -- without the cast the 'xxxx' placeholder is coerced to the integer type
    -- of event.event_id and the query fails.
    (SELECT event_id::text,
            user_id,
            event_type,
            event_time
     FROM event)
),
event_convert AS (
    -- Time of the first 'convert' event per user.
    SELECT user_id,
           MIN(event_time) AS min_time
    FROM event
    WHERE event_type = 'convert'
    GROUP BY user_id
),
event_row AS (
    -- Number each user's events from newest (rnum = 1) to oldest.
    SELECT event_id,
           user_id,
           event_type,
           event_time,
           row_number() OVER (PARTITION BY user_id ORDER BY event_time DESC) AS rnum
    FROM include_today
)
SELECT
    A.user_id,
    A.event_id   AS eventA,
    A.event_type AS typeA,
    A.event_time AS timeA,
    B.event_id   AS eventB,
    B.event_type AS typeB,
    B.event_time AS timeB,
    (B.event_time - A.event_time) AS days  -- an interval, despite the column name
FROM event_convert AS e
INNER JOIN event_row AS A
    ON e.user_id = A.user_id
   AND e.min_time <= A.event_time
INNER JOIN event_row AS B
    -- rnum counts backwards in time, so B is the event right after A
    ON A.rnum = B.rnum + 1
   AND A.user_id = B.user_id
WHERE
    -- "at least 30 days": the boundary value itself qualifies
    (B.event_time - A.event_time) >= interval '30 days'
ORDER BY A.user_id, A.event_time;
我在 Postgres 9.3 中有以下 event
table:
-- Event log table used by all queries below.
CREATE TABLE event (
    event_id   integer PRIMARY KEY,  -- unique identifier of the event
    user_id    integer,              -- user the event belongs to
    event_type varchar,              -- kind of event, e.g. 'signIn', 'browse', 'convert'
    event_time timestamptz           -- point in time of the event
);
我的目标是检索所有这样的 user_id:其连续事件之间(或其最后一次事件与当前时间之间)存在至少 30 天的间隔。另一个复杂之处在于,我只想要那些间隔出现在用户执行某个 event_type 为 'convert' 的事件之后的用户。怎样才能轻松做到?
event
table 中的一些示例数据可能如下所示:
-- Sample data covering four cases: no 'convert' event (user 1), a large gap
-- between events after 'convert' (user 2), only small gaps (user 3), and a
-- large gap between the last event and "today" (user 4).
-- Every event_id is distinct because event_id is the table's PRIMARY KEY.
INSERT INTO event (event_id, user_id, event_type, event_time)
VALUES
    (10, 1, 'signIn',  '2015-05-05 00:11'),
    (11, 1, 'browse',  '2015-05-05 00:12'),  -- no 'convert' event
    (20, 2, 'signIn',  '2015-06-07 02:35'),
    (21, 2, 'browse',  '2015-06-07 02:35'),
    (22, 2, 'convert', '2015-06-07 02:36'),  -- only 'convert' event
    (23, 2, 'signIn',  '2015-08-10 11:00'),  -- gap of >= 30 days
    (24, 2, 'signIn',  '2015-08-11 11:00'),
    (30, 3, 'convert', '2015-08-07 02:36'),  -- starting with 1st 'convert' event
    (31, 3, 'signIn',  '2015-08-07 02:36'),
    (32, 3, 'convert', '2015-08-08 02:36'),
    (33, 3, 'signIn',  '2015-08-12 11:00'),  -- all gaps below 30 days
    (34, 3, 'browse',  '2015-08-12 11:00'),  -- gap until today (2015-08-20) too small
    (40, 4, 'convert', '2015-05-07 02:36'),
    (41, 4, 'signIn',  '2015-05-12 11:00');  -- gap until today (2015-08-20) >= 30 days
预期结果:
user_id
--------
2
4
一种方法:
-- Users having at least one gap of 30 days or more between consecutive events
-- (or between their last event and now()), counting only events at or after
-- the user's first 'convert' event.
SELECT gaps.user_id
FROM (
    SELECT ev.user_id,
           -- lead() falls back to now() when there is no next row in the partition
           lead(ev.event_time, 1, now())
               OVER (PARTITION BY ev.user_id ORDER BY ev.event_time)
           - ev.event_time AS gap
    FROM (
        -- first 'convert' event per user; users without one drop out of the join
        SELECT user_id,
               min(event_time) AS first_time
        FROM event
        WHERE event_type = 'convert'
        GROUP BY user_id
    ) AS first_convert
    INNER JOIN event AS ev
        ON ev.user_id = first_convert.user_id
    WHERE ev.event_time >= first_convert.first_time
) AS gaps
WHERE gaps.gap >= interval '30 days'
GROUP BY gaps.user_id;
窗口函数 lead() 允许在没有"下一行"(next row)时提供一个默认值,这正好可以方便地满足您的附加要求:"or between their last event and the current time"(或最后一次事件与当前时间之间)。
索引
如果你的 table 很大,你至少应该在 (user_id, event_time)
上有一个索引:
-- Multicolumn index on (user_id, event_time).
CREATE INDEX event_user_time_idx
    ON event (user_id, event_time);
如果您经常这样做并且 event_type
'convert' 很少见,请添加另一个部分索引:
-- Partial index on (user_id, event_time) covering only the 'convert' rows.
CREATE INDEX event_user_time_convert_idx
    ON event (user_id, event_time)
    WHERE event_type = 'convert';
如果每个用户有许多事件,并且 30 天的间隔很常见(并非罕见情况),那么索引就变得更加重要。
试试这个 recursive CTE 以获得更好的性能:
-- Walks each user's events one step at a time, starting at the first 'convert'
-- event, and stops per user once a gap of at least 30 days has been found.
WITH RECURSIVE walk AS (
    (  -- parentheses required
        -- starting point: earliest 'convert' event per user, with a zero gap
        SELECT DISTINCT ON (ev.user_id)
               ev.user_id,
               ev.event_time,
               interval '0 days' AS gap
        FROM event AS ev
        WHERE ev.event_type = 'convert'
        ORDER BY ev.user_id, ev.event_time
    )
    UNION ALL
    SELECT prev.user_id,
           nxt.event_time,
           -- no later event: measure the gap until now() instead
           COALESCE(nxt.event_time, now()) - prev.event_time
    FROM walk AS prev
    LEFT JOIN LATERAL (
        -- the next later event of the same user
        SELECT later.event_time
        FROM event AS later
        WHERE later.user_id = prev.user_id
          AND later.event_time > prev.event_time
        ORDER BY later.event_time
        LIMIT 1
    ) AS nxt ON true  -- LEFT JOIN adds 1 row after the last event to consider the gap till "now"
    WHERE prev.event_time IS NOT NULL       -- stop after the extra "now" row
      AND prev.gap < interval '30 days'     -- stop once a big enough gap was found
)
SELECT user_id,
       event_time,
       gap
FROM walk
WHERE gap >= interval '30 days';
它的开销更大,但可以对每个用户在遇到第一个足够大的间隔时就停止。如果该间隔恰好是最后一次事件与当前时间之间的间隔,那么结果中的 event_time 为 NULL。
新 SQL Fiddle 具有更多揭示两个查询的测试数据。
这些相关答案中的详细解释:
- Optimize GROUP BY query to retrieve latest record per user
- Select first row in each GROUP BY group?
这是另一种方式,可能不像@Erwin 那样整洁,但所有步骤都分开了,因此很容易适应。
- include_today:添加一个虚拟事件来表示当前日期。
- event_convert:为每个 user_id 计算 convert 事件第一次出现的时间(本例中只有 user_id = 2222)。
- event_row:为每个事件分配一个唯一的连续编号 rnum(按 event_time 降序),每个 user_id 都从 1 开始。
- 最后一部分把这些结果连接起来,并使用连接条件 A.rnum = B.rnum + 1 将每个事件与其相邻事件配对,从而可以计算两者之间的时间差。
- 结果中还会同时显示构成该 30 days 间隔的两个事件,因此您可以检查这是否是您想要的结果。
.
-- Lists every pair of adjacent events (A earlier, B later) of the same user
-- that are at least 30 days apart, where A is at or after the user's first
-- 'convert' event. A dummy 'today' row per user makes the gap between the
-- last real event and the current date show up as well.
WITH include_today AS (
    -- One dummy row per user marking the current date.
    -- NOTE(review): relies on a users table that is not defined alongside
    -- the event table here -- confirm it exists and has one row per user.
    (SELECT 'xxxx'::text AS event_id,
            user_id,
            'today'      AS event_type,
            current_date AS event_time
     FROM users)
    UNION
    -- event_id is cast to text so both branches agree on the column type;
    -- without the cast the 'xxxx' placeholder is coerced to the integer type
    -- of event.event_id and the query fails.
    (SELECT event_id::text,
            user_id,
            event_type,
            event_time
     FROM event)
),
event_convert AS (
    -- Time of the first 'convert' event per user.
    SELECT user_id,
           MIN(event_time) AS min_time
    FROM event
    WHERE event_type = 'convert'
    GROUP BY user_id
),
event_row AS (
    -- Number each user's events from newest (rnum = 1) to oldest.
    SELECT event_id,
           user_id,
           event_type,
           event_time,
           row_number() OVER (PARTITION BY user_id ORDER BY event_time DESC) AS rnum
    FROM include_today
)
SELECT
    A.user_id,
    A.event_id   AS eventA,
    A.event_type AS typeA,
    A.event_time AS timeA,
    B.event_id   AS eventB,
    B.event_type AS typeB,
    B.event_time AS timeB,
    (B.event_time - A.event_time) AS days  -- an interval, despite the column name
FROM event_convert AS e
INNER JOIN event_row AS A
    ON e.user_id = A.user_id
   AND e.min_time <= A.event_time
INNER JOIN event_row AS B
    -- rnum counts backwards in time, so B is the event right after A
    ON A.rnum = B.rnum + 1
   AND A.user_id = B.user_id
WHERE
    -- "at least 30 days": the boundary value itself qualifies
    (B.event_time - A.event_time) >= interval '30 days'
ORDER BY A.user_id, A.event_time;