查找连续值长度小于阈值的记录
Find records where length of consecutive values is less than threshold
这是table
timestamp | tracker_id | position
-------------------------------+------------+----------
2020-02-01 16:23:45.571429+00 | 15 | 1
2020-02-01 16:23:45.857143+00 | 11 | 1
2020-02-01 16:23:46.428571+00 | 15 | 1
2020-02-01 16:23:46.714286+00 | 11 | 2
2020-02-01 16:23:54.714288+00 | 15 | 2
2020-02-01 16:23:55+00 | 15 | 1
2020-02-01 16:23:55.285714+00 | 11 | 1
2020-02-01 16:23:55.571429+00 | 15 | 1
2020-02-01 16:23:55.857143+00 | 15 | 1
2020-02-01 16:23:56.428571+00 | 11 | 1
2020-02-01 16:23:56.714286+00 | 15 | 1
2020-02-01 16:23:57+00 | 11 | 2
2020-02-01 16:23:58.142857+00 | 11 | 2
2020-02-01 16:23:58.428571+00 | 15 | 1
2020-02-01 16:23:58.714286+00 | 11 | 2
2020-02-01 16:23:59+00 | 11 | 1
2020-02-01 16:23:59.285714+00 | 15 | 1
2020-02-01 16:23:59.295714+00 | 10 | 1
2020-02-01 16:23:59.305714+00 | 10 | 2
2020-02-01 16:23:59.385714+00 | 10 | 2
2020-02-01 16:23:59.485714+00 | 10 | 3
Threshold
= 3
这里,
position
of tracker_id
:15
由 1 -> 1 -> 2 -> 1 -> 1 -> 1 -> 2 -> 2 -> 1
position
of tracker_id
:11
从 1 -> 2 -> 1 -> 1 -> 2 -> 2 -> 2 -> 1
变化
position
of tracker_id
:10
从 1 -> 2 -> 2 -> 3
更改
对于tracker_id
:15
1
之间连续2
的最大长度为<
threshold
对于tracker_id
:11
1
之间连续2
的最大长度为=
threshold
对于tracker_id
:10
连续 2
未包含在 1
之间
输出应该是tracker_id
:15
因为position
中连续2
的长度是
小于 threshold
如何使用查询来做到这一点?
这是一个缺口和孤岛问题。
您可以从使用行号之间的差异构建相邻记录组开始。然后,您可以聚合每个组,并使用 lag 和 lead 来恢复周围组的位置。最后一步是应用过滤逻辑。
select tracker_id
from (
select
tracker_id,
position,
count(*) cnt,
lag(position) over(partition by tracker_id order by max(timestamp)) lag_position,
lead(position) over(partition by tracker_id order by max(timestamp)) lead_position
from (
select
t.*,
row_number() over(partition by tracker_id order by timestamp) rn1,
row_number() over(partition by tracker_id, position order by timestamp) rn2
from mytable t
) t
group by tracker_id, position, rn1 - rn2
) t
where
position = 2
and lag_position = 1
and lead_position = 1
group by tracker_id
having max(cnt) < 3
这个 demo on DB Fiddle 与您的样本数据产生:
| tracker_id |
| ---------: |
| 15 |
没有必要将其作为间隙和孤岛问题来处理。只需使用 window 函数:
select tracker_id
from (select t.*,
min(position) over (partition by tracker_id
order by timestamp
rows between 2 preceding and current row
) as min_pos_3,
max(position) over (partition by tracker_id
order by timestamp
rows between 2 preceding and current row
) as max_pos_3
from t
) t
group by tracker_id
having count(*) filter (where min_pos_3 = max_pos_3) = 0
这只是查看每个跟踪器超过三个 3 的最小值和最大值。它 returns 只有值总是不同的行。
我稍微修改了您的输入 table(添加了 tracker_id=9 用于测试)。
Window 函数可以解决这个问题,例如:row_number(), lead
select x.*
into #temp1
from
(
select ' 2020-02-01 16:23:45.571429+00 ' as time_stamp, 9 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:45.857143+00 ' as time_stamp, 9 as tracker_id, 3 as position UNION ALL
select ' 2020-02-01 16:23:46.428571+00 ' as time_stamp, 9 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:24:45.571429+00 ' as time_stamp, 9 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:25:45.857143+00 ' as time_stamp, 9 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:26:45.857143+00 ' as time_stamp, 9 as tracker_id, 3 as position UNION ALL
select ' 2020-02-01 16:27:45.857143+00 ' as time_stamp, 9 as tracker_id, 3 as position UNION ALL
select ' 2020-02-01 16:28:46.428571+00 ' as time_stamp, 9 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:45.571429+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:45.857143+00 ' as time_stamp, 11 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:46.428571+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:46.714286+00 ' as time_stamp, 11 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:54.714288+00 ' as time_stamp, 15 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:55+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:55.285714+00 ' as time_stamp, 11 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:55.571429+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:55.857143+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:56.428571+00 ' as time_stamp, 11 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:56.714286+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:57+00 ' as time_stamp, 11 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:58.142857+00 ' as time_stamp, 11 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:58.428571+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:58.714286+00 ' as time_stamp, 11 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:59+00 ' as time_stamp, 11 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:59.285714+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:59.295714+00 ' as time_stamp, 10 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:59.305714+00 ' as time_stamp, 10 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:59.385714+00 ' as time_stamp, 10 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:59.485714+00 ' as time_stamp, 10 as tracker_id, 3 as position) x
;
select
*,
ROW_NUMBER() OVER(PARTITION BY tracker_id ORDER BY time_stamp) tracker_id_rownumber,
case when position=1 then 1 else 0 end is_pos0_equals_1, --is current row position=1?
case when (LEAD(position, 1) OVER (PARTITION BY tracker_id ORDER BY time_stamp))=2 then 1 else 0 end is_pos1_equals_2, --is next row position=2?
case when (LEAD(position, 2) OVER (PARTITION BY tracker_id ORDER BY time_stamp))=2 then 1 else 0 end is_pos2_equals_2, --next next row..
case when (LEAD(position, 3) OVER (PARTITION BY tracker_id ORDER BY time_stamp))=2 then 1 else 0 end is_pos3_equals_2 --next next next row..
into #temp2
from #temp1
;
--leave only trackers with intervals of type {1, ... ,1}
select a.tracker_id, a.tracker_id_rownumber interval_start, min(b.tracker_id_rownumber) interval_end
into #temp3
from #temp2 a
inner join #temp2 b on (a.tracker_id=b.tracker_id and a.tracker_id_rownumber<b.tracker_id_rownumber)
where a.position=1 and b.position=1
group by a.tracker_id, a.tracker_id_rownumber
--check each 3-elements subset (are there any triples of consecutive '2'?) and mark triples of consecutive '2'
select a.*,b.tracker_id tracker_id_,
case when b.interval_end - b.interval_start>=4 then
case when (a.is_pos1_equals_2=1 and a.is_pos2_equals_2=1 and a.is_pos3_equals_2=1) then 0 else 1 end
else
1
end 'is_less_than_threshold'
into #temp4
from #temp2 a
inner join #temp3 b on a.tracker_id=b.tracker_id and a.tracker_id_rownumber between b.interval_start and b.interval_end-1
--output trackers
select a.tracker_id, min(a.is_less_than_threshold) is_ok
from #temp4 a
group by a.tracker_id
having min(a.is_less_than_threshold)=1
输出
tracker_id | is_ok
9 | 1
15 | 1
这是table
timestamp | tracker_id | position
-------------------------------+------------+----------
2020-02-01 16:23:45.571429+00 | 15 | 1
2020-02-01 16:23:45.857143+00 | 11 | 1
2020-02-01 16:23:46.428571+00 | 15 | 1
2020-02-01 16:23:46.714286+00 | 11 | 2
2020-02-01 16:23:54.714288+00 | 15 | 2
2020-02-01 16:23:55+00 | 15 | 1
2020-02-01 16:23:55.285714+00 | 11 | 1
2020-02-01 16:23:55.571429+00 | 15 | 1
2020-02-01 16:23:55.857143+00 | 15 | 1
2020-02-01 16:23:56.428571+00 | 11 | 1
2020-02-01 16:23:56.714286+00 | 15 | 1
2020-02-01 16:23:57+00 | 11 | 2
2020-02-01 16:23:58.142857+00 | 11 | 2
2020-02-01 16:23:58.428571+00 | 15 | 1
2020-02-01 16:23:58.714286+00 | 11 | 2
2020-02-01 16:23:59+00 | 11 | 1
2020-02-01 16:23:59.285714+00 | 15 | 1
2020-02-01 16:23:59.295714+00 | 10 | 1
2020-02-01 16:23:59.305714+00 | 10 | 2
2020-02-01 16:23:59.385714+00 | 10 | 2
2020-02-01 16:23:59.485714+00 | 10 | 3
Threshold
= 3
这里,position
of tracker_id
:15
由 1 -> 1 -> 2 -> 1 -> 1 -> 1 -> 2 -> 2 -> 1
position
of tracker_id
:11
从 1 -> 2 -> 1 -> 1 -> 2 -> 2 -> 2 -> 1
position
of tracker_id
:10
从 1 -> 2 -> 2 -> 3
对于tracker_id
:15
1
之间连续2
的最大长度为<
threshold
对于tracker_id
:11
1
之间连续2
的最大长度为=
threshold
对于tracker_id
:10
连续 2
未包含在 1
输出应该是tracker_id
:15
因为position
中连续2
的长度是
小于 threshold
如何使用查询来做到这一点?
这是一个缺口和孤岛问题。
您可以从使用行号之间的差异构建相邻记录组开始。然后,您可以聚合每个组,并使用 lag 和 lead 来恢复周围组的位置。最后一步是应用过滤逻辑。
select tracker_id
from (
select
tracker_id,
position,
count(*) cnt,
lag(position) over(partition by tracker_id order by max(timestamp)) lag_position,
lead(position) over(partition by tracker_id order by max(timestamp)) lead_position
from (
select
t.*,
row_number() over(partition by tracker_id order by timestamp) rn1,
row_number() over(partition by tracker_id, position order by timestamp) rn2
from mytable t
) t
group by tracker_id, position, rn1 - rn2
) t
where
position = 2
and lag_position = 1
and lead_position = 1
group by tracker_id
having max(cnt) < 3
这个 demo on DB Fiddle 与您的样本数据产生:
| tracker_id | | ---------: | | 15 |
没有必要将其作为间隙和孤岛问题来处理。只需使用 window 函数:
select tracker_id
from (select t.*,
min(position) over (partition by tracker_id
order by timestamp
rows between 2 preceding and current row
) as min_pos_3,
max(position) over (partition by tracker_id
order by timestamp
rows between 2 preceding and current row
) as max_pos_3
from t
) t
group by tracker_id
having count(*) filter (where min_pos_3 = max_pos_3) = 0
这只是查看每个跟踪器超过三个 3 的最小值和最大值。它 returns 只有值总是不同的行。
我稍微修改了您的输入 table(添加了 tracker_id=9 用于测试)。
Window 函数可以解决这个问题,例如:row_number(), lead
select x.*
into #temp1
from
(
select ' 2020-02-01 16:23:45.571429+00 ' as time_stamp, 9 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:45.857143+00 ' as time_stamp, 9 as tracker_id, 3 as position UNION ALL
select ' 2020-02-01 16:23:46.428571+00 ' as time_stamp, 9 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:24:45.571429+00 ' as time_stamp, 9 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:25:45.857143+00 ' as time_stamp, 9 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:26:45.857143+00 ' as time_stamp, 9 as tracker_id, 3 as position UNION ALL
select ' 2020-02-01 16:27:45.857143+00 ' as time_stamp, 9 as tracker_id, 3 as position UNION ALL
select ' 2020-02-01 16:28:46.428571+00 ' as time_stamp, 9 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:45.571429+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:45.857143+00 ' as time_stamp, 11 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:46.428571+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:46.714286+00 ' as time_stamp, 11 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:54.714288+00 ' as time_stamp, 15 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:55+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:55.285714+00 ' as time_stamp, 11 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:55.571429+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:55.857143+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:56.428571+00 ' as time_stamp, 11 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:56.714286+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:57+00 ' as time_stamp, 11 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:58.142857+00 ' as time_stamp, 11 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:58.428571+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:58.714286+00 ' as time_stamp, 11 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:59+00 ' as time_stamp, 11 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:59.285714+00 ' as time_stamp, 15 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:59.295714+00 ' as time_stamp, 10 as tracker_id, 1 as position UNION ALL
select ' 2020-02-01 16:23:59.305714+00 ' as time_stamp, 10 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:59.385714+00 ' as time_stamp, 10 as tracker_id, 2 as position UNION ALL
select ' 2020-02-01 16:23:59.485714+00 ' as time_stamp, 10 as tracker_id, 3 as position) x
;
select
*,
ROW_NUMBER() OVER(PARTITION BY tracker_id ORDER BY time_stamp) tracker_id_rownumber,
case when position=1 then 1 else 0 end is_pos0_equals_1, --is current row position=1?
case when (LEAD(position, 1) OVER (PARTITION BY tracker_id ORDER BY time_stamp))=2 then 1 else 0 end is_pos1_equals_2, --is next row position=2?
case when (LEAD(position, 2) OVER (PARTITION BY tracker_id ORDER BY time_stamp))=2 then 1 else 0 end is_pos2_equals_2, --next next row..
case when (LEAD(position, 3) OVER (PARTITION BY tracker_id ORDER BY time_stamp))=2 then 1 else 0 end is_pos3_equals_2 --next next next row..
into #temp2
from #temp1
;
--leave only trackers with intervals of type {1, ... ,1}
select a.tracker_id, a.tracker_id_rownumber interval_start, min(b.tracker_id_rownumber) interval_end
into #temp3
from #temp2 a
inner join #temp2 b on (a.tracker_id=b.tracker_id and a.tracker_id_rownumber<b.tracker_id_rownumber)
where a.position=1 and b.position=1
group by a.tracker_id, a.tracker_id_rownumber
--check each 3-elements subset (are there any triples of consecutive '2'?) and mark triples of consecutive '2'
select a.*,b.tracker_id tracker_id_,
case when b.interval_end - b.interval_start>=4 then
case when (a.is_pos1_equals_2=1 and a.is_pos2_equals_2=1 and a.is_pos3_equals_2=1) then 0 else 1 end
else
1
end 'is_less_than_threshold'
into #temp4
from #temp2 a
inner join #temp3 b on a.tracker_id=b.tracker_id and a.tracker_id_rownumber between b.interval_start and b.interval_end-1
--output trackers
select a.tracker_id, min(a.is_less_than_threshold) is_ok
from #temp4 a
group by a.tracker_id
having min(a.is_less_than_threshold)=1
输出
tracker_id | is_ok
9 | 1
15 | 1