计算序列中上升值的边界
Calculating the boundaries of rising values in a sequence
我有一组记录,包含连续的 id 和波动的压力值 druck。我的目标是找出压力在哪些值之间持续上升,并得到每个上升区间的最小和最大 id。我已经有了一个使用经典 SQL 的解决方案,但我想找到更高效的方案。
以下是一些典型条目:
-- Sample data for the question: ids 1..12, each with one pressure value (druck).
create table produktion ( id int, druck numeric (5, 2 ) );
-- Target columns are named explicitly so the insert does not depend on column order.
insert into produktion ( id, druck )
values ( 1, 1.35), ( 2, 1.37), ( 3, 1.45), ( 4, 1.48), ( 5, 1.51), ( 6, 1.39),
       ( 7, 1.53), ( 8, 1.55), ( 9, 1.62), (10, 1.39), (11, 1.32), (12, 1.28);
我期望这样的结果:
========================
| erste_id | letzte_id |
|=======================
| 1 | 5 |
| 6 | 9 |
========================
这是当前使用的查询:
-- Returns every pair (erste_id, letzte_id) that bounds a maximal run of strictly
-- rising druck values. Neighbouring rows are matched through id - 1 / id + 1.
SELECT run_start.id AS erste_id,   -- first id of the run
       run_end.id   AS letzte_id   -- last id of the run
FROM produktion AS run_start
CROSS JOIN produktion AS run_end
WHERE run_start.id < run_end.id
  AND NOT EXISTS (
          SELECT *
          FROM produktion AS cur
          CROSS JOIN produktion AS prev
          WHERE
                -- a step inside the candidate range that does not rise
                (     cur.id BETWEEN run_start.id + 1 AND run_end.id
                  AND prev.id = cur.id - 1
                  AND cur.druck <= prev.druck )
                -- the row just before the range is lower, so the run starts earlier
             OR ( cur.id = run_start.id - 1 AND cur.druck < run_start.druck )
                -- the row just after the range is higher, so the run ends later
             OR ( cur.id = run_end.id + 1 AND cur.druck > run_end.druck ) )
更新
我忘了说明:遇到相等的值时,上升序列视为中断。
更新 2
我对 FatFreddy 的查询做了一些小改动,使其符合我的要求。
-- Returns one row per run of strictly rising druck values:
-- anfang = id where the run starts, ende = id where it ends.
WITH
-- Look up each row's previous and next druck (by id) once. The row's own druck
-- is the default at both ends of the table, so a missing neighbour compares as equal.
neighbours AS (
   SELECT id,
          druck,
          lag( druck, 1, druck )  OVER ( ORDER BY id ) AS prev_druck,
          lead( druck, 1, druck ) OVER ( ORDER BY id ) AS next_druck
     FROM produktion ),
-- Mark run boundaries. Rows in the middle of a rise, and rows that belong to no
-- rise at all, match neither branch and get NULL.
find_boundaries AS (
   SELECT id,
          CASE WHEN prev_druck <  druck AND druck >= next_druck THEN 'end_run'
               WHEN prev_druck >= druck AND druck <  next_druck THEN 'start_run'
          END AS row_type
     FROM neighbours ),
-- Number the run starts in id order ...
start_boundary AS (
   SELECT id,
          row_number() OVER ( ORDER BY id ) AS correlated_start_row
     FROM find_boundaries
    WHERE row_type = 'start_run' ),
-- ... and the run ends likewise, so the n-th start pairs with the n-th end.
end_boundary AS (
   SELECT id,
          row_number() OVER ( ORDER BY id ) AS correlated_end_row
     FROM find_boundaries
    WHERE row_type = 'end_run' )
SELECT s.id AS anfang,
       e.id AS ende
  FROM start_boundary AS s
 INNER JOIN end_boundary AS e
    ON s.correlated_start_row = e.correlated_end_row
 ORDER BY anfang
对你提出的查询执行 EXPLAIN ANALYZE(12 个条目):
Planning time: 0.184 ms
Execution time: 3.525 ms
我可以给你一个计划时间更多但执行时间更少的查询
Planning time: 0.290 ms
Execution time: 0.269 ms
100 个条目
yours
Planning time: 0.193 ms
Execution time: 10457.269 ms
mine
Planning time: 0.342 ms
Execution time: 1.175 ms
1000 个条目
yours
no result after 5 minutes
mine
Planning time: 0.343 ms
Execution time: 5.866 ms
1000000 个条目
yours
no result after 5 minutes
mine
Planning time: 0.348 ms
Execution time: 5217.038 ms
我的查询用散列连接取代了部分嵌套循环,并使用了一些窗口(window)函数:
https://www.postgresql.org/docs/9.5/static/functions-window.html
我的查询看起来像这样:
-- Finds every run of strictly rising druck values and returns the id where the
-- run starts (first) and the id where it ends (last).
with temp_neighbours as
(
    -- Pair each row with the druck of the row before and after it (by id).
    -- The row's own druck is the default at either end of the table, so the first
    -- row can only start a run and the last row can only end one. This avoids a
    -- hard-coded "id = 1" first-row case and a 0 default, which misclassify rows
    -- when ids do not start at 1 or druck is not positive.
    select
        id,
        druck,
        lag(druck, 1, druck) over (order by id) as beforelag,
        lead(druck, 1, druck) over (order by id) as afterlead
    from produktion
),
temp_flow as
(
    -- Classify each row by how druck moves through it; an equal neighbour breaks a run.
    select
        id,
        case
            when beforelag < druck and afterlead > druck then 'raise'
            when beforelag < druck and afterlead <= druck then 'end raise'
            when beforelag >= druck and afterlead > druck then 'start raise'
            else 'fall'
        end as way,
        beforelag,
        druck,
        afterlead
    from temp_neighbours
),
temp_start as
(
    -- Number the run starts in id order. The ordering must be inside the window:
    -- a query-level ORDER BY does not define the order row_number() counts in.
    select
        id,
        row_number() over (order by id) as xrow
    from temp_flow
    where temp_flow.way = 'start raise'
),
temp_end as
(
    -- Number the run ends the same way, so the n-th start pairs with the n-th end.
    select
        id,
        row_number() over (order by id) as xrow
    from temp_flow
    where temp_flow.way = 'end raise'
)
select
    temp_start.id as first,
    temp_end.id as last
from
    temp_start
    inner join temp_end on temp_end.xrow = temp_start.xrow
order by temp_start.id
你的问题中没有说明 druck 相等时应如何处理,所以你可能需要在某些地方用 >= 代替 >。
你也可以用数字代替 'start raise' 这样的文本标签;我使用文本只是为了提高可读性,而数字比较应该比文本比较更快。
祝您在系统上使用大数据测试愉快。
编辑:修正了结果——增加了更多 CASE 分支,用于处理一些特殊情况。
我有一组记录,包含连续的 id 和波动的压力值 druck。我的目标是找出压力在哪些值之间持续上升,并得到每个上升区间的最小和最大 id。我已经有了一个使用经典 SQL 的解决方案,但我想找到更高效的方案。
以下是一些典型条目:
-- Sample data for the question: ids 1..12, each with one pressure value (druck).
create table produktion ( id int, druck numeric (5, 2 ) );
-- Target columns are named explicitly so the insert does not depend on column order.
insert into produktion ( id, druck )
values ( 1, 1.35), ( 2, 1.37), ( 3, 1.45), ( 4, 1.48), ( 5, 1.51), ( 6, 1.39),
       ( 7, 1.53), ( 8, 1.55), ( 9, 1.62), (10, 1.39), (11, 1.32), (12, 1.28);
我期望这样的结果:
========================
| erste_id | letzte_id |
|=======================
| 1 | 5 |
| 6 | 9 |
========================
这是当前使用的查询:
-- Returns every pair (erste_id, letzte_id) that bounds a maximal run of strictly
-- rising druck values. Neighbouring rows are matched through id - 1 / id + 1.
SELECT run_start.id AS erste_id,   -- first id of the run
       run_end.id   AS letzte_id   -- last id of the run
FROM produktion AS run_start
CROSS JOIN produktion AS run_end
WHERE run_start.id < run_end.id
  AND NOT EXISTS (
          SELECT *
          FROM produktion AS cur
          CROSS JOIN produktion AS prev
          WHERE
                -- a step inside the candidate range that does not rise
                (     cur.id BETWEEN run_start.id + 1 AND run_end.id
                  AND prev.id = cur.id - 1
                  AND cur.druck <= prev.druck )
                -- the row just before the range is lower, so the run starts earlier
             OR ( cur.id = run_start.id - 1 AND cur.druck < run_start.druck )
                -- the row just after the range is higher, so the run ends later
             OR ( cur.id = run_end.id + 1 AND cur.druck > run_end.druck ) )
更新
我忘了说明:遇到相等的值时,上升序列视为中断。
更新 2
我对 FatFreddy 的查询做了一些小改动,使其符合我的要求。
-- Returns one row per run of strictly rising druck values:
-- anfang = id where the run starts, ende = id where it ends.
WITH
-- Look up each row's previous and next druck (by id) once. The row's own druck
-- is the default at both ends of the table, so a missing neighbour compares as equal.
neighbours AS (
   SELECT id,
          druck,
          lag( druck, 1, druck )  OVER ( ORDER BY id ) AS prev_druck,
          lead( druck, 1, druck ) OVER ( ORDER BY id ) AS next_druck
     FROM produktion ),
-- Mark run boundaries. Rows in the middle of a rise, and rows that belong to no
-- rise at all, match neither branch and get NULL.
find_boundaries AS (
   SELECT id,
          CASE WHEN prev_druck <  druck AND druck >= next_druck THEN 'end_run'
               WHEN prev_druck >= druck AND druck <  next_druck THEN 'start_run'
          END AS row_type
     FROM neighbours ),
-- Number the run starts in id order ...
start_boundary AS (
   SELECT id,
          row_number() OVER ( ORDER BY id ) AS correlated_start_row
     FROM find_boundaries
    WHERE row_type = 'start_run' ),
-- ... and the run ends likewise, so the n-th start pairs with the n-th end.
end_boundary AS (
   SELECT id,
          row_number() OVER ( ORDER BY id ) AS correlated_end_row
     FROM find_boundaries
    WHERE row_type = 'end_run' )
SELECT s.id AS anfang,
       e.id AS ende
  FROM start_boundary AS s
 INNER JOIN end_boundary AS e
    ON s.correlated_start_row = e.correlated_end_row
 ORDER BY anfang
对你提出的查询执行 EXPLAIN ANALYZE(12 个条目):
Planning time: 0.184 ms
Execution time: 3.525 ms
我可以给你一个计划时间更多但执行时间更少的查询
Planning time: 0.290 ms
Execution time: 0.269 ms
100 个条目
yours
Planning time: 0.193 ms
Execution time: 10457.269 ms
mine
Planning time: 0.342 ms
Execution time: 1.175 ms
1000 个条目
yours
no result after 5 minutes
mine
Planning time: 0.343 ms
Execution time: 5.866 ms
1000000 个条目
yours
no result after 5 minutes
mine
Planning time: 0.348 ms
Execution time: 5217.038 ms
我的查询用散列连接取代了部分嵌套循环,并使用了一些窗口(window)函数(参见 https://www.postgresql.org/docs/9.5/static/functions-window.html )。我的查询看起来像这样:
-- Finds every run of strictly rising druck values and returns the id where the
-- run starts (first) and the id where it ends (last).
with temp_neighbours as
(
    -- Pair each row with the druck of the row before and after it (by id).
    -- The row's own druck is the default at either end of the table, so the first
    -- row can only start a run and the last row can only end one. This avoids a
    -- hard-coded "id = 1" first-row case and a 0 default, which misclassify rows
    -- when ids do not start at 1 or druck is not positive.
    select
        id,
        druck,
        lag(druck, 1, druck) over (order by id) as beforelag,
        lead(druck, 1, druck) over (order by id) as afterlead
    from produktion
),
temp_flow as
(
    -- Classify each row by how druck moves through it; an equal neighbour breaks a run.
    select
        id,
        case
            when beforelag < druck and afterlead > druck then 'raise'
            when beforelag < druck and afterlead <= druck then 'end raise'
            when beforelag >= druck and afterlead > druck then 'start raise'
            else 'fall'
        end as way,
        beforelag,
        druck,
        afterlead
    from temp_neighbours
),
temp_start as
(
    -- Number the run starts in id order. The ordering must be inside the window:
    -- a query-level ORDER BY does not define the order row_number() counts in.
    select
        id,
        row_number() over (order by id) as xrow
    from temp_flow
    where temp_flow.way = 'start raise'
),
temp_end as
(
    -- Number the run ends the same way, so the n-th start pairs with the n-th end.
    select
        id,
        row_number() over (order by id) as xrow
    from temp_flow
    where temp_flow.way = 'end raise'
)
select
    temp_start.id as first,
    temp_end.id as last
from
    temp_start
    inner join temp_end on temp_end.xrow = temp_start.xrow
order by temp_start.id
你的问题中没有说明 druck 相等时应如何处理,所以你可能需要在某些地方用 >= 代替 >。你也可以用数字代替 'start raise' 这样的文本标签;我使用文本只是为了提高可读性,而数字比较应该比文本比较更快。
祝您在系统上使用大数据测试愉快。
编辑:修正了结果——增加了更多 CASE 分支,用于处理一些特殊情况。