How to optimize an SQL query that uses window functions
This question is related to another one. I have a table that contains power readings for devices, and I need to calculate the consumption over a given time span and return the 10 devices that consumed the most power. I generated 192 devices and 7,742,208 measurement records (40,324 per device), which is roughly the amount of data the devices produce in one month.
With this amount of data my current query takes more than 40 seconds to execute, which is far too long, because the time span, the number of devices, and the number of measurements may all be much higher. Should I try an approach other than lag() OVER (PARTITION BY ...) to solve this, and what other optimizations could be made? I would really appreciate suggestions with code examples.
PostgreSQL version 9.4
Query with example values:
SELECT
    t.device_id,
    sum(len_y * extract(epoch from len_x)) AS total_consumption
FROM (
    SELECT
        m.id,
        m.device_id,
        m.power_total,
        m.created_at,
        m.power_total + lag(m.power_total) OVER (
            PARTITION BY device_id
            ORDER BY m.created_at
        ) AS len_y,
        m.created_at - lag(m.created_at) OVER (
            PARTITION BY device_id
            ORDER BY m.created_at
        ) AS len_x
    FROM
        measurements AS m
    WHERE m.created_at BETWEEN '2015-07-30 13:05:24.403552+00'::timestamp
                           AND '2015-08-27 12:34:59.826837+00'::timestamp
) AS t
GROUP BY t.device_id
ORDER BY total_consumption DESC
LIMIT 10;
Table information:
    Column    |           Type           |                         Modifiers
--------------+--------------------------+------------------------------------------------------------
 id           | integer                  | not null default nextval('measurements_id_seq'::regclass)
 created_at   | timestamp with time zone | default timezone('utc'::text, now())
 power_total  | real                     |
 device_id    | integer                  | not null
Indexes:
    "measurements_pkey" PRIMARY KEY, btree (id)
    "measurements_device_id_idx" btree (device_id)
    "measurements_created_at_idx" btree (created_at)
Foreign-key constraints:
    "measurements_device_id_fkey" FOREIGN KEY (device_id) REFERENCES devices(id)
Query plan:
Limit  (cost=1317403.25..1317403.27 rows=10 width=24) (actual time=41077.091..41077.094 rows=10 loops=1)
  ->  Sort  (cost=1317403.25..1317403.73 rows=192 width=24) (actual time=41077.089..41077.092 rows=10 loops=1)
        Sort Key: (sum((((m.power_total + lag(m.power_total) OVER (?))) * date_part('epoch'::text, ((m.created_at - lag(m.created_at) OVER (?)))))))
        Sort Method: top-N heapsort  Memory: 25kB
        ->  GroupAggregate  (cost=1041700.67..1317399.10 rows=192 width=24) (actual time=25361.013..41076.562 rows=192 loops=1)
              Group Key: m.device_id
              ->  WindowAgg  (cost=1041700.67..1201314.44 rows=5804137 width=20) (actual time=25291.797..37839.727 rows=7742208 loops=1)
                    ->  Sort  (cost=1041700.67..1056211.02 rows=5804137 width=20) (actual time=25291.746..30699.993 rows=7742208 loops=1)
                          Sort Key: m.device_id, m.created_at
                          Sort Method: external merge  Disk: 257344kB
                          ->  Seq Scan on measurements m  (cost=0.00..151582.05 rows=5804137 width=20) (actual time=0.333..5112.851 rows=7742208 loops=1)
                                Filter: ((created_at >= '2015-07-30 13:05:24.403552'::timestamp without time zone) AND (created_at <= '2015-08-27 12:34:59.826837'::timestamp without time zone))
Planning time: 0.351 ms
Execution time: 41114.883 ms
Query to generate the test table and data:
CREATE TABLE measurements (
    id serial primary key,
    device_id integer,
    power_total real,
    created_at timestamp
);

INSERT INTO measurements (
    device_id,
    created_at,
    power_total
)
SELECT
    device_id,
    now() + (i * interval '1 minute'),
    random()*(50-1)+1
FROM (
    SELECT
        DISTINCT(device_id),
        generate_series(0,10) AS i
    FROM (
        SELECT generate_series(1,5) AS device_id
    ) AS dev_ids
) AS gen_table;
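The script above only produces 5 devices with 11 readings each; the full-size data set mentioned earlier (192 devices × 40,324 one-minute readings = 7,742,208 rows) can presumably be generated by simply widening the two series, for example:

INSERT INTO measurements (
    device_id,
    created_at,
    power_total
)
SELECT
    device_id,
    now() + (i * interval '1 minute'),
    random()*(50-1)+1
FROM (
    SELECT
        DISTINCT(device_id),
        generate_series(0, 40323) AS i                 -- 40,324 readings per device
    FROM (
        SELECT generate_series(1, 192) AS device_id    -- 192 devices
    ) AS dev_ids
) AS gen_table;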
I think your problem lies elsewhere.
I created 8 million rows of sample data (200 devices, 40,000 measurements each)
and the response was really fast (2 seconds).
Postgres 9.3 - Core i5 / 3.2 GHz / 8 GB RAM / SATA HDD / Windows 7
I have not created any indexes yet (you left that part out of your setup script).
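As a side suggestion (an assumption based on the plan you posted, not something benchmarked here): the dominant cost is the on-disk sort by (device_id, created_at) that feeds the window function, so a composite index covering both columns might let the planner read the rows in window order instead of sorting them. The index name below is made up:

-- Composite index matching the window's PARTITION BY / ORDER BY keys
-- (hypothetical name; may let the planner avoid the external merge sort).
CREATE INDEX measurements_device_id_created_at_idx
    ON measurements (device_id, created_at);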
Apart from that, I would try to move part of the computation to the row-insertion stage.
Add a new column:
alter table measurements add consumption real;
Update the column:
with m1 as (
    select
        id, power_total, created_at,
        lag(power_total) over (partition by device_id order by created_at) as prev_power_total,
        lag(created_at)  over (partition by device_id order by created_at) as prev_created_at
    from measurements
)
update measurements m2
set consumption =
    (m1.power_total + m1.prev_power_total) *
    extract(epoch from m1.created_at - m1.prev_created_at)
from m1
where m2.id = m1.id;
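A quick way to sanity-check the backfill (just a sketch; the row limit is arbitrary) is to recompute the value with the original window expression and compare it with the stored column:

-- Recompute consumption via the window expression and compare with the column.
select m.id,
       m.device_id,
       m.consumption,
       (m.power_total + lag(m.power_total) over w) *
           extract(epoch from m.created_at - lag(m.created_at) over w) as recomputed
from measurements m
window w as (partition by m.device_id order by m.created_at)
order by m.created_at desc
limit 20;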
Create a trigger:
create or replace function before_insert_on_measurements()
returns trigger language plpgsql
as $$
declare
    rec record;
begin
    -- Fetch the most recent reading for the same device.
    select power_total, created_at into rec
    from measurements
    where device_id = new.device_id
    order by created_at desc
    limit 1;

    -- If there is no previous reading for the device, the fields of rec
    -- are NULL and consumption ends up NULL.
    new.consumption :=
        (new.power_total + rec.power_total) *
        extract(epoch from new.created_at - rec.created_at);

    return new;
end $$;

create trigger before_insert_on_measurements
before insert on measurements
for each row execute procedure before_insert_on_measurements();
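A minimal usage example (device id and reading value are made up): once the trigger is in place, a plain insert should fill in consumption automatically from the previous reading of the same device:

-- Hypothetical new reading for device 1; consumption is set by the trigger.
insert into measurements (device_id, created_at, power_total)
values (1, now(), 42.0);

select id, device_id, power_total, consumption
from measurements
where device_id = 1
order by created_at desc
limit 2;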
Query:
select device_id, sum(consumption) total_consumption
from measurements
-- where conditions
group by 1
order by 1
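To match the original requirement (the 10 devices with the highest consumption over a given time span), the placeholder condition and the ordering would presumably be filled in along these lines, using the time span from the question:

select device_id, sum(consumption) as total_consumption
from measurements
where created_at between '2015-07-30 13:05:24.403552+00'
                     and '2015-08-27 12:34:59.826837+00'
group by device_id
order by total_consumption desc
limit 10;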