如何强制 PostgreSQL 在连接(JOIN)之前先查询较小的结果集,以使查询更快?
How to force PostgreSQL to query the small set first, before the join, to make the query faster?
假设我有一张如下所示的表:
-- Recommendation rows: each carries worker, company and job identifiers,
-- the attributes used for filtering, and a geographic point.
CREATE TABLE recommendation_raw_v2 (
    -- Primary key; a UUID is generated when none is supplied.
    id                uuid           DEFAULT gen_random_uuid()
                                     CONSTRAINT recommendation_pkey PRIMARY KEY,
    worker_id         uuid           NOT NULL,
    company_id        uuid           NOT NULL,
    job_id            uuid           NOT NULL,
    obsolete          boolean        NOT NULL DEFAULT false,
    discipline        varchar        NOT NULL DEFAULT ''::character varying,
    weekly_pay_amount numeric(12, 4) NOT NULL DEFAULT 0,
    -- Point stored as geography with SRID 4326.
    geog              geography(Point, 4326) NOT NULL
);
我想找出所有满足某些条件、并且位于点 (-118.2436849, 34.0522342)
500 公里半径范围内的 job_id。我写了如下查询:
-- Jobs for one worker/company that pass the pay/discipline filters and lie
-- within 500 km of (-118.2436849, 34.0522342), highest weekly pay first.
-- The self-join on id is kept deliberately: it is the query shape under analysis.
SELECT
    A.weekly_pay_amount,
    -- Distance between geographies is in metres; / 1000 converts to kilometres.
    ST_Distance(c.x::geography, t.geog) / 1000 AS distance,
    A.job_id
FROM (
    -- NOTE(review): matching_score and job_created_at do not appear in the
    -- table definition shown and are unused by the outer query; presumably
    -- the posted DDL is abbreviated -- confirm.
    SELECT id, job_id, matching_score, weekly_pay_amount, job_created_at
    FROM recommendation_raw_v2
    WHERE worker_id = '89b9d5c1-3862-4820-887c-0f1b266e6ce8'::uuid
      AND company_id = '9fcf4081-4adb-4aaf-bf86-f4926de332ef'::uuid
      AND obsolete = false
      AND weekly_pay_amount >= 500
      AND discipline = 'Foo'
) AS A
INNER JOIN recommendation_raw_v2 AS t
    ON A.id = t.id
-- Single-row constant: the search centre as a geometry point with SRID 4326.
CROSS JOIN (
    SELECT ST_SetSRID(ST_MakePoint(-118.2436849, 34.0522342), 4326)
) AS c(x)
-- t.geog is already geography, so only the geometry constant needs a cast.
WHERE ST_DWithin(t.geog, c.x::geography, 500 * 1000)  -- radius in metres
-- Named column instead of positional "1" (the first output column).
-- NOTE(review): the posted plan sorts on matching_score, so it may come from
-- a slightly different version of this query -- confirm.
ORDER BY A.weekly_pay_amount DESC;
我用 EXPLAIN ANALYZE 分析了这个查询:
QUERY PLAN
-------------------------------------------------------------------------------------
Sort (cost=1098.60..1098.61 rows=1 width=41) (actual time=182052.024..182053.516 rows=202 loops=1)
Sort Key: recommendation_raw_v2.matching_score DESC
Sort Method: quicksort Memory: 40kB
-> Hash Join (cost=1033.35..1098.59 rows=1 width=41) (actual time=1055.329..182050.122 rows=202 loops=1)
Hash Cond: (t.id = recommendation_raw_v2.id)
-> Index Scan using gist_geog on recommendation_raw_v2 t (cost=0.67..33.69 rows=2753 width=48) (actual time=564.032..181600.874 rows=1919272 loops=1)
Index Cond: (geog && _st_expand('0101000020E6100000DC018D88988F5DC0CA5D3A9CAF064140'::geography, '500000'::double precision))
Filter: st_dwithin(geog, '0101000020E6100000DC018D88988F5DC0CA5D3A9CAF064140'::geography, '500000'::double precision, true)
Rows Removed by Filter: 1041991
-> Hash (cost=1029.49..1029.49 rows=255 width=49) (actual time=31.253..32.173 rows=310 loops=1)
Buckets: 1024 Batches: 1 Memory Usage: 35kB
-> Index Scan using worker_id_obsolete_index on recommendation_raw_v2 (cost=0.56..1029.49 rows=255 width=49) (actual time=1.883..31.102 rows=310 loops=1)
Index Cond: ((worker_id = '89b9d5c1-3862-4820-887c-0f1b266e6ce8'::uuid) AND (obsolete = false))
Filter: ((weekly_pay_amount >= '500'::numeric) AND (company_id = '9fcf4081-4adb-4aaf-bf86-f4926de332ef'::uuid) AND ((discipline)::text = 'Foo'::text))
Rows Removed by Filter: 148
Planning Time: 16.259 ms
Execution Time: 182058.761 ms
我看到执行计划分为两部分:通过 gist_geog
索引扫描得到 1919272 行,通过 worker_id_obsolete_index
索引取出 310 行(规划器估计为 255 行)。PostgreSQL 随后将这两部分做哈希连接,得到最终结果。
我的问题是:如果能以某种方式强制 PostgreSQL 先执行 worker_id_obsolete_index
上的扫描,我的查询会更快吗?如果会,您有什么建议?
更新(环境信息):
- PostgreSQL 12.5
- PostGIS 3.0 USE_GEOS=1 USE_PROJ=1 USE_STATS=1
如果我把你的问题想得过于简单了,请见谅。不过,既然你是把一张表与它自身的一个子集做连接,那么把所有条件放进同一个查询、让查询规划器(planner)自行决定如何执行,开销会不会更小?
-- Single-scan alternative: every filter, including the 500 km radius test,
-- is applied to one reference of the table so the planner can pick the
-- access path itself.
SELECT
    job_id,
    weekly_pay_amount,
    -- Distance between geographies is in metres; / 1000 converts to kilometres.
    ST_Distance(
        geog,
        ST_SetSRID(ST_MakePoint(-118.2436849, 34.0522342), 4326)::geography
    ) / 1000
FROM recommendation_raw_v2
WHERE worker_id = '89b9d5c1-3862-4820-887c-0f1b266e6ce8'::uuid
  AND company_id = '9fcf4081-4adb-4aaf-bf86-f4926de332ef'::uuid
  AND obsolete = false
  AND weekly_pay_amount >= 500
  AND discipline = 'Foo'
  AND ST_DWithin(
          geog,
          ST_SetSRID(ST_MakePoint(-118.2436849, 34.0522342), 4326)::geography,
          500 * 1000  -- radius in metres
      );
假设我有一张如下所示的表:
-- Recommendation rows: each carries worker, company and job identifiers,
-- the attributes used for filtering, and a geographic point.
CREATE TABLE recommendation_raw_v2 (
    -- Primary key; a UUID is generated when none is supplied.
    id                uuid           DEFAULT gen_random_uuid()
                                     CONSTRAINT recommendation_pkey PRIMARY KEY,
    worker_id         uuid           NOT NULL,
    company_id        uuid           NOT NULL,
    job_id            uuid           NOT NULL,
    obsolete          boolean        NOT NULL DEFAULT false,
    discipline        varchar        NOT NULL DEFAULT ''::character varying,
    weekly_pay_amount numeric(12, 4) NOT NULL DEFAULT 0,
    -- Point stored as geography with SRID 4326.
    geog              geography(Point, 4326) NOT NULL
);
我想找出所有满足某些条件、并且位于点 (-118.2436849, 34.0522342)
500 公里半径范围内的 job_id。我写了如下查询:
-- Jobs for one worker/company that pass the pay/discipline filters and lie
-- within 500 km of (-118.2436849, 34.0522342), highest weekly pay first.
-- The self-join on id is kept deliberately: it is the query shape under analysis.
SELECT
    A.weekly_pay_amount,
    -- Distance between geographies is in metres; / 1000 converts to kilometres.
    ST_Distance(c.x::geography, t.geog) / 1000 AS distance,
    A.job_id
FROM (
    -- NOTE(review): matching_score and job_created_at do not appear in the
    -- table definition shown and are unused by the outer query; presumably
    -- the posted DDL is abbreviated -- confirm.
    SELECT id, job_id, matching_score, weekly_pay_amount, job_created_at
    FROM recommendation_raw_v2
    WHERE worker_id = '89b9d5c1-3862-4820-887c-0f1b266e6ce8'::uuid
      AND company_id = '9fcf4081-4adb-4aaf-bf86-f4926de332ef'::uuid
      AND obsolete = false
      AND weekly_pay_amount >= 500
      AND discipline = 'Foo'
) AS A
INNER JOIN recommendation_raw_v2 AS t
    ON A.id = t.id
-- Single-row constant: the search centre as a geometry point with SRID 4326.
CROSS JOIN (
    SELECT ST_SetSRID(ST_MakePoint(-118.2436849, 34.0522342), 4326)
) AS c(x)
-- t.geog is already geography, so only the geometry constant needs a cast.
WHERE ST_DWithin(t.geog, c.x::geography, 500 * 1000)  -- radius in metres
-- Named column instead of positional "1" (the first output column).
-- NOTE(review): the posted plan sorts on matching_score, so it may come from
-- a slightly different version of this query -- confirm.
ORDER BY A.weekly_pay_amount DESC;
我用 EXPLAIN ANALYZE 分析了这个查询:
QUERY PLAN
-------------------------------------------------------------------------------------
Sort (cost=1098.60..1098.61 rows=1 width=41) (actual time=182052.024..182053.516 rows=202 loops=1)
Sort Key: recommendation_raw_v2.matching_score DESC
Sort Method: quicksort Memory: 40kB
-> Hash Join (cost=1033.35..1098.59 rows=1 width=41) (actual time=1055.329..182050.122 rows=202 loops=1)
Hash Cond: (t.id = recommendation_raw_v2.id)
-> Index Scan using gist_geog on recommendation_raw_v2 t (cost=0.67..33.69 rows=2753 width=48) (actual time=564.032..181600.874 rows=1919272 loops=1)
Index Cond: (geog && _st_expand('0101000020E6100000DC018D88988F5DC0CA5D3A9CAF064140'::geography, '500000'::double precision))
Filter: st_dwithin(geog, '0101000020E6100000DC018D88988F5DC0CA5D3A9CAF064140'::geography, '500000'::double precision, true)
Rows Removed by Filter: 1041991
-> Hash (cost=1029.49..1029.49 rows=255 width=49) (actual time=31.253..32.173 rows=310 loops=1)
Buckets: 1024 Batches: 1 Memory Usage: 35kB
-> Index Scan using worker_id_obsolete_index on recommendation_raw_v2 (cost=0.56..1029.49 rows=255 width=49) (actual time=1.883..31.102 rows=310 loops=1)
Index Cond: ((worker_id = '89b9d5c1-3862-4820-887c-0f1b266e6ce8'::uuid) AND (obsolete = false))
Filter: ((weekly_pay_amount >= '500'::numeric) AND (company_id = '9fcf4081-4adb-4aaf-bf86-f4926de332ef'::uuid) AND ((discipline)::text = 'Foo'::text))
Rows Removed by Filter: 148
Planning Time: 16.259 ms
Execution Time: 182058.761 ms
我看到执行计划分为两部分:通过 gist_geog
索引扫描得到 1919272 行,通过 worker_id_obsolete_index
索引取出 310 行(规划器估计为 255 行)。PostgreSQL 随后将这两部分做哈希连接,得到最终结果。
我的问题是:如果能以某种方式强制 PostgreSQL 先执行 worker_id_obsolete_index
上的扫描,我的查询会更快吗?如果会,您有什么建议?
更新(环境信息):
- PostgreSQL 12.5
- PostGIS 3.0 USE_GEOS=1 USE_PROJ=1 USE_STATS=1
如果我把你的问题想得过于简单了,请见谅。不过,既然你是把一张表与它自身的一个子集做连接,那么把所有条件放进同一个查询、让查询规划器(planner)自行决定如何执行,开销会不会更小?
-- Single-scan alternative: every filter, including the 500 km radius test,
-- is applied to one reference of the table so the planner can pick the
-- access path itself.
SELECT
    job_id,
    weekly_pay_amount,
    -- Distance between geographies is in metres; / 1000 converts to kilometres.
    ST_Distance(
        geog,
        ST_SetSRID(ST_MakePoint(-118.2436849, 34.0522342), 4326)::geography
    ) / 1000
FROM recommendation_raw_v2
WHERE worker_id = '89b9d5c1-3862-4820-887c-0f1b266e6ce8'::uuid
  AND company_id = '9fcf4081-4adb-4aaf-bf86-f4926de332ef'::uuid
  AND obsolete = false
  AND weekly_pay_amount >= 500
  AND discipline = 'Foo'
  AND ST_DWithin(
          geog,
          ST_SetSRID(ST_MakePoint(-118.2436849, 34.0522342), 4326)::geography,
          500 * 1000  -- radius in metres
      );