Postgres:如何从范围外的时间戳中找到最近的 tsrange?
Postgres: How to find nearest tsrange from timestamp outside of ranges?
我正在为供应商提供的本地服务建模(在 Postgres 9.6.1 / postGIS 2.3.1 中):
-- Suppliers offering a local service (schema as posted in the question;
-- the "..." lines stand for columns the author elided).
CREATE TABLE supplier (
    id           SERIAL PRIMARY KEY,
    -- NOTE(review): the CHECK refers to "title", which is not among the
    -- visible columns -- presumably "name" was meant; confirm against the
    -- elided columns.
    name         TEXT NOT NULL CHECK (char_length(title) < 280),
    type         service_type,
    duration     INTERVAL,
    ...
    geo_position geography(POINT,4326)
    ...
);
每个供应商都维护一个日历,其中记录了他/她可供预订的时间段:
-- One row per bookable time slot in a supplier's calendar.
CREATE TABLE timeslot (
    id          SERIAL PRIMARY KEY,
    supplier_id INTEGER NOT NULL REFERENCES supplier (id),
    slot        TSTZRANGE NOT NULL,
    -- Reject any two slots of the same supplier whose ranges overlap (&&).
    -- NOTE(review): "supplier_id WITH =" inside a GiST exclusion constraint
    -- normally needs the btree_gist extension -- confirm it is installed.
    CONSTRAINT supplier_overlapping_timeslot_not_allowed
        EXCLUDE USING gist (supplier_id WITH =, slot WITH &&)
);
当客户想知道附近有哪些供应商可以在特定时间预订时,我创建了一个视图和一个函数:
-- Joins every supplier to each of its time slots: one output row per slot.
-- The "..." stands for further columns the author elided.
CREATE VIEW supplier_slots AS
SELECT
    supplier.name, supplier.type, supplier.geo_position, supplier.duration, ...
    timeslot.slot
FROM supplier
INNER JOIN timeslot
    ON timeslot.supplier_id = supplier.id;
-- Returns up to 100 rows of supplier_slots for suppliers of the wanted
-- service type whose slot fully contains the requested booking window
-- [at_time, at_time + duration), ordered by distance from the given point.
--
-- Parameters:
--   wantedType      service type to filter on
--   near_latitude   latitude of the reference point, as text
--   near_longitude  longitude of the reference point, as text
--   at_time         requested start of the booking
CREATE FUNCTION find_suppliers(wantedType service_type, near_latitude text, near_longitude text, at_time timestamptz)
RETURNS SETOF supplier_slots AS $$
DECLARE
    nearpoint geography;
BEGIN
    -- WKT is POINT(x y), i.e. POINT(longitude latitude) for geography.
    -- The original concatenated latitude first, which swaps the axes.
    nearpoint := ST_GeographyFromText('SRID=4326;POINT(' || near_longitude || ' ' || near_latitude || ')');

    RETURN QUERY
    SELECT * FROM supplier_slots
    WHERE type = wantedType
      -- the whole booking window must lie inside the slot
      AND tstzrange(at_time, at_time + duration) <@ slot
    ORDER BY ST_Distance(nearpoint, geo_position)
    LIMIT 100;
END;
$$ LANGUAGE plpgsql;
这一切都非常有效。
现在,对于在请求的时间没有可预订时段的供应商,我想找到他们在请求时间之前和之后最接近的可用时段,同样按距离排序。
这让我有点头晕,我找不到任何合适的运算符来给我最近的 tsrange。
关于最聪明的方法有什么想法吗?
解决方案取决于您想要的确切定义。
架构
我建议这些稍微修改过的 table 定义可以使任务更简单、加强完整性并提高性能:
-- Suppliers offering a local service (revised schema proposed in the answer).
CREATE TABLE supplier (
   supplier_id  serial PRIMARY KEY,
   -- Supplier name. The CHECK must reference this column: the original
   -- referenced "title", which does not exist in this table.
   supplier     text NOT NULL CHECK (length(supplier) < 280),
   type         service_type,
   duration     interval,
   geo_position geography(POINT,4326)
);
-- Bookable time slots per supplier, stored as two timestamps
-- (slot_a = start, slot_z = end) instead of one tstzrange column.
CREATE TABLE timeslot (
   timeslot_id serial PRIMARY KEY,
   -- The foreign key is left disabled as in the original; with this schema
   -- it would read: REFERENCES supplier (supplier_id).
   -- The column separator must sit before the comment, not inside it.
   supplier_id integer NOT NULL,  -- references supplier(id)
   slot_a      timestamptz NOT NULL,
   slot_z      timestamptz NOT NULL,
   -- a slot must have positive length
   CONSTRAINT timeslot_range_valid CHECK (slot_a < slot_z),
   -- no two slots of the same supplier may overlap; tstzrange(a, z)
   -- defaults to an inclusive lower and exclusive upper bound
   CONSTRAINT timeslot_no_overlapping
      EXCLUDE USING gist (supplier_id WITH =, tstzrange(slot_a, slot_z) WITH &&)
);
-- Index on (supplier_id, slot_z) for per-supplier lookups ordered by slot end.
CREATE INDEX timeslot_slot_z
   ON timeslot (supplier_id, slot_z);

-- GiST index on the supplier position column.
CREATE INDEX supplier_geo_position_gist
   ON supplier
   USING gist (geo_position);
保存两个 timestamptz 列 slot_a 和 slot_z,而不是一个 tstzrange 列 slot,并相应地调整约束。这样所有范围都会自动采用默认的 inclusive 下限和 exclusive 上限,从而避免边界情况(corner case)带来的错误和麻烦。
附带好处:两个 timestamptz 只占 16 个字节,而 tstzrange 占 25 个字节(含填充为 32 个字节)。
您原先针对 slot 编写的所有查询,只要用 tstzrange(slot_a, slot_z) 直接替换 slot,即可继续使用。
为手头的查询在 (supplier_id, slot_z) 上添加索引。
还要在 supplier.geo_position 上添加空间索引(您可能已经有了)。
根据 type 中的数据分布,为查询中常见的类型建立几个部分索引可能有助于提高性能:
-- Partial spatial index covering only suppliers of one common service type.
-- The predicate must filter on the service_type column "type"; the original
-- compared the text column "supplier" with a service_type value.
CREATE INDEX supplier_geo_type_foo_gist ON supplier USING gist (geo_position)
WHERE  type = 'foo'::service_type;
查询/函数
此查询找到 X 个最接近的提供正确 service_type
的供应商(示例中为 100 个),每个供应商都有 一个 最接近的匹配时隙(由到时隙开始的时间距离定义)。我将其与实际匹配的插槽相结合,这可能是也可能不是您需要的。
-- Finds the 100 suppliers of the given service type nearest to a point and,
-- for each, one time slot: either the slot selected by the first lateral
-- subquery (earliest slot ending after at_time + duration) or, when that
-- slot does not already start at or before at_time, the temporally closer
-- of that slot and the latest slot ending at or before at_time.
--
-- Parameters:
--   _type    service type to filter on
--   _lat     latitude of the reference point, as text
--   _lon     longitude of the reference point, as text
--   at_time  requested start of the booking
--
-- Returns one row per supplier; timeslot columns and time_dist are NULL
-- when no slot was found. time_dist is the gap between at_time and the
-- slot start (0 when the slot starts at or before at_time in the first
-- branch). Rows are ordered by time_dist, then distance.
--
-- Fixes relative to the original:
--   * column references match the supplier table as defined for this
--     function (supplier_id / supplier instead of id / name);
--   * the lateral subquery is correlated to the outer supplier
--     (the original "supplier_id = supplier_id" was always true);
--   * ORDER BY uses the computed output column "distance"
--     (the supplier table has no such column);
--   * the WKT point is built as POINT(longitude latitude).
CREATE FUNCTION f_suppliers_nearby(_type service_type, _lat text, _lon text, at_time timestamptz)
  RETURNS TABLE (supplier_id  int
               , name         text
               , duration     interval
               , geo_position geography(POINT,4326)
               , distance     float
               , timeslot_id  int
               , slot_a       timestamptz
               , slot_z       timestamptz
               , time_dist    interval
   ) AS
$func$
   WITH sup_nearby AS (  -- nearest suppliers, each with its matching or next later slot
      SELECT s.supplier_id, s.supplier AS name, s.duration, s.geo_position
             -- WKT is POINT(x y) = POINT(longitude latitude)
           , ST_Distance(ST_GeographyFromText('SRID=4326;POINT(' || _lon || ' ' || _lat || ')')
                       , s.geo_position) AS distance
           , t.timeslot_id, t.slot_a, t.slot_z
             -- NULL when the supplier has no such slot; 0 when the slot starts at or before at_time
           , CASE WHEN t.slot_a IS NOT NULL
                  THEN GREATEST(t.slot_a - at_time, interval '0') END AS time_dist
      FROM   supplier s
      LEFT   JOIN LATERAL (
         SELECT ts.timeslot_id, ts.slot_a, ts.slot_z
         FROM   timeslot ts
         WHERE  ts.supplier_id = s.supplier_id
         -- NOTE(review): ">" skips a slot that ends exactly at at_time + duration;
         -- confirm whether ">=" is intended for an exclusive upper bound.
         AND    ts.slot_z > at_time + s.duration  -- excl. upper bound
         ORDER  BY ts.slot_z
         LIMIT  1
         ) t ON true
      WHERE  s.type = _type
      ORDER  BY distance   -- output column computed above
      LIMIT  100
      )
   SELECT *
   FROM  (
      SELECT DISTINCT ON (supplier_id) *  -- 1 slot per supplier
      FROM  (
         TABLE sup_nearby  -- matching or later slot
         UNION ALL         -- earlier slot
         SELECT s.supplier_id, s.name, s.duration, s.geo_position
              , s.distance
              , t.timeslot_id, t.slot_a, t.slot_z
              , GREATEST(at_time - t.slot_a, interval '0') AS time_dist
         FROM   sup_nearby s
         CROSS  JOIN LATERAL (  -- CROSS JOIN: suppliers without an earlier slot add no row here
            SELECT ts.timeslot_id, ts.slot_a, ts.slot_z
            FROM   timeslot ts
            WHERE  ts.supplier_id = s.supplier_id
            AND    ts.slot_z <= at_time  -- excl. upper bound
            ORDER  BY ts.slot_z DESC
            LIMIT  1
            ) t
         WHERE  s.time_dist IS DISTINCT FROM interval '0'  -- exact matches are done
         ) sub
      ORDER  BY supplier_id, time_dist  -- pick temporally closest slot per supplier
      ) sub
   ORDER  BY time_dist, distance;  -- matches first, ordered by distance; then misses, ordered by time distance
$func$  LANGUAGE sql;
我没有使用您的视图 supplier_slots
而是针对性能进行了优化。视图可能仍然很方便。您可以包括 tstzrange(slot_a, slot_z) AS slot
以实现向后兼容性。
查找最接近的 100 个供应商的基本查询是教科书 "K Nearest Neighbour" 问题。 GiST 索引对此很有效。相关:
附加任务(找到时间上最近的时段)可以拆分为两个任务:找到下一个更高的行和下一个更低的行。该解决方案的核心特征是有两个子查询 ORDER BY slot_z LIMIT 1 和 ORDER BY slot_z DESC LIMIT 1,它们会转化为两次非常快的索引扫描。
我将第一个与查找实际匹配相结合,这是一个(我认为是聪明的)优化,但可能会分散实际解决方案的注意力。
我正在为供应商提供的本地服务建模(在 Postgres 9.6.1 / postGIS 2.3.1 中):
-- Suppliers offering a local service (schema as posted in the question;
-- the "..." lines stand for columns the author elided).
CREATE TABLE supplier (
    id           SERIAL PRIMARY KEY,
    -- NOTE(review): the CHECK refers to "title", which is not among the
    -- visible columns -- presumably "name" was meant; confirm against the
    -- elided columns.
    name         TEXT NOT NULL CHECK (char_length(title) < 280),
    type         service_type,
    duration     INTERVAL,
    ...
    geo_position geography(POINT,4326)
    ...
);
每个供应商都维护一个日历,其中记录了他/她可供预订的时间段:
-- One row per bookable time slot in a supplier's calendar.
CREATE TABLE timeslot (
    id          SERIAL PRIMARY KEY,
    supplier_id INTEGER NOT NULL REFERENCES supplier (id),
    slot        TSTZRANGE NOT NULL,
    -- Reject any two slots of the same supplier whose ranges overlap (&&).
    -- NOTE(review): "supplier_id WITH =" inside a GiST exclusion constraint
    -- normally needs the btree_gist extension -- confirm it is installed.
    CONSTRAINT supplier_overlapping_timeslot_not_allowed
        EXCLUDE USING gist (supplier_id WITH =, slot WITH &&)
);
当客户想知道附近有哪些供应商可以在特定时间预订时,我创建了一个视图和一个函数:
-- Joins every supplier to each of its time slots: one output row per slot.
-- The "..." stands for further columns the author elided.
CREATE VIEW supplier_slots AS
SELECT
    supplier.name, supplier.type, supplier.geo_position, supplier.duration, ...
    timeslot.slot
FROM supplier
INNER JOIN timeslot
    ON timeslot.supplier_id = supplier.id;
-- Returns up to 100 rows of supplier_slots for suppliers of the wanted
-- service type whose slot fully contains the requested booking window
-- [at_time, at_time + duration), ordered by distance from the given point.
--
-- Parameters:
--   wantedType      service type to filter on
--   near_latitude   latitude of the reference point, as text
--   near_longitude  longitude of the reference point, as text
--   at_time         requested start of the booking
CREATE FUNCTION find_suppliers(wantedType service_type, near_latitude text, near_longitude text, at_time timestamptz)
RETURNS SETOF supplier_slots AS $$
DECLARE
    nearpoint geography;
BEGIN
    -- WKT is POINT(x y), i.e. POINT(longitude latitude) for geography.
    -- The original concatenated latitude first, which swaps the axes.
    nearpoint := ST_GeographyFromText('SRID=4326;POINT(' || near_longitude || ' ' || near_latitude || ')');

    RETURN QUERY
    SELECT * FROM supplier_slots
    WHERE type = wantedType
      -- the whole booking window must lie inside the slot
      AND tstzrange(at_time, at_time + duration) <@ slot
    ORDER BY ST_Distance(nearpoint, geo_position)
    LIMIT 100;
END;
$$ LANGUAGE plpgsql;
这一切都非常有效。
现在,对于在请求的时间没有可预订时段的供应商,我想找到他们在请求时间之前和之后最接近的可用时段,同样按距离排序。
这让我有点头晕,我找不到任何合适的运算符来给我最近的 tsrange。
关于最聪明的方法有什么想法吗?
解决方案取决于您想要的确切定义。
架构
我建议这些稍微修改过的 table 定义可以使任务更简单、加强完整性并提高性能:
-- Suppliers offering a local service (revised schema proposed in the answer).
CREATE TABLE supplier (
   supplier_id  serial PRIMARY KEY,
   -- Supplier name. The CHECK must reference this column: the original
   -- referenced "title", which does not exist in this table.
   supplier     text NOT NULL CHECK (length(supplier) < 280),
   type         service_type,
   duration     interval,
   geo_position geography(POINT,4326)
);
-- Bookable time slots per supplier, stored as two timestamps
-- (slot_a = start, slot_z = end) instead of one tstzrange column.
CREATE TABLE timeslot (
   timeslot_id serial PRIMARY KEY,
   -- The foreign key is left disabled as in the original; with this schema
   -- it would read: REFERENCES supplier (supplier_id).
   -- The column separator must sit before the comment, not inside it.
   supplier_id integer NOT NULL,  -- references supplier(id)
   slot_a      timestamptz NOT NULL,
   slot_z      timestamptz NOT NULL,
   -- a slot must have positive length
   CONSTRAINT timeslot_range_valid CHECK (slot_a < slot_z),
   -- no two slots of the same supplier may overlap; tstzrange(a, z)
   -- defaults to an inclusive lower and exclusive upper bound
   CONSTRAINT timeslot_no_overlapping
      EXCLUDE USING gist (supplier_id WITH =, tstzrange(slot_a, slot_z) WITH &&)
);
-- Index on (supplier_id, slot_z) for per-supplier lookups ordered by slot end.
CREATE INDEX timeslot_slot_z
   ON timeslot (supplier_id, slot_z);

-- GiST index on the supplier position column.
CREATE INDEX supplier_geo_position_gist
   ON supplier
   USING gist (geo_position);
保存两个 timestamptz 列 slot_a 和 slot_z,而不是一个 tstzrange 列 slot,并相应地调整约束。这样所有范围都会自动采用默认的 inclusive 下限和 exclusive 上限,从而避免边界情况(corner case)带来的错误和麻烦。
附带好处:两个 timestamptz 只占 16 个字节,而 tstzrange 占 25 个字节(含填充为 32 个字节)。
您原先针对 slot 编写的所有查询,只要用 tstzrange(slot_a, slot_z) 直接替换 slot,即可继续使用。
为手头的查询在 (supplier_id, slot_z) 上添加索引。
还要在 supplier.geo_position 上添加空间索引(您可能已经有了)。
根据 type 中的数据分布,为查询中常见的类型建立几个部分索引可能有助于提高性能:
CREATE INDEX supplier_geo_type_foo_gist ON supplier USING gist (geo_position)
WHERE type = 'foo'::service_type;
查询/函数
此查询找到 X 个最接近的提供正确 service_type
的供应商(示例中为 100 个),每个供应商都有 一个 最接近的匹配时隙(由到时隙开始的时间距离定义)。我将其与实际匹配的插槽相结合,这可能是也可能不是您需要的。
-- Finds the 100 suppliers of the given service type nearest to a point and,
-- for each, one time slot: either the slot selected by the first lateral
-- subquery (earliest slot ending after at_time + duration) or, when that
-- slot does not already start at or before at_time, the temporally closer
-- of that slot and the latest slot ending at or before at_time.
--
-- Parameters:
--   _type    service type to filter on
--   _lat     latitude of the reference point, as text
--   _lon     longitude of the reference point, as text
--   at_time  requested start of the booking
--
-- Returns one row per supplier; timeslot columns and time_dist are NULL
-- when no slot was found. time_dist is the gap between at_time and the
-- slot start (0 when the slot starts at or before at_time in the first
-- branch). Rows are ordered by time_dist, then distance.
--
-- Fixes relative to the original:
--   * column references match the supplier table as defined for this
--     function (supplier_id / supplier instead of id / name);
--   * the lateral subquery is correlated to the outer supplier
--     (the original "supplier_id = supplier_id" was always true);
--   * ORDER BY uses the computed output column "distance"
--     (the supplier table has no such column);
--   * the WKT point is built as POINT(longitude latitude).
CREATE FUNCTION f_suppliers_nearby(_type service_type, _lat text, _lon text, at_time timestamptz)
  RETURNS TABLE (supplier_id  int
               , name         text
               , duration     interval
               , geo_position geography(POINT,4326)
               , distance     float
               , timeslot_id  int
               , slot_a       timestamptz
               , slot_z       timestamptz
               , time_dist    interval
   ) AS
$func$
   WITH sup_nearby AS (  -- nearest suppliers, each with its matching or next later slot
      SELECT s.supplier_id, s.supplier AS name, s.duration, s.geo_position
             -- WKT is POINT(x y) = POINT(longitude latitude)
           , ST_Distance(ST_GeographyFromText('SRID=4326;POINT(' || _lon || ' ' || _lat || ')')
                       , s.geo_position) AS distance
           , t.timeslot_id, t.slot_a, t.slot_z
             -- NULL when the supplier has no such slot; 0 when the slot starts at or before at_time
           , CASE WHEN t.slot_a IS NOT NULL
                  THEN GREATEST(t.slot_a - at_time, interval '0') END AS time_dist
      FROM   supplier s
      LEFT   JOIN LATERAL (
         SELECT ts.timeslot_id, ts.slot_a, ts.slot_z
         FROM   timeslot ts
         WHERE  ts.supplier_id = s.supplier_id
         -- NOTE(review): ">" skips a slot that ends exactly at at_time + duration;
         -- confirm whether ">=" is intended for an exclusive upper bound.
         AND    ts.slot_z > at_time + s.duration  -- excl. upper bound
         ORDER  BY ts.slot_z
         LIMIT  1
         ) t ON true
      WHERE  s.type = _type
      ORDER  BY distance   -- output column computed above
      LIMIT  100
      )
   SELECT *
   FROM  (
      SELECT DISTINCT ON (supplier_id) *  -- 1 slot per supplier
      FROM  (
         TABLE sup_nearby  -- matching or later slot
         UNION ALL         -- earlier slot
         SELECT s.supplier_id, s.name, s.duration, s.geo_position
              , s.distance
              , t.timeslot_id, t.slot_a, t.slot_z
              , GREATEST(at_time - t.slot_a, interval '0') AS time_dist
         FROM   sup_nearby s
         CROSS  JOIN LATERAL (  -- CROSS JOIN: suppliers without an earlier slot add no row here
            SELECT ts.timeslot_id, ts.slot_a, ts.slot_z
            FROM   timeslot ts
            WHERE  ts.supplier_id = s.supplier_id
            AND    ts.slot_z <= at_time  -- excl. upper bound
            ORDER  BY ts.slot_z DESC
            LIMIT  1
            ) t
         WHERE  s.time_dist IS DISTINCT FROM interval '0'  -- exact matches are done
         ) sub
      ORDER  BY supplier_id, time_dist  -- pick temporally closest slot per supplier
      ) sub
   ORDER  BY time_dist, distance;  -- matches first, ordered by distance; then misses, ordered by time distance
$func$  LANGUAGE sql;
我没有使用您的视图 supplier_slots
而是针对性能进行了优化。视图可能仍然很方便。您可以包括 tstzrange(slot_a, slot_z) AS slot
以实现向后兼容性。
查找最接近的 100 个供应商的基本查询是教科书 "K Nearest Neighbour" 问题。 GiST 索引对此很有效。相关:
附加任务(找到时间上最近的时段)可以拆分为两个任务:找到下一个更高的行和下一个更低的行。该解决方案的核心特征是有两个子查询 ORDER BY slot_z LIMIT 1 和 ORDER BY slot_z DESC LIMIT 1,它们会转化为两次非常快的索引扫描。
我将第一个与查找实际匹配相结合,这是一个(我认为是聪明的)优化,但可能会分散实际解决方案的注意力。