如何递归地找到两个表之间的相交地理

Question

我是运行 Postgres 9.6.1 和 PostGIS 2.3.0 r15146，并且有两个 table。
geographies 可能有 150,000,000 行，paths 可能有 10,000,000 行：

CREATE TABLE paths (id uuid NOT NULL, path path NOT NULL, PRIMARY KEY (id))
CREATE TABLE geographies (id uuid NOT NULL, geography geography NOT NULL, PRIMARY KEY (id))

给定 array/set 的 ids 对于 table geographies，找到所有相交路径和几何图形的 "best" 方法是什么？

换句话说，如果初始 geography 有对应的相交 path 我们还需要找到所有 other geographies 这path 相交。从那里，我们需要找到这些新发现的 geographies 相交的所有其他 paths，依此类推，直到我们找到所有可能的交集。

初始地理 ID（我们的输入）可能在 0 到 700 之间的任何地方。平均约为 40。
最小交叉点为 0，最大值约为 1000。平均可能约为 20，通常少于 100 个连接。

我已经创建了一个执行此操作的函数，但我对 PostGIS 中的 GIS 和一般的 Postgres 不熟悉。我已经发布 .

我觉得应该有比我想出的方法更多 eloquent 和更快的方法。

Answer 1

我认为在这里 post 我自己的解决方案会很好，即使它不是最优的。

这是我的想法（采纳了史蒂夫·钱伯斯的建议）：

CREATE OR REPLACE FUNCTION public.function_name(
    _fk_ids character varying[])
    RETURNS TABLE(id uuid, type character varying)
    LANGUAGE 'plpgsql'
    COST 100.0
    VOLATILE
    ROWS 1000.0
AS $function$

    DECLARE
        _pathLength bigint;
        _geographyLength bigint;

        _currentPathLength bigint;
        _currentGeographyLength bigint;
    BEGIN
        DROP TABLE IF EXISTS _pathIds;
        DROP TABLE IF EXISTS _geographyIds;
        CREATE TEMPORARY TABLE _pathIds (id UUID PRIMARY KEY);
        CREATE TEMPORARY TABLE _geographyIds (id UUID PRIMARY KEY);

        -- get all geographies in the specified _fk_ids
        INSERT INTO _geographyIds
            SELECT g.id
            FROM geographies g
            WHERE g.fk_id= ANY(_fk_ids);

        _pathLength := 0;
        _geographyLength := 0;
        _currentPathLength := 0;
        _currentGeographyLength := (SELECT COUNT(_geographyIds.id) FROM _geographyIds);
        -- _pathIds := ARRAY[]::uuid[];

        WHILE (_currentPathLength - _pathLength > 0) OR (_currentGeographyLength - _geographyLength > 0) LOOP
            _pathLength := (SELECT COUNT(_pathIds.id) FROM _pathIds);
            _geographyLength := (SELECT COUNT(_geographyIds.id) FROM _geographyIds);

            -- gets all paths that have paths that intersect the geographies that aren't in the current list of path ids

            INSERT INTO _pathIds 
                SELECT DISTINCT p.id
                    FROM paths p
                    JOIN geographies g ON ST_Intersects(g.geography, p.path)
                    WHERE
                        g.id IN (SELECT _geographyIds.id FROM _geographyIds) AND
                        p.id NOT IN (SELECT _pathIds.id from _pathIds);

            -- gets all geographies that have paths that intersect the paths that aren't in the current list of geography ids
            INSERT INTO _geographyIds 
                SELECT DISTINCT g.id
                    FROM geographies g
                    JOIN paths p ON ST_Intersects(g.geography, p.path)
                    WHERE
                        p.id IN (SELECT _pathIds.id FROM _pathIds) AND
                        g.id NOT IN (SELECT _geographyIds.id FROM _geographyIds);

            _currentPathLength := (SELECT COUNT(_pathIds.id) FROM _pathIds);
            _currentGeographyLength := (SELECT COUNT(_geographyIds.id) FROM _geographyIds);
        END LOOP;

        RETURN QUERY
            SELECT _geographyIds.id, 'geography' AS type FROM _geographyIds
            UNION ALL
            SELECT _pathIds.id, 'path' AS type FROM _pathIds;
    END;

$function$;

Answer 2

可以彻底简化。

设置

我建议您将列 paths.path 转换为数据类型 geography（或至少 geometry）。 path is a native Postgres type and does not play well with PostGIS functions and spatial indexes. You would have to cast path::geometry or path::geometry::geography (resulting in a LINESTRING internally) to make it work with PostGIS functions like ST_Intersects().

我的回答是基于这些改编的 tables:

CREATE TABLE paths (
   id uuid PRIMARY KEY
 , path <b>geography</b> NOT NULL
);

CREATE TABLE geographies (
   id uuid PRIMARY KEY
 , geography geography NOT NULL
 , fk_id text NOT NULL
);

对于两列，一切都适用于数据类型 geometry。 geography 通常更准确，但也更昂贵。使用哪个？ Read the PostGIS FAQ here.

解决方案 1：您的函数已优化

CREATE OR REPLACE FUNCTION public.function_name(_fk_ids text[])
  RETURNS TABLE(id uuid, type text)
  LANGUAGE plpgsql AS
$func$
DECLARE
   _row_ct int;
   _loop_ct int := 0;

BEGIN
   CREATE TEMP TABLE _geo ON COMMIT DROP AS  -- dropped at end of transaction
   SELECT DISTINCT ON (g.id) g.id, g.geography, _loop_ct AS loop_ct -- dupes possible?
   FROM   geographies g
   WHERE  g.fk_id = ANY(_fk_ids);

   GET DIAGNOSTICS _row_ct = ROW_COUNT;

   IF _row_ct = 0 THEN  -- no rows found, return empty result immediately
      RETURN;           -- exit function
   END IF;

   CREATE TEMP TABLE _path ON COMMIT DROP AS
   SELECT DISTINCT ON (p.id) p.id, p.path, _loop_ct AS loop_ct
   FROM   _geo  g
   JOIN   paths p ON ST_Intersects(g.geography, p.path);  -- no dupes yet

   GET DIAGNOSTICS _row_ct = ROW_COUNT;

   IF _row_ct = 0 THEN  -- no rows found, return _geo immediately
      RETURN QUERY SELECT g.id, text 'geo' FROM _geo g;
      RETURN;   
   END IF;

   ALTER TABLE _geo  ADD CONSTRAINT g_uni UNIQUE (id);  -- required for UPSERT
   ALTER TABLE _path ADD CONSTRAINT p_uni UNIQUE (id);

   LOOP
      _loop_ct := _loop_ct + 1;

      INSERT INTO _geo(id, geography, loop_ct)
      SELECT DISTINCT ON (g.id) g.id, g.geography, _loop_ct
      FROM   _paths      p
      JOIN   geographies g ON ST_Intersects(g.geography, p.path)
      WHERE  p.loop_ct = _loop_ct - 1   -- only use last round!
      ON     CONFLICT ON CONSTRAINT g_uni DO NOTHING;  -- eliminate new dupes

      EXIT WHEN NOT FOUND;

      INSERT INTO _path(id, path, loop_ct)
      SELECT DISTINCT ON (p.id) p.id, p.path, _loop_ct
      FROM   _geo  g
      JOIN   paths p ON ST_Intersects(g.geography, p.path)
      WHERE  g.loop_ct = _loop_ct - 1
      ON     CONFLICT ON CONSTRAINT p_uni DO NOTHING;

      EXIT WHEN NOT FOUND;
   END LOOP;

   RETURN QUERY
   SELECT g.id, text 'geo'  FROM _geo g
   UNION ALL
   SELECT p.id, text 'path' FROM _path p;
END
$func$;

致电：

SELECT * FROM public.function_name('{foo,bar}');

比你的速度快很多。

要点

您基于整个集合的查询，而不是仅对集合的最新添加。这在不需要的每个循环中变得越来越慢。我添加了一个循环计数器 (loop_ct) 以 避免冗余工作 。

确保在 geographies.geography 和 paths.path:

上有空间 GiST indexes

  CREATE INDEX geo_geo_gix ON geographies USING GIST (geography);
  CREATE INDEX paths_path_gix ON paths USING GIST (path);

从 Postgres 9.5 开始 index-only scans would be an option for GiST indexes. You might add id as second index column. The benefit depends on many factors, you'd have to test. However, there is no fitting operator GiST class for the uuid type. It would work with bigint after installing the extension btree_gist:

Multicolumn index on 3 fields with heterogenous data types
在g.fk_id上也有一个合适的索引。同样，如果您可以从中获得 index-only 扫描，(fk_id, id, geography) 上的多列索引可能会有所回报。默认 btree 索引，fk_id 必须是第一个索引列。尤其是如果您运行查询经常且很少更新 table 和 table 行比索引宽得多。
您可以在声明时初始化变量。重写后只需要一次。
ON COMMIT DROP 在事务结束时自动删除临时 tables。所以我明确删除了删除 tables 。但是，如果您在 same 事务中调用该函数两次，则会出现异常。在函数中，我会检查是否存在临时 table 并在这种情况下使用 TRUNCATE 。相关：
How to check if a table exists in a given schema
使用 GET DIAGNOSTICS 获取行计数，而不是运行为该计数设置另一个查询。
Count rows affected by DELETE

你需要GET DIAGNOSTICS。 CREATE TABLE 不设置 FOUND（如手册中所述）。

在填充 table 之后添加索引或 PK / UNIQUE 约束 会更快。而不是在我们真正需要它之前。

ON CONFLICT ... DO ... 是自 Postgres 9.5 以来更简单、更便宜的 UPSERT 方式。

How to UPSERT (MERGE, INSERT ... ON DUPLICATE UPDATE) in PostgreSQL?

对于命令的简单形式，您只需列出索引列或表达式（如 ON CONFLICT (id) DO ...）并让 Postgres 执行唯一索引推断以确定仲裁约束或索引。我后来通过直接提供约束进行了优化。但是为此我们需要一个实际的 constraint - 唯一索引是不够的。相应地固定。 Details in the manual here.

它可能帮助 ANALYZE 临时 table 手动帮助 Postgres 找到最佳查询计划。（但我认为您的情况不需要它。）

Are regular VACUUM ANALYZE still recommended under 9.1?

_geo_ct - _geographyLength > 0 是 _geo_ct > _geographyLength 的一种笨拙且更昂贵的表达方式。但现在已经完全消失了。

不要引用语言名称。就 LANGUAGE plpgsql.

你的函数参数对于fk_id的数组是varchar[]，但你后来评论说：

It is a bigint field that represents a geographic area (it's actually a precomputed s2cell id at level 15).

我不知道 s2cell 级别 15 的 id，但理想情况下你传递一个匹配数据类型的数组，或者如果这不是默认为 text[].

此外，由于您发表了评论：

There are always exactly 13 fk_ids passed in.

这似乎是 VARIADIC 函数参数的完美用例。所以你的函数定义是：

CREATE OR REPLACE FUNCTION public.function_name(_fk_ids <b>VARIADIC</b> text[]) ...

详情：

Pass multiple values in single parameter

解决方案 2：使用递归 CTE
的普通 SQL
很难将 rCTE 包裹在两个交替循环周围，但可以通过一些 SQL 技巧来实现：

WITH RECURSIVE cte AS ( SELECT g.id, g.geography::text, NULL::text AS path, text 'geo' AS type FROM geographies g WHERE g.fk_id = ANY($kf_ids) -- your input array here UNION SELECT p.id, g.geography::text, p.path::text , CASE WHEN p.path IS NULL THEN 'geo' ELSE 'path' END AS type FROM cte c LEFT JOIN paths p ON c.type = 'geo' AND ST_Intersects(c.geography::geography, p.path) LEFT JOIN geographies g ON c.type = 'path' AND ST_Intersects(g.geography, c.path::geography) WHERE (p.path IS NOT NULL OR g.geography IS NOT NULL) ) SELECT id, type FROM cte;

就这些了。
您需要与上述相同的索引。您可以将其包装到一个 SQL 函数中以供重复使用。

主要附加点

强制转换为 text 是必要的，因为 geography 类型不是“可散列的”（geometry 也是如此）。 (See this open PostGIS issue for details.) 通过转换为 text 来解决它。由于 (id, type) 行是唯一的，为此我们可以忽略 geography 列。转换回 geography 以进行连接。应该不会多花太多钱。

我们需要两个 LEFT JOIN 所以不排除行，因为在每次迭代中只有两个 table 中的一个可能贡献更多行。
最后的条件确保我们还没有完成：

WHERE (p.path IS NOT NULL OR g.geography IS NOT NULL)

这是有效的，因为重复的发现被排除在临时中级table。 The manual:

For UNION (but not UNION ALL), discard duplicate rows and rows that duplicate any previous result row. Include all remaining rows in the result of the recursive query, and also place them in a temporary intermediate table.

那么哪个更快？

rCTE 可能比用于小结果集的函数更快。 temp tables 和索引在他的功能意味着更多的开销。不过，对于大型结果集，函数可能更快。只有使用您的实际设置进行测试才能为您提供明确的答案。*

参见。

Answer 3

Sample plot and data from this script

它可以是带有聚合函数的纯关系。此实现使用一个 path table 和一个 point table。两者都是几何。与通用地理位置相比，使用和测试测试数据更容易，但适应起来应该很简单。

create table path (
    path_text text primary key,
    path geometry(linestring) not null
);
create table point (
   point_text text primary key,
   point geometry(point) not null
);

一种保持聚合函数状态的类型：

create type mpath_mpoint as (
    mpath geometry(multilinestring),
    mpoint geometry(multipoint)
);

状态构建函数：

create or replace function path_point_intersect (
    _i mpath_mpoint[], _e mpath_mpoint
) returns mpath_mpoint[] as $$

    with e as (select (e).mpath, (e).mpoint from (values (_e)) e (e)),
    i as (select mpath, mpoint from unnest(_i) i (mpath, mpoint))
    select array_agg((mpath, mpoint)::mpath_mpoint)
    from (
        select
            st_multi(st_union(i.mpoint, e.mpoint)) as mpoint,
            (
                select st_collect(gd)
                from (
                    select gd from st_dump(i.mpath) a (a, gd)
                    union all
                    select gd from st_dump(e.mpath) b (a, gd)
                ) s
            ) as mpath
        from i inner join e on st_intersects(i.mpoint, e.mpoint)

        union all
        select i.mpoint, i.mpath
        from i inner join e on not st_intersects(i.mpoint, e.mpoint)

        union all
        select e.mpoint, e.mpath
        from e
        where not exists (
            select 1 from i
            where st_intersects(i.mpoint, e.mpoint)
        )
    ) s;
$$ language sql;

合计：

create aggregate path_point_agg (mpath_mpoint) (
    sfunc = path_point_intersect,
    stype = mpath_mpoint[]
);

此查询将 return 一组 multilinestring, multipoint 个字符串，其中包含匹配的 paths/points:

select st_astext(mpath), st_astext(mpoint)
from unnest((
    select path_point_agg((st_multi(path), st_multi(mpoint))::mpath_mpoint)
    from (
        select path, st_union(point) as mpoint
        from
            path 
            inner join
            point on st_intersects(path, point)
        group by path
    ) s
)) m (mpath, mpoint)
;
                         st_astext                         |          st_astext          
-----------------------------------------------------------+-----------------------------
 MULTILINESTRING((-10 0,10 0,8 3),(0 -10,0 10),(2 1,4 -1)) | MULTIPOINT(0 0,0 5,3 0,5 0)
 MULTILINESTRING((-9 -8,4 -8),(-8 -9,-8 6))                | MULTIPOINT(-8 -8,2 -8)
 MULTILINESTRING((-7 -4,-3 4,-5 6))                        | MULTIPOINT(-6 -2)

如何递归地找到两个表之间的相交地理

How to find intersecting geographies between two tables recursively

sql

postgresql

gis

postgis

sqlgeography

设置

解决方案 1：您的函数已优化

要点

解决方案 2：使用递归 CTE

主要附加点

那么哪个更快？