动态地将函数应用于 Postgres table 中的所有列

Question

使用 Postgres 13.1，我想对 table 的所有列应用前向填充函数。前向填充函数在我之前的一个问题中已有说明（此处的链接未能保留）。

但是，在那个问题中，列和 table 都是明确指定的。我想把那段代码推广到任意 table，即：只指定 table，然后对它的每一列都应用前向填充。

以这个table为例：

-- Sample table for the forward-fill demo: the NULLs in str / val are the gaps
-- that are later filled per id, following row_num order.
CREATE TABLE example (
    row_num int,
    id      int,
    str     text,
    val     integer
);

-- Explicit column list: the statement no longer depends on the physical column
-- order and keeps working if columns are added or reordered later.
INSERT INTO example (row_num, id, str, val) VALUES
  (1, 1, '1a', NULL)
, (2, 1, NULL,    1)
, (3, 2, '2a',    2)
, (4, 2, NULL, NULL)
, (5, 3, NULL, NULL)
, (6, 3, '3a',   31)
, (7, 3, NULL, NULL)
, (8, 3, NULL,   32)
, (9, 3, '3b', NULL)
, (10,3, NULL, NULL)
;

我先写了下面这个可以正常运行的函数作为基础。调用时需要传入几个名称作为参数；请注意，第一个参数是 table 名称而不是列名称。该函数接收 table 名称，把该 table 的所有列名收集到一个数组中，然后逐个输出这些列名。

-- Collect the column names of the table named "tbl" into a text array, emit one
-- NOTICE per column, and return the array.
--
--   tbl     : unqualified table name, matched against information_schema.columns
--   id      : not used in this version (kept so the signature stays stable)
--   row_num : not used in this version (kept so the signature stays stable)
--
-- Returns the column names in their defined table order (ordinal_position).
-- NOTE(review): the lookup filters on table_name only, so same-named tables in
-- different schemas would contribute their columns together -- confirm whether a
-- schema filter is wanted.
create or replace function col_collect(tbl text, id text, row_num text)
    returns text[]
    language plpgsql as
$func$
declare
    tmp text[];
    col text;
begin
    select array (
            -- cast from the sql_identifier domain so the array really is text[]
            select c.column_name::text
            from information_schema."columns" c
            where c.table_name = tbl
            -- without an explicit ORDER BY the element order is not guaranteed
            order by c.ordinal_position
            ) into tmp;
    foreach col in array tmp
    loop
        raise notice 'col: %', col;
    end loop;
    return tmp;
end
$func$;

我想把之前那个问题中得到的“前向填充”函数应用到 table 的每一列，UPDATE 看起来是合适的做法。下面是在前一个函数的基础上，把 raise notice 换成 execute 一条动态 UPDATE 语句的版本，这样就可以传入 table 名称：

-- Attempted variant of col_collect: loops over every column of "tbl" and tries to
-- forward-fill it with one dynamically built UPDATE per column.
--
--   tbl     : table name, concatenated into the statement as-is
--   id      : column name used in PARTITION BY
--   row_num : column name used in ORDER BY
--
-- NOTE(review): this version does not run; the statement it builds is rejected
-- with a syntax error. Visible problems in the generated text:
--   * "expr OVER w AS alias" is SELECT-list syntax; UPDATE ... SET expects a plain
--     expression, and window functions are not allowed in UPDATE,
--   * UPDATE has no WINDOW clause,
--   * "<col>.row_num" uses a column name where a table reference is required,
--   * it calls gapfill(), while the aggregate defined in this file is gap_fill().
-- Identifiers are concatenated unquoted, so the string is also open to SQL injection.
create or replace function col_collect(tbl text, id text, row_num text)
    returns void
    language plpgsql as
$func$
declare
    tmp text[];
    col text;
begin
    -- gather all column names of the table (no schema filter, no explicit order)
    select array (
            select column_name
            from information_schema."columns" c
            where table_name = tbl
            ) into tmp;
    foreach col in array tmp
    loop
        -- one UPDATE per column; the join condition hard-codes the name "row_num"
        -- instead of using the row_num parameter
        execute 'update '||tbl||' 
                set '||col||' = gapfill('||col||') OVER w AS '||col||' 
                where '||tbl||'.row_num = '||col||'.row_num
                window w as (PARTITION BY '||id||' ORDER BY '||row_num||') 
                returning *;';
    end loop;
end
$func$;

-- call the function
select col_collect('example','id','row_num')

上面的代码会报语法错误。我已经尝试了很多变体，但都失败了。SO 上有几个有帮助的相关回答（此处的链接未能保留）。我想要应用的聚合函数（作为 window 函数使用）是：

-- State transition function for the gap_fill aggregate.
--   s : state carried over from the rows processed so far
--   v : value of the current row
-- Returns v unless it is NULL, in which case the carried state s is kept.
-- The result depends only on the arguments, so the function is declared
-- IMMUTABLE and PARALLEL SAFE instead of being left at the VOLATILE default.
CREATE OR REPLACE FUNCTION gap_fill_internal(s anyelement, v anyelement)
  RETURNS anyelement
  LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE AS
$func$
BEGIN
   RETURN COALESCE(v, s);
END
$func$;

-- Aggregate that returns the most recent non-NULL input value. Used as a window
-- function (gap_fill(col) OVER (... ORDER BY ...)) it forward-fills NULL gaps.
-- OR REPLACE (PostgreSQL 12+) makes the script re-runnable, matching the
-- CREATE OR REPLACE FUNCTION statements used for the functions in this file.
CREATE OR REPLACE AGGREGATE gap_fill(anyelement) (
  SFUNC = gap_fill_internal,
  STYPE = anyelement
);

我的问题是：

这是一个好方法吗？如果是，我做错了什么；或
有更好的方法吗？

Answer 1

你问的可不是小事。要做这件事，你应该已经熟悉 PL/pgSQL。我不建议初学者使用这种动态 SQL：它太强大了，很容易出错。

话虽如此，让我们开始吧。系好安全带！

-- Forward-fill ("gap fill") the nullable columns of a table with one dynamically
-- built UPDATE statement.
--
-- Parameters:
--   _tbl     : target table; regclass input validates the name and renders it
--              safely quoted when concatenated
--   _id      : name of the column to partition by (one fill sequence per value)
--   _row_num : name of the column that orders rows inside a partition; it is also
--              used to join the computed values back to the table
--              NOTE(review): this assumes the column is unique -- not checked here
-- Returns (OUT parameters):
--   nullable_columns : number of columns the fill is applied to (nullable columns,
--                      not counting the _row_num column itself)
--   updated_rows     : number of rows actually changed
-- Raises an exception when there is no column to fill.
-- Requires the gap_fill(anyelement) aggregate.
CREATE OR REPLACE FUNCTION f_gap_fill_update(_tbl regclass, _id text, _row_num text, OUT nullable_columns int, OUT updated_rows int)
  LANGUAGE plpgsql AS
$func$
DECLARE
   _pk  text  := quote_ident(_row_num);   -- key column, quoted once for reuse
   _sql text;
BEGIN
   -- Every string_agg carries the same ORDER BY so that the SET target list, the
   -- source list and the comparison lists are guaranteed to line up.
   SELECT INTO _sql, nullable_columns
          concat_ws(E'\n'
          , 'UPDATE ' || _tbl || ' t'
          , 'SET   (' || string_agg(        quote_ident(a.attname), ', ' ORDER BY a.attnum) || ')'
          -- explicit ROW(...): a parenthesized single expression is not accepted
          -- as the source of a column-list assignment when only one column is filled
          , '    = ROW(' || string_agg('u.' || quote_ident(a.attname), ', ' ORDER BY a.attnum) || ')'
          , 'FROM  ('
          , '   SELECT ' || _pk
          , '        , ' || string_agg(format('gap_fill(%1$I) OVER w AS %1$I', a.attname), ', ' ORDER BY a.attnum)
          , '   FROM   ' || _tbl
          , format('   WINDOW w AS (PARTITION BY %I ORDER BY %s)', _id, _pk)
          , '   ) u'
          , format('WHERE t.%1$s = u.%1$s', _pk)
          -- skip rows whose values would not change, to avoid empty updates
          , 'AND  (' || string_agg('t.' || quote_ident(a.attname), ', ' ORDER BY a.attnum) || ') IS DISTINCT FROM'
          , '     (' || string_agg('u.' || quote_ident(a.attname), ', ' ORDER BY a.attnum) || ')'
          )
        , count(*)
   FROM   pg_attribute a
   WHERE  a.attrelid = _tbl
   AND    a.attnum > 0                -- user columns only
   AND    NOT a.attisdropped
   AND    NOT a.attnotnull            -- NOT NULL columns cannot contain gaps
   -- The key column is already selected as the join key. Filling it as well would
   -- put two columns of the same name into subquery u and make u.<key> ambiguous.
   AND    a.attname::text IS DISTINCT FROM _row_num;

   IF nullable_columns = 0 THEN
      RAISE EXCEPTION 'No nullable columns found in table >>%<<', _tbl;
   ELSIF _sql IS NULL THEN
      RAISE EXCEPTION 'SQL string is NULL. Should not occur!';
   END IF;

   -- RAISE NOTICE '%', _sql;       -- debug
   EXECUTE _sql;
   GET DIAGNOSTICS updated_rows = ROW_COUNT;
END
$func$;

调用示例：

-- Run the fill on the sample table and report both OUT values by name.
SELECT nullable_columns
     , updated_rows
FROM   f_gap_fill_update('example', 'id', 'row_num');

db<>fiddle here

该函数采用了目前的最佳实践。它会生成并执行以下形式的查询：

-- Illustrative shape of the statement that f_gap_fill_update builds and executes.
-- tbl, str, val and col1 are placeholders for the actual table and its nullable
-- columns; id / row_num stand for the partition and ordering columns passed in.
UPDATE tbl t
SET   (str, val, col1)
    = (u.str, u.val, u.col1)
FROM  (
   -- compute the forward-filled value of every column in one pass
   SELECT row_num
        , gap_fill(str) OVER w AS str, gap_fill(val) OVER w AS val
        , gap_fill(col1) OVER w AS col1
   FROM   tbl
   WINDOW w AS (PARTITION BY id ORDER BY row_num)
   ) u
WHERE t.row_num = u.row_num
-- only touch rows where at least one value actually changes
AND  (t.str, t.val, t.col1) IS DISTINCT FROM
     (u.str, u.val, u.col1)

这里使用的是系统目录 pg_catalog.pg_attribute，而不是 information schema（信息模式）。参见：

"Information schema vs. system catalogs"

请注意最后的 WHERE 子句以防止（可能昂贵）空更新。只会写入实际更改的行。参见：

How do I (or can I) SELECT DISTINCT on multiple columns?

此外，甚至只会考虑可为空的列（未定义 NOT NULL），以避免不必要的工作。

在 UPDATE 中使用 ROW 语法以保持代码简单。参见：

SQL update fields of one table from fields of another one

函数返回两个整数值：nullable_columns 和 updated_rows，含义正如其名。

函数正确防御SQL注入。参见：

Table name as a PostgreSQL function parameter
SQL injection in Postgres functions vs prepared queries

关于GET DIAGNOSTICS：

Calculate number of rows affected by batch query in PostgreSQL

以上函数只执行更新，并不返回行。下面是一个基本演示，说明如何返回不同类型的行：

-- Read-only counterpart of the forward fill: returns the rows of a table with
-- every nullable column gap-filled, without modifying the table.
--
--   _tbl_type : a value of the table's row type, e.g. NULL::example; only its
--               type is used, to locate the table and to fix the return type
--   _id       : name of the column to partition by
--   _row_num  : name of the column to order by within each partition
--
-- Requires the gap_fill(anyelement) aggregate.
CREATE OR REPLACE FUNCTION f_gap_fill_select(_tbl_type anyelement, _id text, _row_num text)
  RETURNS SETOF anyelement
  LANGUAGE plpgsql AS
$func$
DECLARE
   _rel   regclass := pg_typeof(_tbl_type)::text::regclass;  -- table behind the row type
   _query text;
BEGIN
   -- One output expression per live column, in table order: NOT NULL columns are
   -- passed through unchanged, nullable ones are wrapped in gap_fill() OVER w.
   SELECT 'SELECT ' || string_agg(c.expr, ', ' ORDER BY c.attnum)
       || format(E'\nFROM %s\nWINDOW w AS (PARTITION BY %I ORDER BY %I)', _rel, _id, _row_num)
   INTO   _query
   FROM  (
      SELECT att.attnum
           , CASE WHEN att.attnotnull
                  THEN quote_ident(att.attname)
                  ELSE format('gap_fill(%1$I) OVER w AS %1$I', att.attname)
             END AS expr
      FROM   pg_attribute att
      WHERE  att.attrelid = _rel
      AND    NOT att.attisdropped
      AND    att.attnum > 0
      ) c;

   -- string_agg yields NULL when no column qualified, which nulls the whole string
   IF _query IS NULL THEN
      RAISE EXCEPTION 'SQL string is NULL. Should not occur!';
   END IF;

   RETURN QUERY EXECUTE _query;
END
$func$;

调用（注意特殊语法！）：

-- NULL::example passes no data, only the row type of table "example"; the function
-- derives the table from that type and returns rows of the same type.
SELECT * FROM f_gap_fill_select(NULL::example, 'id', 'row_num');

db<>fiddle here

关于返回多态行类型：

Refactor a PL/pgSQL function to return the output of various SELECT queries

动态地将函数应用于 Postgres table 中的所有列

Apply function to all columns in a Postgres table dynamically

sql

postgresql

null

dynamic-sql

plpgsql