基于过滤器值的查询的 Postgres 性能问题

Postgres performance issue with query based on filter values

我不是 Postgres 方面的专家,但我正在尝试理解这种奇怪的行为,也许你们中的一些人可以给我一些见解。

涉及到的表和索引就这些

    -- Submissions table. Every row references exactly one application
    -- (submission_app_id; the row is removed together with its application via
    -- ON DELETE CASCADE) and one transaction name (submission_transaction_names_id).
    -- varchar / timestamp are the short spellings of character varying /
    -- timestamp without time zone.
    CREATE TABLE swp_am_hcbe_pro.submissions (
        id                            bigint       NOT NULL DEFAULT nextval('swp_am_hcbe_pro.submissions_id_seq'::regclass),
        application_id                bigint       NOT NULL,
        transaction_names_id          bigint       NOT NULL,
        "timestamp"                   timestamp    NOT NULL,
        submission_status             varchar(32)  COLLATE pg_catalog."default" NOT NULL,
        submission_type               varchar(16)  COLLATE pg_catalog."default" NOT NULL,
        exit_code                     varchar(32)  COLLATE pg_catalog."default",
        ignore_partner_status         boolean      NOT NULL DEFAULT FALSE,
        ignore_sell_partner_status    boolean      NOT NULL DEFAULT FALSE,
        ignore_exclusion_rules        boolean      NOT NULL DEFAULT FALSE,
        dpa_iban                      varchar(32)  COLLATE pg_catalog."default",
        dpa_bic                       varchar(32)  COLLATE pg_catalog."default",
        dpa_id                        bigint,
        dpa_blz                       bigint,
        dda_iban                      varchar(32)  COLLATE pg_catalog."default",
        dda_bic                       varchar(32)  COLLATE pg_catalog."default",
        dda_id                        bigint,
        dda_blz                       bigint,
        dda_sepa_mandate_ref          varchar(128) COLLATE pg_catalog."default",
        use_different_sepa_mandate    varchar(34)  COLLATE pg_catalog."default",
        use_manual_limit_extension    boolean      NOT NULL DEFAULT FALSE,
        use_automatic_limit_extension boolean      NOT NULL DEFAULT FALSE,
        json_payload                  text         COLLATE pg_catalog."default" NOT NULL,
        final_timestamp               timestamp,

        CONSTRAINT submissions_pkey PRIMARY KEY (id),

        CONSTRAINT submission_app_id
            FOREIGN KEY (application_id)
            REFERENCES swp_am_hcbe_pro.applications (id)
            MATCH SIMPLE ON UPDATE NO ACTION ON DELETE CASCADE,

        CONSTRAINT submission_transaction_names_id
            FOREIGN KEY (transaction_names_id)
            REFERENCES swp_am_hcbe_pro.transaction_names (id)
            MATCH SIMPLE ON UPDATE NO ACTION ON DELETE NO ACTION,

        -- Closed list of allowed submission_status values.
        CONSTRAINT chk_submission_status CHECK (
            submission_status::text = ANY (ARRAY[
                'ERROR'::varchar,
                'DENIED'::varchar,
                'PROCESSED'::varchar,
                'REJECTED'::varchar,
                'PROCESSING'::varchar,
                'SCHEDULED'::varchar
            ]::text[])
        ),

        -- Closed list of allowed submission_type values.
        CONSTRAINT submission_types CHECK (
            submission_type::text = ANY (ARRAY[
                'AUTO'::varchar,
                'MANUAL'::varchar
            ]::text[])
        )
    )
    WITH (OIDS = FALSE)
    TABLESPACE pg_default;

-- Applications table. incoming_timestamp and approve_timestamp are the
-- columns the queries against application_list_simple filter and sort on.
-- varchar / timestamp are the short spellings of character varying /
-- timestamp without time zone.
CREATE TABLE swp_am_hcbe_pro.applications (
    id                  bigint        NOT NULL DEFAULT nextval('swp_am_hcbe_pro.applications_id_seq'::regclass),
    correlation_id      varchar(64)   COLLATE pg_catalog."default" NOT NULL,
    incoming_timestamp  timestamp     NOT NULL,
    source_input        varchar(16)   COLLATE pg_catalog."default" NOT NULL,
    source_file_path    varchar(255)  COLLATE pg_catalog."default",
    application_type    varchar(127)  COLLATE pg_catalog."default" NOT NULL,
    loan_id             bigint,
    vin                 varchar(17)   COLLATE pg_catalog."default",
    cooperation_name    varchar(255)  COLLATE pg_catalog."default",
    cooperation_id      bigint,
    submitter_name      varchar(255)  COLLATE pg_catalog."default",
    submitter_id        bigint,
    dealer_name         varchar(255)  COLLATE pg_catalog."default",
    dealer_id           bigint,
    dealer_ext_id       varchar(25)   COLLATE pg_catalog."default",
    invoice_id          varchar(25)   COLLATE pg_catalog."default",
    stock_id            varchar(20)   COLLATE pg_catalog."default",
    payment_term        varchar(20)   COLLATE pg_catalog."default",
    reg_document_id     varchar(25)   COLLATE pg_catalog."default",
    invoice_amount      numeric(20,4),
    application_status  varchar(64)   COLLATE pg_catalog."default",
    dealer_group_id     bigint,
    approver            text          COLLATE pg_catalog."default",
    approve_timestamp   timestamp,
    payload             text          COLLATE pg_catalog."default" NOT NULL,
    auto_resub_attempts integer       NOT NULL DEFAULT 0,
    row_number          bigint,
    email_sent          boolean       DEFAULT FALSE,
    modified_date       timestamp(6)  DEFAULT CURRENT_TIMESTAMP,
    product_name        text          COLLATE pg_catalog."default",
    priority            smallint,

    CONSTRAINT applications_pkey PRIMARY KEY (id),

    -- Closed list of allowed application_status values (NULL is also accepted,
    -- because the column is nullable and a CHECK passes on NULL).
    CONSTRAINT chk_application_status CHECK (
        application_status::text = ANY (ARRAY[
            'PROCESSED'::varchar,
            'PROCESSING'::varchar,
            'WAIT_NEXT_SUBMISSION'::varchar,
            'WAIT_MANUAL_SUBMISSION'::varchar,
            'WAIT_AUTOMATIC_SUBMISSION'::varchar,
            'WAIT_IN_QUEUE'::varchar,
            'SUBMISSION_NOT_FOUND'::varchar,
            'WAIT_FOR_ASYNC_ACTIVATION'::varchar,
            'WAIT_FOR_ASYNC_SHIPMENT'::varchar,
            'WAIT_FOR_BOOKING_CONFIRMATION'::varchar,
            'WAIT_FOR_ACTIVATION_CONFIRMATION'::varchar,
            'REJECTED'::varchar,
            'NOT_IN_QUEUE'::varchar,
            'SCHEDULED'::varchar
        ]::text[])
    ),

    -- Closed list of allowed source_input values.
    CONSTRAINT chk_source CHECK (
        source_input::text = ANY (ARRAY[
            'LM'::varchar,
            'KOSYFA'::varchar,
            'SWPII'::varchar,
            'ADM'::varchar
        ]::text[])
    )
)
WITH (OIDS = FALSE)
TABLESPACE pg_default;

-- Lookup table of transaction names; submissions.transaction_names_id
-- references its id column.
CREATE TABLE swp_am_hcbe_pro.transaction_names (
    id       bigint      NOT NULL DEFAULT nextval('swp_am_hcbe_pro.transaction_names_id_seq'::regclass),
    name     varchar(32) COLLATE pg_catalog."default" NOT NULL,
    sub_name varchar(32) COLLATE pg_catalog."default",

    CONSTRAINT transaction_names_pkey PRIMARY KEY (id)
)
WITH (OIDS = FALSE)
TABLESPACE pg_default;

索引

-- Per application, newest submission first. ASC is spelled out here; it is
-- the default sort direction for an index column.
CREATE INDEX submissions_app_id_asc_timestamp_desc_idx
ON swp_am_hcbe_pro.submissions
USING btree (application_id ASC, "timestamp" DESC)
TABLESPACE pg_default;

-- Per application in ascending time order, with the transaction name id
-- appended as a third key column.
CREATE INDEX submissions_app_id_timestamp_trans_name_id_idx
ON swp_am_hcbe_pro.submissions
USING btree (application_id ASC, "timestamp" ASC, transaction_names_id ASC)
TABLESPACE pg_default;

-- Time-first ordering across all applications.
CREATE INDEX submissions_timestamp_asc_app_id_asc_idx
ON swp_am_hcbe_pro.submissions
USING btree ("timestamp" ASC, application_id ASC)
TABLESPACE pg_default;

-- Single- and two-column b-tree indexes on applications.
-- Note that none of them covers incoming_timestamp.

CREATE INDEX application_correlation_id_idx
ON swp_am_hcbe_pro.applications
USING btree (correlation_id COLLATE pg_catalog."default")
TABLESPACE pg_default;

CREATE INDEX application_correlation_row_number_idx
ON swp_am_hcbe_pro.applications
USING btree (correlation_id COLLATE pg_catalog."default", row_number)
TABLESPACE pg_default;

CREATE INDEX applications_application_status_idx
ON swp_am_hcbe_pro.applications
USING btree (application_status COLLATE pg_catalog."default")
TABLESPACE pg_default;

CREATE INDEX applications_invoice_idx
ON swp_am_hcbe_pro.applications
USING btree (invoice_id COLLATE pg_catalog."default")
TABLESPACE pg_default;

CREATE INDEX applications_vin_idx
ON swp_am_hcbe_pro.applications
USING btree (vin COLLATE pg_catalog."default")
TABLESPACE pg_default;

我有以下视图:

-- Every application together with its most recent submission (if any) and
-- that submission's transaction name.
-- The subm CTE keeps one row per application_id: DISTINCT ON combined with
-- ORDER BY application_id, "timestamp" DESC retains the newest submission.
-- Both joins are LEFT JOINs, so applications without submissions still appear.
CREATE OR REPLACE VIEW swp_am_hcbe_pro.application_list_simple AS
WITH subm AS (
    SELECT DISTINCT ON (s.application_id)
        s.application_id,
        s."timestamp",
        s.exit_code,
        s.transaction_names_id
    FROM swp_am_hcbe_pro.submissions AS s
    ORDER BY
        s.application_id,
        s."timestamp" DESC
)
SELECT
    app.id,
    app.correlation_id,
    app.source_input,
    app.source_file_path,
    app.application_type,
    app.loan_id,
    app.vin,
    app.cooperation_name,
    app.cooperation_id,
    app.submitter_name,
    app.submitter_id,
    app.dealer_id,
    app.dealer_name,
    app.dealer_ext_id,
    app.invoice_id,
    app.stock_id,
    app.payment_term,
    app.reg_document_id,
    app.invoice_amount,
    app.application_status,
    app.incoming_timestamp,
    app.dealer_group_id,
    app.approver,
    app.approve_timestamp,
    subm.exit_code,
    tn.name         AS transaction_name,
    tn.sub_name     AS sub_transaction_name,
    tn.id           AS transaction_type_id,
    subm."timestamp" AS last_submission_timestamp,
    app.modified_date
FROM swp_am_hcbe_pro.applications AS app
LEFT JOIN subm
    ON subm.application_id = app.id
LEFT JOIN swp_am_hcbe_pro.transaction_names AS tn
    ON tn.id = subm.transaction_names_id;

如果我运行下面这条语句,执行时间为:Execution time: 2481.333 ms

-- First page (100 rows) of applications whose incoming_timestamp lies between
-- 2021-11-08 and 2021-11-09 (both bounds inclusive), plus the total number of
-- matching rows in the extra column "total".
EXPLAIN ANALYZE
SELECT
    *,
    count(*) OVER () AS total
FROM swp_am_hcbe_pro.application_list_simple
WHERE incoming_timestamp >= '2021-11-08'
  AND incoming_timestamp <= '2021-11-09'
ORDER BY
    approve_timestamp DESC,
    incoming_timestamp DESC
LIMIT 100 OFFSET 0;

我得到了以下内容

"Limit  (cost=461799.85..461800.10 rows=100 width=490) (actual time=2473.878..2474.618 rows=100 loops=1)"
"  ->  Sort  (cost=461799.85..461803.13 rows=1311 width=490) (actual time=2473.877..2474.612 rows=100 loops=1)"
"        Sort Key: app.approve_timestamp DESC, app.incoming_timestamp DESC"
"        Sort Method: top-N heapsort  Memory: 112kB"
"        ->  WindowAgg  (cost=458791.38..461749.74 rows=1311 width=490) (actual time=2471.792..2473.247 rows=1620 loops=1)"
"              ->  Hash Left Join  (cost=458791.38..461720.25 rows=1311 width=482) (actual time=2456.132..2470.895 rows=1620 loops=1)"
"                    Hash Cond: (subm.transaction_names_id = tn.id)"
"                    CTE subm"
"                      ->  Unique  (cost=0.43..333656.64 rows=129297 width=31) (actual time=0.036..1846.992 rows=645062 loops=1)"
"                            ->  Index Scan using submissions_app_id_asc_timestamp_desc_idx on submissions s  (cost=0.43..329433.26 rows=1689349 width=31) (actual time=0.033..1621.049 rows=1699582 loops=1)"
"                    ->  Hash Right Join  (cost=125133.09..128058.44 rows=1311 width=459) (actual time=2456.083..2470.337 rows=1620 loops=1)"
"                          Hash Cond: (subm.application_id = app.id)"
"                          ->  CTE Scan on subm  (cost=0.00..2585.94 rows=129297 width=106) (actual time=0.038..2135.256 rows=645062 loops=1)"
"                          ->  Hash  (cost=125116.71..125116.71 rows=1311 width=361) (actual time=237.582..238.310 rows=1620 loops=1)"
"                                Buckets: 2048  Batches: 1  Memory Usage: 483kB"
"                                ->  Gather  (cost=1000.00..125116.71 rows=1311 width=361) (actual time=11.959..236.468 rows=1620 loops=1)"
"                                      Workers Planned: 2"
"                                      Workers Launched: 2"
"                                      ->  Parallel Seq Scan on applications app  (cost=0.00..123985.61 rows=546 width=361) (actual time=2.880..97.484 rows=540 loops=3)"
"                                            Filter: ((incoming_timestamp >= '2021-11-08 00:00:00'::timestamp without time zone) AND (incoming_timestamp <= '2021-11-09 00:00:00'::timestamp without time zone))"
"                                            Rows Removed by Filter: 214530"
"                    ->  Hash  (cost=1.29..1.29 rows=29 width=31) (actual time=0.033..0.033 rows=29 loops=1)"
"                          Buckets: 1024  Batches: 1  Memory Usage: 10kB"
"                          ->  Seq Scan on transaction_names tn  (cost=0.00..1.29 rows=29 width=31) (actual time=0.011..0.015 rows=29 loops=1)"
"Planning time: 0.587 ms"
"Execution time: 2481.333 ms"

如果我只改变日期过滤条件再运行同样的语句,则需要 Execution time: 365817.271 ms

-- First page (100 rows) of applications whose incoming_timestamp lies between
-- 2021-11-09 and 2021-11-10 (both bounds inclusive), plus the total number of
-- matching rows in the extra column "total".
EXPLAIN ANALYZE
SELECT
    *,
    count(*) OVER () AS total
FROM swp_am_hcbe_pro.application_list_simple
WHERE incoming_timestamp >= '2021-11-09'
  AND incoming_timestamp <= '2021-11-10'
ORDER BY
    approve_timestamp DESC,
    incoming_timestamp DESC
LIMIT 100 OFFSET 0;

"Limit  (cost=462844.68..462844.69 rows=1 width=490) (actual time=365809.554..365810.419 rows=100 loops=1)"
"  ->  Sort  (cost=462844.68..462844.69 rows=1 width=490) (actual time=365809.553..365810.411 rows=100 loops=1)"
"        Sort Key: app.approve_timestamp DESC, app.incoming_timestamp DESC"
"        Sort Method: top-N heapsort  Memory: 125kB"
"        ->  WindowAgg  (cost=334656.77..462844.67 rows=1 width=490) (actual time=365806.595..365808.483 rows=2140 loops=1)"
"              ->  Nested Loop Left Join  (cost=334656.77..462844.65 rows=1 width=482) (actual time=2094.856..365793.839 rows=2140 loops=1)"
"                    CTE subm"
"                      ->  Unique  (cost=0.43..333656.64 rows=129297 width=31) (actual time=0.036..1771.818 rows=645068 loops=1)"
"                            ->  Index Scan using submissions_app_id_asc_timestamp_desc_idx on submissions s  (cost=0.43..329433.26 rows=1689349 width=31) (actual time=0.034..1563.614 rows=1699595 loops=1)"
"                    ->  Nested Loop Left Join  (cost=1000.00..129187.86 rows=1 width=459) (actual time=2094.836..365762.361 rows=2140 loops=1)"
"                          Join Filter: (app.id = subm.application_id)"
"                          Rows Removed by Join Filter: 1380443382"
"                          ->  Gather  (cost=1000.00..124985.71 rows=1 width=361) (actual time=8.475..33.996 rows=2140 loops=1)"
"                                Workers Planned: 2"
"                                Workers Launched: 2"
"                                ->  Parallel Seq Scan on applications app  (cost=0.00..123985.61 rows=1 width=361) (actual time=1.809..103.597 rows=713 loops=3)"
"                                      Filter: ((incoming_timestamp >= '2021-11-09 00:00:00'::timestamp without time zone) AND (incoming_timestamp <= '2021-11-10 00:00:00'::timestamp without time zone))"
"                                      Rows Removed by Filter: 214359"
"                          ->  CTE Scan on subm  (cost=0.00..2585.94 rows=129297 width=106) (actual time=0.030..125.740 rows=645068 loops=2140)"
"                    ->  Index Scan using transaction_names_pkey on transaction_names tn  (cost=0.14..0.16 rows=1 width=31) (actual time=0.009..0.009 rows=1 loops=2140)"
"                          Index Cond: (id = subm.transaction_names_id)"
"Planning time: 0.414 ms"
"Execution time: 365817.271 ms"

我真的不明白为什么会这样。我还尝试过用跨越多天的日期范围(例如一周、一个月)作为过滤条件运行该查询,这些情况下都运行正常。

我已经对受影响的表执行了清理(VACUUM),尽管这些表的行数并不多。我还能检查什么?

如果您需要更多信息,请随时问我

更新

如果我将查询更改为此,在字符串上使用 to_timestamp,那么它将起作用。但为什么它在所有其他情况下都有效,而在这个情况下却无效?为什么总是发生在当前日期?

-- First page (100 rows) of applications whose incoming_timestamp lies between
-- 2021-11-09 and 2021-11-10 (both bounds inclusive); the bounds are produced
-- by to_timestamp() rather than written as plain string literals.
EXPLAIN ANALYZE
SELECT *
FROM swp_am_hcbe_pro.application_list_simple
WHERE incoming_timestamp >= to_timestamp('2021-11-09 00:00:00', 'YYYY-MM-DD HH24:MI:SS')
  AND incoming_timestamp <= to_timestamp('2021-11-10 00:00:00', 'YYYY-MM-DD HH24:MI:SS')
ORDER BY
    approve_timestamp DESC,
    incoming_timestamp DESC
LIMIT 100 OFFSET 0;

我得到以下

"Limit  (cost=463151.72..463151.97 rows=100 width=481) (actual time=2743.036..2743.923 rows=100 loops=1)"
"  ->  Sort  (cost=463151.72..463153.01 rows=517 width=481) (actual time=2743.035..2743.918 rows=100 loops=1)"
"        Sort Key: app.approve_timestamp DESC, app.incoming_timestamp DESC"
"        Sort Method: top-N heapsort  Memory: 121kB"
"        ->  Hash Left Join  (cost=460200.05..463126.79 rows=517 width=481) (actual time=2730.684..2741.744 rows=2382 loops=1)"
"              Hash Cond: (subm.transaction_names_id = tn.id)"
"              CTE subm"
"                ->  Unique  (cost=0.43..333658.84 rows=129297 width=31) (actual time=0.020..1669.678 rows=645311 loops=1)"
"                      ->  Index Scan using submissions_app_id_asc_timestamp_desc_idx on submissions s  (cost=0.43..329435.46 rows=1689349 width=31) (actual time=0.019..1476.827 rows=1700028 loops=1)"
"              ->  Hash Right Join  (cost=126539.56..129464.91 rows=517 width=458) (actual time=2730.642..2740.999 rows=2382 loops=1)"
"                    Hash Cond: (subm.application_id = app.id)"
"                    ->  CTE Scan on subm  (cost=0.00..2585.94 rows=129297 width=106) (actual time=0.023..1924.458 rows=645311 loops=1)"
"                    ->  Hash  (cost=126533.10..126533.10 rows=517 width=360) (actual time=736.655..737.534 rows=2382 loops=1)"
"                          Buckets: 4096 (originally 1024)  Batches: 1 (originally 1)  Memory Usage: 864kB"
"                          ->  Gather  (cost=1000.00..126533.10 rows=517 width=360) (actual time=18.882..734.265 rows=2382 loops=1)"
"                                Workers Planned: 2"
"                                Workers Launched: 2"
"                                ->  Parallel Seq Scan on applications app  (cost=0.00..125481.40 rows=215 width=360) (actual time=15.908..610.513 rows=794 loops=3)"
"                                      Filter: ((incoming_timestamp >= to_timestamp('2021-11-09 00:00:00'::text, 'YYYY-MM-DD HH24:MI:SS'::text)) AND (incoming_timestamp <= to_timestamp('2021-11-10 00:00:00'::text, 'YYYY-MM-DD HH24:MI:SS'::text)))"
"                                      Rows Removed by Filter: 214359"
"              ->  Hash  (cost=1.29..1.29 rows=29 width=31) (actual time=0.026..0.026 rows=29 loops=1)"
"                    Buckets: 1024  Batches: 1  Memory Usage: 10kB"
"                    ->  Seq Scan on transaction_names tn  (cost=0.00..1.29 rows=29 width=31) (actual time=0.012..0.018 rows=29 loops=1)"
"Planning time: 0.370 ms"
"Execution time: 2751.279 ms"

那么,问题依旧

为什么这个查询需要 360 秒?

-- Variant with the range bounds written as plain string literals
-- (both bounds inclusive).
SELECT *
FROM swp_am_hcbe_pro.application_list_simple
WHERE incoming_timestamp >= '2021-11-09'
  AND incoming_timestamp <= '2021-11-10'
ORDER BY
    approve_timestamp DESC,
    incoming_timestamp DESC
LIMIT 100 OFFSET 0;

但是这个需要3秒

-- Variant with the range bounds produced by to_timestamp()
-- (both bounds inclusive).
SELECT *
FROM swp_am_hcbe_pro.application_list_simple
WHERE incoming_timestamp >= to_timestamp('2021-11-09 00:00:00', 'YYYY-MM-DD HH24:MI:SS')
  AND incoming_timestamp <= to_timestamp('2021-11-10 00:00:00', 'YYYY-MM-DD HH24:MI:SS')
ORDER BY
    approve_timestamp DESC,
    incoming_timestamp DESC
LIMIT 100 OFFSET 0;

在其他情况下,无论我是否使用 to_timestamp,查询都能正常运行。请注意,我在上次更新中删除了 count(*) over(),以表明它与问题无关,所以问题仍然存在。

感谢您的支持

尝试使用 (TEMP) 视图而不是 CTE 来避免(非索引)CTE 扫描 [我还用 NOT EXISTS(...) 替换了 DISTINCT ON(...) ]:


-- Latest submission per application, written as an anti-join (NOT EXISTS)
-- in place of DISTINCT ON (application_id) ... ORDER BY "timestamp" DESC:
-- a row is kept only when no other submission of the same application
-- sorts after it.
--
-- The column in submissions is named "timestamp"; it is exposed here under
-- the alias ztimestamp, so views selecting vsubm.ztimestamp keep working.
--
-- (timestamp, id) is compared as a pair: with a plain "timestamp >" test, two
-- submissions of one application sharing the same latest timestamp would both
-- survive and duplicate the application row in any join against this view.
-- The primary key id breaks such ties, leaving exactly one row per application.
--
-- Table names are unqualified and are therefore resolved through search_path.
CREATE OR REPLACE VIEW vsubm AS
SELECT
    s.application_id
    , s."timestamp" AS ztimestamp
    , s.exit_code
    , s.transaction_names_id
FROM submissions s
WHERE NOT EXISTS (
    SELECT *
    FROM submissions nx
    WHERE nx.application_id = s.application_id
    AND (nx."timestamp", nx.id) > (s."timestamp", s.id)
)
;

-- Same column list as application_list_simple, but the latest submission is
-- taken from the vsubm view instead of a CTE.
-- Both joins are LEFT JOINs, so applications without a submission still appear.
-- Object names are unqualified and are therefore resolved through search_path.
CREATE OR REPLACE VIEW application_list_simple2 AS
SELECT
    app.id,
    app.correlation_id,
    app.source_input,
    app.source_file_path,
    app.application_type,
    app.loan_id,
    app.vin,
    app.cooperation_name,
    app.cooperation_id,
    app.submitter_name,
    app.submitter_id,
    app.dealer_id,
    app.dealer_name,
    app.dealer_ext_id,
    app.invoice_id,
    app.stock_id,
    app.payment_term,
    app.reg_document_id,
    app.invoice_amount,
    app.application_status,
    app.incoming_timestamp AS incoming_timestamp,
    app.dealer_group_id,
    app.approver,
    app.approve_timestamp  AS approve_timestamp,
    vsubm.exit_code,
    tn.name                AS transaction_name,
    tn.sub_name            AS sub_transaction_name,
    tn.id                  AS transaction_type_id,
    vsubm.ztimestamp       AS last_submission_timestamp,
    app.modified_date
FROM applications AS app
LEFT JOIN vsubm
    ON vsubm.application_id = app.id
LEFT JOIN transaction_names AS tn
    ON tn.id = vsubm.transaction_names_id
;

-- Test query against application_list_simple2 for one day of applications.
-- The range is half-open (>= start, < next day), so rows stamped exactly at
-- midnight of 2021-11-09 are not included.
-- The commented-out lines are optional toggles: prefix the statement with
-- EXPLAIN / EXPLAIN ANALYZE, add the window count, or page the result.
-- EXPLAIN
-- explain analyze
SELECT *
        -- , count(*) OVER () AS total
FROM application_list_simple2
WHERE INCOMING_TIMESTAMP >= '2021-11-08' AND INCOMING_TIMESTAMP < '2021-11-09'
ORDER BY APPROVE_TIMESTAMP DESC, INCOMING_TIMESTAMP DESC
-- LIMIT 100 OFFSET 0
;

关于观察到的行为:

  • 选择位于数据时间范围边缘的日期区间,可能会生成与选择中间某段时间跨度时不同的执行计划。
  • 可能今天的记录统计还不完整(统计收集器可能落后了)
  • 糟糕的计划(大量哈希连接和顺序扫描)可能是由于缺少统计信息、缺少索引,或者 random_page_cost 设置得过高造成的。
  • 表的行宽相当大。也许需要做一些规范化(normalization),特别是 applications 表。
  • 混用带时区和不带时区的时间戳(timestamp with/without time zone)可能会造成一些混乱。[一般建议:始终使用带时区的时间戳(timestamp with time zone)]