将随机生成的数据插入 PostgreSQL 极其缓慢;
Insert of randomly generated data into PostgreSQL extremely slow;
出于测试目的,我需要将一个非常大的数据集插入 table。我已经为此目的创建了一个脚本来生成纯随机数据。它有效,但执行速度太慢,照这个速度不知要等到何年何月才能跑完。
一些细节:
- 来源 table 有 1.5 亿条记录
- 每个来源生成三条统计记录(statistics_per_source = 3)
- 睡眠时间为 5
- 目前插入 1000 次大约需要 4 分钟
- PostgreSQL-12
问题:我可以采取什么步骤来使下面的脚本运行得比现在快得多,或者我会采取什么替代方法来插入如此大的随机数据?
-- Empty the target table and reset its identity sequence before regenerating data.
TRUNCATE TABLE public.statistic RESTART IDENTITY;
-- Progress marker: a literal SELECT echoed back to the client running the script.
SELECT 'Creating View Statistics for Sources' as progress;
DO $$
-- Bulk-generates random statistic rows: for every source row, insert
-- `statistics_per_source` rows into statistic, committing in chunks of
-- `commit_chunk_size` source ids, with progress logging to Progress.
DECLARE
    -- Renamed from "sleep": a variable named like the Constants.sleep column
    -- made "SELECT sleep INTO sleep" ambiguous, which errors out under the
    -- default plpgsql.variable_conflict = error setting.
    sleep_seconds integer;
    sps           integer;       -- statistic rows generated per source row
    start_id      integer := 1;  -- highest source.id already processed
    increment     integer;       -- chunk size: source rows per transaction
BEGIN
    -- One scan of Constants instead of three separate SELECTs.
    SELECT sleep, statistics_per_source, commit_chunk_size
      INTO sleep_seconds, sps, increment
      FROM Constants;

    INSERT INTO Progress(dt, operation, progress)
    VALUES (now(), 'statistics from source', 'BEGIN INSERT');

    LOOP
        -- EXISTS stops at the first matching row; the original
        -- count(*) aggregated every remaining row of a ~150M-row
        -- table on each iteration (and its LIMIT 1 was a no-op on
        -- a single-row aggregate result).
        EXIT WHEN NOT EXISTS (SELECT 1 FROM source WHERE id > start_id);

        INSERT INTO Progress(dt, operation, progress)
        VALUES (now(), 'statistics from source', 'Beginning Source=' || start_id);

        -- Each source row in the chunk is cross-joined against
        -- generate_series(1, sps) to fan out sps random statistic rows.
        INSERT INTO statistic(created, value, classtype, source_id, source_created, brand)
        SELECT
            date(src.created + trunc(random() * 20) * '1 day'::interval),  -- created + 0..19 days
            (random() * 100000)::int,
            CASE WHEN random() > 0.5 THEN 'Views' ELSE 'CTR' END,
            src.id,
            src.created,
            NULL
        FROM source AS src
        CROSS JOIN generate_series(1, sps) AS s(value)
        WHERE src.id BETWEEN start_id + 1 AND start_id + increment;

        INSERT INTO Progress(dt, operation, progress)
        VALUES (now(), 'statistics from source', 'Committing source=' || start_id);
        COMMIT;  -- allowed in a DO block since PostgreSQL 11

        -- NOTE(review): with sleep = 5 and commit_chunk_size = 1000 this
        -- pause dominates the runtime (5 s per 1000 source rows, on top of
        -- the insert itself). Consider sleep = 0 or a much larger chunk.
        PERFORM pg_sleep(sleep_seconds);
        start_id := start_id + increment;
    END LOOP;
END $$;
table 看起来像这样;我现在有意避免为插入性能创建索引。
-- Target table for the randomly generated statistics.
-- NOTE(review): only the PK index exists; secondary indexes are deliberately
-- deferred until after the bulk load to keep insert cost low.
CREATE TABLE public.statistic
(
id bigint NOT NULL GENERATED ALWAYS AS IDENTITY ( INCREMENT 1 START 1 MINVALUE 1 MAXVALUE 9223372036854775807 CACHE 1 ),  -- surrogate key
created date NOT NULL,  -- generator writes source.created + a random 0-19 day offset
value double precision NOT NULL,  -- generator inserts a random int in [0, 100000)
classtype text COLLATE pg_catalog."default",  -- generator writes 'Views' or 'CTR'
data_source integer,  -- not populated by the generator script
production integer,  -- not populated by the generator script
source_id bigint,  -- holds source.id (no FK constraint declared)
source_created date,  -- copy of source.created
brand integer,  -- generator inserts NULL
CONSTRAINT statistics_pk PRIMARY KEY (id)
)
使用带 WHERE 条件的 count(*) 可能会拖慢速度;可以改为在插入的行数为零时退出循环(@@rowcount 是 SQL Server 的写法,在 PostgreSQL 的 PL/pgSQL 中对应的是 GET DIAGNOSTICS … ROW_COUNT)。
出于测试目的,我需要将一个非常大的数据集插入 table。我已经为此目的创建了一个脚本来生成纯随机数据。它有效,但执行速度太慢,照这个速度不知要等到何年何月才能跑完。
一些细节:
- 来源 table 有 1.5 亿条记录
- 每个来源生成三条统计记录(statistics_per_source = 3)
- 睡眠时间为 5
- 目前插入 1000 次大约需要 4 分钟
- PostgreSQL-12
问题:我可以采取什么步骤来使下面的脚本运行得比现在快得多,或者我会采取什么替代方法来插入如此大的随机数据?
-- Empty the target table and reset its identity sequence before regenerating data.
TRUNCATE TABLE public.statistic RESTART IDENTITY;
-- Progress marker: a literal SELECT echoed back to the client running the script.
SELECT 'Creating View Statistics for Sources' as progress;
DO $$
-- Bulk-generates random statistic rows: for every source row, insert
-- `statistics_per_source` rows into statistic, committing in chunks of
-- `commit_chunk_size` source ids, with progress logging to Progress.
DECLARE
    -- Renamed from "sleep": a variable named like the Constants.sleep column
    -- made "SELECT sleep INTO sleep" ambiguous, which errors out under the
    -- default plpgsql.variable_conflict = error setting.
    sleep_seconds integer;
    sps           integer;       -- statistic rows generated per source row
    start_id      integer := 1;  -- highest source.id already processed
    increment     integer;       -- chunk size: source rows per transaction
BEGIN
    -- One scan of Constants instead of three separate SELECTs.
    SELECT sleep, statistics_per_source, commit_chunk_size
      INTO sleep_seconds, sps, increment
      FROM Constants;

    INSERT INTO Progress(dt, operation, progress)
    VALUES (now(), 'statistics from source', 'BEGIN INSERT');

    LOOP
        -- EXISTS stops at the first matching row; the original
        -- count(*) aggregated every remaining row of a ~150M-row
        -- table on each iteration (and its LIMIT 1 was a no-op on
        -- a single-row aggregate result).
        EXIT WHEN NOT EXISTS (SELECT 1 FROM source WHERE id > start_id);

        INSERT INTO Progress(dt, operation, progress)
        VALUES (now(), 'statistics from source', 'Beginning Source=' || start_id);

        -- Each source row in the chunk is cross-joined against
        -- generate_series(1, sps) to fan out sps random statistic rows.
        INSERT INTO statistic(created, value, classtype, source_id, source_created, brand)
        SELECT
            date(src.created + trunc(random() * 20) * '1 day'::interval),  -- created + 0..19 days
            (random() * 100000)::int,
            CASE WHEN random() > 0.5 THEN 'Views' ELSE 'CTR' END,
            src.id,
            src.created,
            NULL
        FROM source AS src
        CROSS JOIN generate_series(1, sps) AS s(value)
        WHERE src.id BETWEEN start_id + 1 AND start_id + increment;

        INSERT INTO Progress(dt, operation, progress)
        VALUES (now(), 'statistics from source', 'Committing source=' || start_id);
        COMMIT;  -- allowed in a DO block since PostgreSQL 11

        -- NOTE(review): with sleep = 5 and commit_chunk_size = 1000 this
        -- pause dominates the runtime (5 s per 1000 source rows, on top of
        -- the insert itself). Consider sleep = 0 or a much larger chunk.
        PERFORM pg_sleep(sleep_seconds);
        start_id := start_id + increment;
    END LOOP;
END $$;
table 看起来像这样;我现在有意避免为插入性能创建索引。
-- Target table for the randomly generated statistics.
-- NOTE(review): only the PK index exists; secondary indexes are deliberately
-- deferred until after the bulk load to keep insert cost low.
CREATE TABLE public.statistic
(
id bigint NOT NULL GENERATED ALWAYS AS IDENTITY ( INCREMENT 1 START 1 MINVALUE 1 MAXVALUE 9223372036854775807 CACHE 1 ),  -- surrogate key
created date NOT NULL,  -- generator writes source.created + a random 0-19 day offset
value double precision NOT NULL,  -- generator inserts a random int in [0, 100000)
classtype text COLLATE pg_catalog."default",  -- generator writes 'Views' or 'CTR'
data_source integer,  -- not populated by the generator script
production integer,  -- not populated by the generator script
source_id bigint,  -- holds source.id (no FK constraint declared)
source_created date,  -- copy of source.created
brand integer,  -- generator inserts NULL
CONSTRAINT statistics_pk PRIMARY KEY (id)
)
使用带 WHERE 条件的 count(*) 可能会拖慢速度;可以改为在插入的行数为零时退出循环(@@rowcount 是 SQL Server 的写法,在 PostgreSQL 的 PL/pgSQL 中对应的是 GET DIAGNOSTICS … ROW_COUNT)。