How can I optimize an SQL query for calculating word frequency?
I am trying to populate two tables:
token:
word    | df (the number of documents containing a word)
========================================================
"dog"   | 5
"cat"   | 2
"horse" | 1
token_count:
tokenid | docid | tf (the number of times a word occurs in a document)
======================================================================
1       | 1     | 6
2       | 2     | 2
3       | 2     | 1
using data from documents:
id | title | body
=============================
1 | "dog" | "about dogs"
2 | "cats" | "about cats"
For this I use ts_stat( 'select to_tsvector(''english'', body) from documents' ), which returns a table with each word's document frequency and the number of times the word occurs in the entire column. While the second column (ndoc) is exactly what the token table needs, the third column (nentry) counts occurrences across the whole column, not per document.
word | ndoc | nentry
====================
dog | 5 | 6
cat | 2 | 2
horse| 1 | 1
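For reference, the raw ts_stat output above can be reproduced with a query like the following (a minimal sketch against the sample documents table; the ORDER BY is only for readability):

SELECT word, ndoc, nentry
FROM ts_stat('select to_tsvector(''english'', body) from documents')
ORDER BY ndoc DESC;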
This code populates the token table and completes within 3 seconds for a hundred documents.
INSERT INTO token (word, document_frequency)
SELECT
word,
ndoc
FROM
ts_stat( 'select to_tsvector(''english'', body) from documents' );
I tried running the following code on a smaller dataset with 15 documents and it worked, but when I ran it on the current dataset (100 documents) it never stopped running.
WITH temp_data AS (
    SELECT id,
           (ts_stat('select to_tsvector(''english'', body) from documents where id='||id)).*
    FROM documents
)
INSERT INTO token_count (docid, tokenid, tf)
SELECT
    id,
    (SELECT id FROM token WHERE word = temp_data.word LIMIT 1),
    nentry
FROM temp_data;
How can I optimize this query?
EXPLAIN ANALYZE on the dataset with 15 documents:
"Insert on token_count (cost=1023803.22..1938766428.23 rows=9100000 width=28) (actual time=59875.204..59875.206 rows=0 loops=1)"
" CTE temp_data"
" -> Result (cost=0.00..1023803.22 rows=9100000 width=44) (actual time=0.144..853.320 rows=42449 loops=1)"
" -> ProjectSet (cost=0.00..45553.23 rows=9100000 width=36) (actual time=0.142..809.366 rows=42449 loops=1)"
" -> Seq Scan on wikitable (cost=0.00..19.10 rows=910 width=4) (actual time=0.010..0.029 rows=16 loops=1)"
" -> CTE Scan on temp_data (cost=0.00..1937742625.00 rows=9100000 width=28) (actual time=0.509..59652.279 rows=42449 loops=1)"
" SubPlan 2"
" -> Limit (cost=0.00..212.92 rows=1 width=4) (actual time=1.381..1.381 rows=1 loops=42449)"
" -> Seq Scan on token (cost=0.00..425.84 rows=2 width=4) (actual time=1.372..1.372 rows=1 loops=42449)"
" Filter: ((word)::text = temp_data.word)"
" Rows Removed by Filter: 10384"
"Planning Time: 0.202 ms"
"Execution Time: 59876.350 ms"
EXPLAIN ANALYZE on the dataset with 30 documents:
"Insert on token_count (cost=1023803.22..6625550803.23 rows=9100000 width=28) (actual time=189910.438..189910.439 rows=0 loops=1)"
" CTE temp_data"
" -> Result (cost=0.00..1023803.22 rows=9100000 width=44) (actual time=0.191..2018.758 rows=92168 loops=1)"
" -> ProjectSet (cost=0.00..45553.23 rows=9100000 width=36) (actual time=0.189..1919.726 rows=92168 loops=1)"
" -> Seq Scan on wikitable (cost=0.00..19.10 rows=910 width=4) (actual time=0.013..0.053 rows=31 loops=1)"
" -> CTE Scan on temp_data (cost=0.00..6624527000.00 rows=9100000 width=28) (actual time=1.009..189412.022 rows=92168 loops=1)"
" SubPlan 2"
" -> Limit (cost=0.00..727.95 rows=1 width=4) (actual time=2.029..2.029 rows=1 loops=92168)"
" -> Seq Scan on token (cost=0.00..727.95 rows=1 width=4) (actual time=2.020..2.020 rows=1 loops=92168)"
" Filter: ((word)::text = temp_data.word)"
" Rows Removed by Filter: 16463"
"Planning Time: 0.234 ms"
"Execution Time: 189913.688 ms"
Here is a demo that gets the word counts without using ts_stat. Instead, it uses a lateral join to unnest the tsvector.
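To see what the lateral join works with: unnest expands a tsvector into one row per lexeme together with its position array, so cardinality(positions) gives the per-document term frequency. A standalone illustration using one of the sample bodies:

SELECT lexeme, positions, cardinality(positions) AS tf
FROM unnest(to_tsvector('english', 'cats kill more birds than dogs kill cats'));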
create table documents (
document_id serial primary key,
title varchar(30) not null,
body text not null
);
insert into documents (title, body) values
('dogs', 'the dog barked at the cat, but the cat ignored her.')
, ('cats', 'cats kill more birds than dogs kill cats');
create table tokens (
token_id serial primary key,
word varchar(30),
df int
);
insert into tokens (word, df)
SELECT word, ndoc
FROM ts_stat('select to_tsvector(''english'', body) from documents');
select * from tokens order by df desc;
token_id | word  | df
---------|-------|---
       3 | dog   |  2
       4 | cat   |  2
       1 | kill  |  1
       2 | ignor |  1
       5 | bird  |  1
       6 | bark  |  1
create table token_counts (
document_id int,
token_id int,
tf int,
primary key (document_id, token_id),
foreign key (document_id) references documents(document_id),
foreign key (token_id) references tokens(token_id)
);
INSERT INTO token_counts (document_id, token_id, tf)
select
    doc.document_id,
    tok.token_id,
    lex.total
from documents as doc
cross join lateral (
    -- one row per lexeme in this document's tsvector,
    -- with cardinality(positions) as the per-document count
    select lexeme, cardinality(positions) as total
    from unnest(to_tsvector('english', doc.body)) as tsvector
) as lex
inner join tokens as tok
    on tok.word = lex.lexeme;
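Because tokens was populated by ts_stat over the same documents, every unnested lexeme should find a match in the inner join. As a side note (an alternative not in the original answer), once token_counts is filled, df could be recomputed from it directly instead of running a second full-text pass:

-- Hedged sketch: derive df (documents containing the token) from token_counts.
UPDATE tokens t
SET df = c.ndoc
FROM (
    SELECT token_id, count(*) AS ndoc
    FROM token_counts
    GROUP BY token_id
) c
WHERE c.token_id = t.token_id;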
select title, word, tf
from token_counts cnt
join documents doc using(document_id)
join tokens tok using(token_id)
order by document_id, token_id;
title | word  | tf
------|-------|---
dogs  | ignor |  1
dogs  | dog   |  1
dogs  | cat   |  2
dogs  | bark  |  1
cats  | kill  |  2
cats  | dog   |  1
cats  | cat   |  2
cats  | bird  |  1
演示 db<>fiddle here