从 tsvector 获取按位置信息而不是字母顺序排序的数组?
from tsvector get array sorted by positional information instead of alphabetical order?
Postgresql tsvector 类型包含按字母顺序排序的词位以及位置信息。
如何从 tsvector 中获取已排序的词位数组,按位置信息而不是字母顺序排序?
没有现成的内置函数可以直接做到这一点,但您可以编写自定义函数来实现所需的效果,例如:
-- Returns the lexemes of a tsvector as a text[] ordered by position instead
-- of alphabetically. Each element has the form  position:'lexeme'  and a
-- lexeme that occurs at several positions produces one element per position.
-- Lexemes that carry no positions contribute nothing, and an empty tsvector
-- yields NULL because array_agg has no rows to aggregate.
create or replace function reorder_tsvector(vector tsvector)
returns text[] language sql as $$
    -- Read lexemes and positions through unnest(tsvector) rather than parsing
    -- the text representation: weighted entries such as 'fat':2A and lexemes
    -- containing spaces or colons cannot be split reliably as text.
    -- The lexeme is a tie-breaker so entries sharing a position come out in a
    -- deterministic order.
    select array_agg(
               concat(p.pos, ':', quote_literal(t.lexeme))
               order by p.pos, t.lexeme
           )
    from unnest(vector) as t
    cross join lateral unnest(t.positions) as p(pos)
$$;
select to_tsvector('english', 'a fat cat sat on a mat - it ate a fat rats');
to_tsvector
-----------------------------------------------------
'ate':9 'cat':3 'fat':2,11 'mat':7 'rat':12 'sat':4
(1 row)
select reorder_tsvector(to_tsvector('english', 'a fat cat sat on a mat - it ate a fat rats'));
reorder_tsvector
-------------------------------------------------------------
{2:'fat',3:'cat',4:'sat',7:'mat',9:'ate',11:'fat',12:'rat'}
(1 row)
更新:下面是我对提问者(OP)函数的改写版本:
-- Tokenizes p_string with the text search configuration p_dictionary and
-- returns the resulting lexemes as a text[] in document order. Each distinct
-- lexeme appears once. Returns NULL when the document produces no lexemes,
-- because array_agg has no rows to aggregate.
create or replace function tokenize_orig_1(p_string text, p_dictionary regconfig)
returns text[] language sql as $$
    -- to_tsvector takes the configuration first and the document second.
    -- Ordering by the positions array compares it element by element, so the
    -- first recorded position decides; lexeme is only a tie-breaker.
    select array_agg(arr.lexeme order by arr.positions, arr.lexeme)
    from unnest(to_tsvector(p_dictionary, p_string)) as arr
$$;
select tokenize_orig_1('a fat cat sat on a mat - it ate a fat rats', 'English');
tokenize_orig_1
---------------------------
{fat,cat,sat,mat,ate,rat}
(1 row)
或者保留重复出现的词位:
-- Tokenizes p_string with the text search configuration p_dictionary and
-- returns the lexemes as a text[] in document order, keeping repetitions:
-- a lexeme that occurs at several positions appears once per position.
-- Returns NULL when the document produces no lexemes.
create or replace function tokenize_orig_2(p_string text, p_dictionary regconfig)
returns text[] language sql as $$
    -- to_tsvector takes the configuration first and the document second.
    -- Each lexeme is expanded to one row per position, then the rows are
    -- aggregated by position (lexeme is only a tie-breaker).
    select array_agg(arr.lexeme order by p.pos, arr.lexeme)
    from unnest(to_tsvector(p_dictionary, p_string)) as arr
    cross join lateral unnest(arr.positions) as p(pos)
$$;
select tokenize_orig_2('a fat cat sat on a mat - it ate a fat rats', 'English');
tokenize_orig_2
-------------------------------
{fat,cat,sat,mat,ate,fat,rat}
(1 row)
下面这个函数似乎可以满足需求;如果有更好的方法,请指教:
-- Tokenizes p_string with the text search configuration p_dictionary and
-- returns the distinct lexemes as a text[] in document order.
-- STRICT: a NULL argument yields NULL without running the body.
-- Also returns NULL when the document produces no lexemes.
CREATE OR REPLACE FUNCTION public.tokenize_orig(
    p_string text,
    p_dictionary regconfig)
RETURNS text[] AS
$BODY$
/* This function turns documents into array of lexemes, keeping original order of lexemes. */
-- to_tsvector takes the configuration first and the document second.
-- The ordering lives inside array_agg: a subquery ORDER BY is not guaranteed
-- to carry over into an aggregate that has no ORDER BY of its own.
SELECT array_agg(arr.lexeme ORDER BY arr.positions)
FROM unnest(to_tsvector(p_dictionary, p_string)) AS arr
-- Only lexemes that carry at least one position can be placed in order.
WHERE array_length(arr.positions, 1) > 0
$BODY$
LANGUAGE sql IMMUTABLE STRICT
COST 100;
Postgresql tsvector 类型包含按字母顺序排序的词位以及位置信息。 如何从 tsvector 中获取已排序的词位数组,按位置信息而不是字母顺序排序?
没有现成的内置函数可以直接做到这一点,但您可以编写自定义函数来实现所需的效果,例如:
-- Returns the lexemes of a tsvector as a text[] ordered by position instead
-- of alphabetically. Each element has the form  position:'lexeme'  and a
-- lexeme that occurs at several positions produces one element per position.
-- Lexemes that carry no positions contribute nothing, and an empty tsvector
-- yields NULL because array_agg has no rows to aggregate.
create or replace function reorder_tsvector(vector tsvector)
returns text[] language sql as $$
    -- Read lexemes and positions through unnest(tsvector) rather than parsing
    -- the text representation: weighted entries such as 'fat':2A and lexemes
    -- containing spaces or colons cannot be split reliably as text.
    -- The lexeme is a tie-breaker so entries sharing a position come out in a
    -- deterministic order.
    select array_agg(
               concat(p.pos, ':', quote_literal(t.lexeme))
               order by p.pos, t.lexeme
           )
    from unnest(vector) as t
    cross join lateral unnest(t.positions) as p(pos)
$$;
select to_tsvector('english', 'a fat cat sat on a mat - it ate a fat rats');
to_tsvector
-----------------------------------------------------
'ate':9 'cat':3 'fat':2,11 'mat':7 'rat':12 'sat':4
(1 row)
select reorder_tsvector(to_tsvector('english', 'a fat cat sat on a mat - it ate a fat rats'));
reorder_tsvector
-------------------------------------------------------------
{2:'fat',3:'cat',4:'sat',7:'mat',9:'ate',11:'fat',12:'rat'}
(1 row)
更新:下面是我对提问者(OP)函数的改写版本:
-- Tokenizes p_string with the text search configuration p_dictionary and
-- returns the resulting lexemes as a text[] in document order. Each distinct
-- lexeme appears once. Returns NULL when the document produces no lexemes,
-- because array_agg has no rows to aggregate.
create or replace function tokenize_orig_1(p_string text, p_dictionary regconfig)
returns text[] language sql as $$
    -- to_tsvector takes the configuration first and the document second.
    -- Ordering by the positions array compares it element by element, so the
    -- first recorded position decides; lexeme is only a tie-breaker.
    select array_agg(arr.lexeme order by arr.positions, arr.lexeme)
    from unnest(to_tsvector(p_dictionary, p_string)) as arr
$$;
select tokenize_orig_1('a fat cat sat on a mat - it ate a fat rats', 'English');
tokenize_orig_1
---------------------------
{fat,cat,sat,mat,ate,rat}
(1 row)
或者保留重复出现的词位:
-- Tokenizes p_string with the text search configuration p_dictionary and
-- returns the lexemes as a text[] in document order, keeping repetitions:
-- a lexeme that occurs at several positions appears once per position.
-- Returns NULL when the document produces no lexemes.
create or replace function tokenize_orig_2(p_string text, p_dictionary regconfig)
returns text[] language sql as $$
    -- to_tsvector takes the configuration first and the document second.
    -- Each lexeme is expanded to one row per position, then the rows are
    -- aggregated by position (lexeme is only a tie-breaker).
    select array_agg(arr.lexeme order by p.pos, arr.lexeme)
    from unnest(to_tsvector(p_dictionary, p_string)) as arr
    cross join lateral unnest(arr.positions) as p(pos)
$$;
select tokenize_orig_2('a fat cat sat on a mat - it ate a fat rats', 'English');
tokenize_orig_2
-------------------------------
{fat,cat,sat,mat,ate,fat,rat}
(1 row)
下面这个函数似乎可以满足需求;如果有更好的方法,请指教:
-- Tokenizes p_string with the text search configuration p_dictionary and
-- returns the distinct lexemes as a text[] in document order.
-- STRICT: a NULL argument yields NULL without running the body.
-- Also returns NULL when the document produces no lexemes.
CREATE OR REPLACE FUNCTION public.tokenize_orig(
    p_string text,
    p_dictionary regconfig)
RETURNS text[] AS
$BODY$
/* This function turns documents into array of lexemes, keeping original order of lexemes. */
-- to_tsvector takes the configuration first and the document second.
-- The ordering lives inside array_agg: a subquery ORDER BY is not guaranteed
-- to carry over into an aggregate that has no ORDER BY of its own.
SELECT array_agg(arr.lexeme ORDER BY arr.positions)
FROM unnest(to_tsvector(p_dictionary, p_string)) AS arr
-- Only lexemes that carry at least one position can be placed in order.
WHERE array_length(arr.positions, 1) > 0
$BODY$
LANGUAGE sql IMMUTABLE STRICT
COST 100;