PostgreSQL 中固定大小 JSONB 数组的聚合
Aggregation on fixed size JSONB array in PostgreSQL
我在对 PostgreSQL 数据库中的 JSONB 字段做聚合时遇到了困难。用一个例子可能更容易说明:假设创建并填充一个名为 analysis 的表,其中包含 2 列(id 和 analysis),如下所示:
-- Demo table: one JSONB document per row.
CREATE TABLE analysis (
    id       SERIAL PRIMARY KEY,  -- surrogate key
    analysis JSONB                -- sample rows hold a "category" string and a "results" array
);
-- Sample data: three "news" rows (two of them containing JSON nulls),
-- two "sport" rows and one "weather" row, each with 16 "results" entries.
INSERT INTO analysis (id, analysis)
VALUES
    (1, '{"category" : "news", "results" : [1, 2, 3, 4, 5 , 6, 7, 8, 9, 10, 11, 12, 13, 14, null, null]}'),
    (2, '{"category" : "news", "results" : [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, null, 26]}'),
    (3, '{"category" : "news", "results" : [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]}'),
    (4, '{"category" : "sport", "results" : [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66]}'),
    (5, '{"category" : "sport", "results" : [71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86]}'),
    (6, '{"category" : "weather", "results" : [91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106]}');
如您所见,analysis JSONB 字段始终包含 2 个属性:category 和 results。results 属性始终包含一个长度固定为 16 的数组。我尝试过各种函数(例如 jsonb_array_elements),而我想要实现的是:
- 按 analysis->'category' 分组
- 每个数组元素的平均值
我想要的是一条语句,返回按 category 分组的 3 行(即 news、sport 和 weather),每行带有一个长度固定为 16、包含各位置平均值的数组。更复杂的是,如果数组中有 null,则应忽略它们(即不能简单地求和后除以行数)。结果应如下所示:
category | analysis_average
-----------+--------------------------------------------------------------------------------------------------------------
"news" | [14.33, 15.33, 16.33, 17.33, 18.33, 19.33, 20.33, 21.33, 22.33, 23.33, 24.33, 25.33, 26.33, 27.33, 45, 36]
"sport" | [61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]
"weather" | [91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106]
注意:第一行最后 2 个数组项为 45 和 36,这表明 null 被忽略了。
我考虑过创建一个将数组分解为 16 列的视图,即
-- Exposes individual "results" entries as integer columns next to the base row.
CREATE VIEW analysis_view AS
SELECT
    a.*,
    (a.analysis -> 'results' ->> 0)::int AS result0,
    (a.analysis -> 'results' ->> 1)::int AS result1
    -- ... and so on for all 16 array entries ...
FROM analysis AS a;
这对我来说似乎非常不雅,并且首先消除了使用数组的优势,但可能可以使用这种方法将一些东西组合在一起。
任何指点或提示将不胜感激!
性能在这里也很重要,所以性能越高越好!
因为数组的长度始终相同,您可以使用 generate_series,而不必自己逐个键入数组元素的索引。将表与生成的序列做 CROSS JOIN,使每一行都与每个索引配对,从而取出数组中位置 s 处的元素;之后只需用 GROUP BY 聚合数据即可。
查询则变为:
-- Per-category, per-position average of the fixed-length "results" array.
WITH position_avg AS (
    -- One row per (category, array index). The ->> operator yields SQL NULL
    -- for a JSON null, and avg() skips NULL inputs.
    SELECT
        analysis -> 'category' AS category,
        idx,
        avg((analysis -> 'results' ->> idx)::numeric) AS val
    FROM analysis
    CROSS JOIN generate_series(0, 15) AS idx  -- 0-based indexes of the 16 entries
    GROUP BY category, idx
)
SELECT
    category,
    array_agg(val ORDER BY idx) AS analysis_average
FROM position_avg
GROUP BY category;
这里的 15 是数组的最后一个索引(16 - 1)。
这适用于任何长度的数组
-- Per-category, per-position average of "results" for arrays of any length.
-- Elements are expanded directly with WITH ORDINALITY (1-based subscript),
-- cast to numeric so non-integer JSON numbers are accepted, and rounded with
-- round(..., 2) so large averages cannot overflow a fixed numeric(5,2).
SELECT
    category,
    array_agg(average ORDER BY subscript) AS average
FROM (
    SELECT
        a.analysis ->> 'category' AS category,
        elem.subscript,
        -- a JSON null arrives as SQL NULL text, which avg() ignores
        round(avg(elem.v::numeric), 2) AS average
    FROM analysis AS a
    CROSS JOIN LATERAL jsonb_array_elements_text(a.analysis -> 'results')
        WITH ORDINALITY AS elem (v, subscript)
    GROUP BY a.analysis ->> 'category', elem.subscript
) AS per_position
GROUP BY category
;
category | average
----------+----------------------------------------------------------------------------------------------------------
news | {14.33,15.33,16.33,17.33,18.33,19.33,20.33,21.33,22.33,23.33,24.33,25.33,26.33,27.33,45.00,36.00}
sport | {61.00,62.00,63.00,64.00,65.00,66.00,67.00,68.00,69.00,70.00,71.00,72.00,73.00,74.00,75.00,76.00}
weather | {91.00,92.00,93.00,94.00,95.00,96.00,97.00,98.00,99.00,100.00,101.00,102.00,103.00,104.00,105.00,106.00}
可以用更传统的方式完成,比如
-- Per-category element-wise average using the custom array_math_avg aggregate.
SELECT
    (t.analysis -> 'category')::varchar,
    array_math_avg(
        -- turn the JSON "results" array into an SQL array for the aggregate
        ARRAY(SELECT jsonb_array_elements_text(t.analysis -> 'results')::int)
    )::numeric(9,2)[]
FROM analysis AS t
GROUP BY (t.analysis -> 'category')::varchar
ORDER BY (t.analysis -> 'category')::varchar;
但我们需要做一些准备:
-- Transition state for the array_math_avg aggregate.
CREATE TYPE t_array_math_agg AS (
    c INT[],     -- per-position counts (incremented by array_math_sum_f)
    a NUMERIC[]  -- per-position running sums
);
-- State-transition function for the array_math_avg aggregate.
--   $1: running state (c = per-position counts, a = per-position sums)
--   $2: next input array; a NULL or empty array leaves the state unchanged
-- Returns the updated state. NULL elements are skipped: they add to neither
-- the sum nor the count of their position, regardless of row order.
create or replace function array_math_sum_f(in t_array_math_agg, in numeric[]) returns t_array_math_agg as $$
declare
  r t_array_math_agg;
  i int;
begin
  -- Nothing to accumulate (array_lower is NULL for an empty array).
  if $2 is null or array_lower($2, 1) is null then
    return $1;
  end if;
  r := $1;
  for i in array_lower($2, 1)..array_upper($2, 1) loop
    if $2[i] is not null then
      r.a[i] := coalesce(r.a[i], 0) + $2[i];
      r.c[i] := coalesce(r.c[i], 0) + 1;
    elsif r.a[i] is null then
      -- Keep the slot present (as NULL) so the sums array spans every position seen.
      r.a[i] := null;
    end if;
  end loop;
  return r;
end; $$ immutable language plpgsql;
-- Final function for the array_math_avg aggregate.
--   $1: accumulated state (c = per-position counts, a = per-position sums)
-- Returns the per-position averages a[i] / c[i], or NULL when no sums were
-- accumulated. A position with no count yields NULL (division by NULL).
create or replace function array_math_avg_final(in t_array_math_agg) returns numeric[] as $$
declare
  r numeric[];
  i int;
begin
  -- array_lower is NULL for an empty sums array: nothing was aggregated.
  if array_lower($1.a, 1) is null then
    return null;
  end if;
  for i in array_lower($1.a, 1)..array_upper($1.a, 1) loop
    r[i] := $1.a[i] / $1.c[i];
  end loop;
  return r;
end; $$ immutable language plpgsql;
-- Aggregate over numeric arrays: array_math_sum_f accumulates the state and
-- array_math_avg_final turns it into the result array.
CREATE AGGREGATE array_math_avg (numeric[]) (
    SFUNC     = array_math_sum_f,
    STYPE     = t_array_math_agg,
    FINALFUNC = array_math_avg_final,
    INITCOND  = '({},{})'  -- start with empty counts and sums
);
我在对 PostgreSQL 数据库中的 JSONB 字段做聚合时遇到了困难。用一个例子可能更容易说明:假设创建并填充一个名为 analysis 的表,其中包含 2 列(id 和 analysis),如下所示:
-- Demo table: one JSONB document per row.
CREATE TABLE analysis (
    id       SERIAL PRIMARY KEY,  -- surrogate key
    analysis JSONB                -- sample rows hold a "category" string and a "results" array
);
-- Sample data: three "news" rows (two of them containing JSON nulls),
-- two "sport" rows and one "weather" row, each with 16 "results" entries.
INSERT INTO analysis (id, analysis)
VALUES
    (1, '{"category" : "news", "results" : [1, 2, 3, 4, 5 , 6, 7, 8, 9, 10, 11, 12, 13, 14, null, null]}'),
    (2, '{"category" : "news", "results" : [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, null, 26]}'),
    (3, '{"category" : "news", "results" : [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]}'),
    (4, '{"category" : "sport", "results" : [51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66]}'),
    (5, '{"category" : "sport", "results" : [71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86]}'),
    (6, '{"category" : "weather", "results" : [91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106]}');
如您所见,analysis JSONB 字段始终包含 2 个属性:category 和 results。results 属性始终包含一个长度固定为 16 的数组。我尝试过各种函数(例如 jsonb_array_elements),而我想要实现的是:
- 按 analysis->'category' 分组
- 每个数组元素的平均值
我想要的是一条语句,返回按 category 分组的 3 行(即 news、sport 和 weather),每行带有一个长度固定为 16、包含各位置平均值的数组。更复杂的是,如果数组中有 null,则应忽略它们(即不能简单地求和后除以行数)。结果应如下所示:
category | analysis_average
-----------+--------------------------------------------------------------------------------------------------------------
"news" | [14.33, 15.33, 16.33, 17.33, 18.33, 19.33, 20.33, 21.33, 22.33, 23.33, 24.33, 25.33, 26.33, 27.33, 45, 36]
"sport" | [61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]
"weather" | [91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106]
注意:第一行最后 2 个数组项为 45 和 36,这表明 null 被忽略了。
我考虑过创建一个将数组分解为 16 列的视图,即
-- Exposes individual "results" entries as integer columns next to the base row.
CREATE VIEW analysis_view AS
SELECT
    a.*,
    (a.analysis -> 'results' ->> 0)::int AS result0,
    (a.analysis -> 'results' ->> 1)::int AS result1
    -- ... and so on for all 16 array entries ...
FROM analysis AS a;
这对我来说似乎非常不雅,并且首先消除了使用数组的优势,但可能可以使用这种方法将一些东西组合在一起。
任何指点或提示将不胜感激!
性能在这里也很重要,所以性能越高越好!
因为数组的长度始终相同,您可以使用 generate_series,而不必自己逐个键入数组元素的索引。将表与生成的序列做 CROSS JOIN,使每一行都与每个索引配对,从而取出数组中位置 s 处的元素;之后只需用 GROUP BY 聚合数据即可。
查询则变为:
-- Per-category, per-position average of the fixed-length "results" array.
WITH position_avg AS (
    -- One row per (category, array index). The ->> operator yields SQL NULL
    -- for a JSON null, and avg() skips NULL inputs.
    SELECT
        analysis -> 'category' AS category,
        idx,
        avg((analysis -> 'results' ->> idx)::numeric) AS val
    FROM analysis
    CROSS JOIN generate_series(0, 15) AS idx  -- 0-based indexes of the 16 entries
    GROUP BY category, idx
)
SELECT
    category,
    array_agg(val ORDER BY idx) AS analysis_average
FROM position_avg
GROUP BY category;
这里的 15 是数组的最后一个索引(16 - 1)。
这适用于任何长度的数组
-- Per-category, per-position average of "results" for arrays of any length.
-- Elements are expanded directly with WITH ORDINALITY (1-based subscript),
-- cast to numeric so non-integer JSON numbers are accepted, and rounded with
-- round(..., 2) so large averages cannot overflow a fixed numeric(5,2).
SELECT
    category,
    array_agg(average ORDER BY subscript) AS average
FROM (
    SELECT
        a.analysis ->> 'category' AS category,
        elem.subscript,
        -- a JSON null arrives as SQL NULL text, which avg() ignores
        round(avg(elem.v::numeric), 2) AS average
    FROM analysis AS a
    CROSS JOIN LATERAL jsonb_array_elements_text(a.analysis -> 'results')
        WITH ORDINALITY AS elem (v, subscript)
    GROUP BY a.analysis ->> 'category', elem.subscript
) AS per_position
GROUP BY category
;
category | average
----------+----------------------------------------------------------------------------------------------------------
news | {14.33,15.33,16.33,17.33,18.33,19.33,20.33,21.33,22.33,23.33,24.33,25.33,26.33,27.33,45.00,36.00}
sport | {61.00,62.00,63.00,64.00,65.00,66.00,67.00,68.00,69.00,70.00,71.00,72.00,73.00,74.00,75.00,76.00}
weather | {91.00,92.00,93.00,94.00,95.00,96.00,97.00,98.00,99.00,100.00,101.00,102.00,103.00,104.00,105.00,106.00}
可以用更传统的方式完成,比如
-- Per-category element-wise average using the custom array_math_avg aggregate.
SELECT
    (t.analysis -> 'category')::varchar,
    array_math_avg(
        -- turn the JSON "results" array into an SQL array for the aggregate
        ARRAY(SELECT jsonb_array_elements_text(t.analysis -> 'results')::int)
    )::numeric(9,2)[]
FROM analysis AS t
GROUP BY (t.analysis -> 'category')::varchar
ORDER BY (t.analysis -> 'category')::varchar;
但我们需要做一些准备:
-- Transition state for the array_math_avg aggregate.
CREATE TYPE t_array_math_agg AS (
    c INT[],     -- per-position counts (incremented by array_math_sum_f)
    a NUMERIC[]  -- per-position running sums
);
-- State-transition function for the array_math_avg aggregate.
--   $1: running state (c = per-position counts, a = per-position sums)
--   $2: next input array; a NULL or empty array leaves the state unchanged
-- Returns the updated state. NULL elements are skipped: they add to neither
-- the sum nor the count of their position, regardless of row order.
create or replace function array_math_sum_f(in t_array_math_agg, in numeric[]) returns t_array_math_agg as $$
declare
  r t_array_math_agg;
  i int;
begin
  -- Nothing to accumulate (array_lower is NULL for an empty array).
  if $2 is null or array_lower($2, 1) is null then
    return $1;
  end if;
  r := $1;
  for i in array_lower($2, 1)..array_upper($2, 1) loop
    if $2[i] is not null then
      r.a[i] := coalesce(r.a[i], 0) + $2[i];
      r.c[i] := coalesce(r.c[i], 0) + 1;
    elsif r.a[i] is null then
      -- Keep the slot present (as NULL) so the sums array spans every position seen.
      r.a[i] := null;
    end if;
  end loop;
  return r;
end; $$ immutable language plpgsql;
-- Final function for the array_math_avg aggregate.
--   $1: accumulated state (c = per-position counts, a = per-position sums)
-- Returns the per-position averages a[i] / c[i], or NULL when no sums were
-- accumulated. A position with no count yields NULL (division by NULL).
create or replace function array_math_avg_final(in t_array_math_agg) returns numeric[] as $$
declare
  r numeric[];
  i int;
begin
  -- array_lower is NULL for an empty sums array: nothing was aggregated.
  if array_lower($1.a, 1) is null then
    return null;
  end if;
  for i in array_lower($1.a, 1)..array_upper($1.a, 1) loop
    r[i] := $1.a[i] / $1.c[i];
  end loop;
  return r;
end; $$ immutable language plpgsql;
-- Aggregate over numeric arrays: array_math_sum_f accumulates the state and
-- array_math_avg_final turns it into the result array.
CREATE AGGREGATE array_math_avg (numeric[]) (
    SFUNC     = array_math_sum_f,
    STYPE     = t_array_math_agg,
    FINALFUNC = array_math_avg_final,
    INITCOND  = '({},{})'  -- start with empty counts and sums
);