优化用于人脸欧氏距离计算的大型 SQL 查询
Large SQL Request optimization for Faces Euclidean Distances calculations
我正在计算人脸之间的欧几里得距离,并希望将结果存储在一张表中。
当前设置:
- 每个人脸都存储在 objects 表中,人脸之间的距离存储在 face_distances 表中。
- objects 表具有以下列:objects_id、face_encodings、description
- face_distances 表具有以下列:face_from、face_to、distance
在我的数据集中,我有大约 22 231 个人脸对象,这些对象会产生 494 217 361 个人脸对——不过我知道这个数量可以减半,因为
distance(face_from, face_to) = distance(face_to, face_from)
数据库是 Postgres 12。
下面的查询可以插入尚未计算距离的人脸对(只插入配对,不计算距离),但是执行时间非常非常长(4 天前就开始运行,至今还没有结束)。有没有办法优化它?
'''
-- Definition of public.objects. Type names are spelled in their standard
-- form (integer, double precision[]); columns without NOT NULL are nullable.
-- To rebuild from scratch, run first: DROP TABLE public.objects;
CREATE TABLE public.objects
(
    objects_id            integer            NOT NULL DEFAULT nextval('objects_in_image_objects_id_seq'::regclass),
    filefullname          varchar(2303),
    bbox                  varchar(255),
    description           varchar(255),
    confidence            numeric,
    analyzer              varchar(255)       NOT NULL DEFAULT 'object_detector'::character varying,
    analyzer_version      integer            NOT NULL DEFAULT 100,
    x                     integer,
    y                     integer,
    w                     integer,
    h                     integer,
    image_id              integer,
    derived_from_object   integer,
    object_image_filename varchar(2023),
    face_encodings        double precision[],
    face_id               integer,
    face_id_iteration     integer,
    text_found            varchar            COLLATE "C.UTF-8",
    CONSTRAINT objects_in_image_pkey PRIMARY KEY (objects_id),
    -- NOTE(review): this foreign key points objects_id at itself, so every row
    -- satisfies it trivially; confirm which column was meant to be constrained.
    CONSTRAINT objects_in_images FOREIGN KEY (objects_id)
        REFERENCES public.objects (objects_id)
);
-- One row per (face_from, face_to) pair; distance is nullable, so a pair can
-- be stored before its distance is known.
CREATE TABLE public.face_distances
(
    face_from bigint           NOT NULL,
    face_to   bigint           NOT NULL,
    distance  double precision,
    CONSTRAINT face_distances_pk PRIMARY KEY (face_from, face_to)
);
-- Foreign keys of public.face_distances: both ends of a pair must reference
-- an existing public.objects row. Added in a single ALTER TABLE command.
ALTER TABLE public.face_distances
    ADD CONSTRAINT face_distances_fk
        FOREIGN KEY (face_from) REFERENCES public.objects (objects_id),
    ADD CONSTRAINT face_distances_fk_1
        FOREIGN KEY (face_to) REFERENCES public.objects (objects_id);
索引
-- The two unique indexes below carry the same names as the PRIMARY KEY
-- constraints objects_in_image_pkey and face_distances_pk. PostgreSQL builds
-- an index with the constraint's name when the constraint is created, so a
-- plain CREATE INDEX would fail with "relation ... already exists".
-- IF NOT EXISTS makes each statement a no-op (with a notice) in that case.
CREATE UNIQUE INDEX IF NOT EXISTS objects_in_image_pkey
    ON public.objects USING btree (objects_id);

-- Supports equality filters on description (e.g. description = 'face').
CREATE INDEX IF NOT EXISTS objects_description_column
    ON public.objects USING btree (description);

CREATE UNIQUE INDEX IF NOT EXISTS face_distances_pk
    ON public.face_distances USING btree (face_from, face_to);
以下查询用于插入 face_distances 表中尚不存在的所有人脸对。
-- Insert every ordered pair of 'face' objects (self-pairs and both
-- orderings included) that has no row in face_distances yet.
-- distance is not supplied and therefore stays NULL.
INSERT INTO face_distances (face_from, face_to)
SELECT
    src.objects_id,
    dst.objects_id
FROM objects AS src
CROSS JOIN objects AS dst
WHERE src.description = 'face'
  AND dst.description = 'face'
  -- Anti-join: keep only pairs that are not stored already.
  AND NOT EXISTS (
        SELECT 1
        FROM face_distances AS known
        WHERE known.face_from = src.objects_id
          AND known.face_to = dst.objects_id
      );
试试这个简化的查询。
在我的 Apple M1 上用 SQL Server 运行只花了 5 分钟:共有 22 231 个 'face' 对象,生成了 247 097 565 对,即 C(22231,2)。语法与 PostgreSQL 兼容。
优化:使用显式 JOIN 代替旧式的逗号连接;使用排名函数去除重复排列 (A,B)=(B,A);
去掉了最后的 [left join face_distances]:向空表重新插入全部结果比逐对检查是否已存在要快得多,因为后者会对每个键对执行一次索引查找。
-- Insert each unordered pair of distinct 'face' objects exactly once
-- (face_from < face_to); distance is not supplied and stays NULL.
--
-- The inequality join returns the same rows as generating every permutation
-- and filtering it with dense_rank()/rank() (rank2 >= rank1): objects_id is
-- a unique key, so that rank comparison holds exactly when
-- f1.objects_id < f2.objects_id. The inequality avoids the window sorts
-- over the full permutation set.
--
-- No existence check is made: a pair that is already stored violates the
-- (face_from, face_to) primary key and aborts the statement, so run this
-- against a face_distances table that does not contain these pairs.
INSERT INTO face_distances (face_from, face_to)
SELECT
    f1.objects_id,
    f2.objects_id
FROM objects AS f1
INNER JOIN objects AS f2
    ON f2.objects_id > f1.objects_id
   AND f2.description = 'face'
WHERE f1.description = 'face';
我正在计算人脸之间的欧几里得距离,并希望将结果存储在一张表中。
当前设置:
- 每个人脸都存储在 objects 表中,人脸之间的距离存储在 face_distances 表中。
- objects 表具有以下列:objects_id、face_encodings、description
- face_distances 表具有以下列:face_from、face_to、distance
在我的数据集中,我有大约 22 231 个人脸对象,这些对象会产生 494 217 361 个人脸对——不过我知道这个数量可以减半,因为
distance(face_from, face_to) = distance(face_to, face_from)
数据库是 Postgres 12。
下面的查询可以插入尚未计算距离的人脸对(只插入配对,不计算距离),但是执行时间非常非常长(4 天前就开始运行,至今还没有结束)。有没有办法优化它?
'''
-- Definition of public.objects. Type names are spelled in their standard
-- form (integer, double precision[]); columns without NOT NULL are nullable.
-- To rebuild from scratch, run first: DROP TABLE public.objects;
CREATE TABLE public.objects
(
    objects_id            integer            NOT NULL DEFAULT nextval('objects_in_image_objects_id_seq'::regclass),
    filefullname          varchar(2303),
    bbox                  varchar(255),
    description           varchar(255),
    confidence            numeric,
    analyzer              varchar(255)       NOT NULL DEFAULT 'object_detector'::character varying,
    analyzer_version      integer            NOT NULL DEFAULT 100,
    x                     integer,
    y                     integer,
    w                     integer,
    h                     integer,
    image_id              integer,
    derived_from_object   integer,
    object_image_filename varchar(2023),
    face_encodings        double precision[],
    face_id               integer,
    face_id_iteration     integer,
    text_found            varchar            COLLATE "C.UTF-8",
    CONSTRAINT objects_in_image_pkey PRIMARY KEY (objects_id),
    -- NOTE(review): this foreign key points objects_id at itself, so every row
    -- satisfies it trivially; confirm which column was meant to be constrained.
    CONSTRAINT objects_in_images FOREIGN KEY (objects_id)
        REFERENCES public.objects (objects_id)
);
-- One row per (face_from, face_to) pair; distance is nullable, so a pair can
-- be stored before its distance is known.
CREATE TABLE public.face_distances
(
    face_from bigint           NOT NULL,
    face_to   bigint           NOT NULL,
    distance  double precision,
    CONSTRAINT face_distances_pk PRIMARY KEY (face_from, face_to)
);
-- Foreign keys of public.face_distances: both ends of a pair must reference
-- an existing public.objects row. Added in a single ALTER TABLE command.
ALTER TABLE public.face_distances
    ADD CONSTRAINT face_distances_fk
        FOREIGN KEY (face_from) REFERENCES public.objects (objects_id),
    ADD CONSTRAINT face_distances_fk_1
        FOREIGN KEY (face_to) REFERENCES public.objects (objects_id);
索引
-- The two unique indexes below carry the same names as the PRIMARY KEY
-- constraints objects_in_image_pkey and face_distances_pk. PostgreSQL builds
-- an index with the constraint's name when the constraint is created, so a
-- plain CREATE INDEX would fail with "relation ... already exists".
-- IF NOT EXISTS makes each statement a no-op (with a notice) in that case.
CREATE UNIQUE INDEX IF NOT EXISTS objects_in_image_pkey
    ON public.objects USING btree (objects_id);

-- Supports equality filters on description (e.g. description = 'face').
CREATE INDEX IF NOT EXISTS objects_description_column
    ON public.objects USING btree (description);

CREATE UNIQUE INDEX IF NOT EXISTS face_distances_pk
    ON public.face_distances USING btree (face_from, face_to);
以下查询用于插入 face_distances 表中尚不存在的所有人脸对。
-- Insert every ordered pair of 'face' objects (self-pairs and both
-- orderings included) that has no row in face_distances yet.
-- distance is not supplied and therefore stays NULL.
INSERT INTO face_distances (face_from, face_to)
SELECT
    src.objects_id,
    dst.objects_id
FROM objects AS src
CROSS JOIN objects AS dst
WHERE src.description = 'face'
  AND dst.description = 'face'
  -- Anti-join: keep only pairs that are not stored already.
  AND NOT EXISTS (
        SELECT 1
        FROM face_distances AS known
        WHERE known.face_from = src.objects_id
          AND known.face_to = dst.objects_id
      );
试试这个简化的查询。在我的 Apple M1 上用 SQL Server 运行只花了 5 分钟:共有 22 231 个 'face' 对象,生成了 247 097 565 对,即 C(22231,2)。语法与 PostgreSQL 兼容。
优化:使用显式 JOIN 代替旧式的逗号连接;使用排名函数去除重复排列 (A,B)=(B,A);去掉了最后的 [left join face_distances]:向空表重新插入全部结果比逐对检查是否已存在要快得多,因为后者会对每个键对执行一次索引查找。
-- Insert each unordered pair of distinct 'face' objects exactly once
-- (face_from < face_to); distance is not supplied and stays NULL.
--
-- The inequality join returns the same rows as generating every permutation
-- and filtering it with dense_rank()/rank() (rank2 >= rank1): objects_id is
-- a unique key, so that rank comparison holds exactly when
-- f1.objects_id < f2.objects_id. The inequality avoids the window sorts
-- over the full permutation set.
--
-- No existence check is made: a pair that is already stored violates the
-- (face_from, face_to) primary key and aborts the statement, so run this
-- against a face_distances table that does not contain these pairs.
INSERT INTO face_distances (face_from, face_to)
SELECT
    f1.objects_id,
    f2.objects_id
FROM objects AS f1
INNER JOIN objects AS f2
    ON f2.objects_id > f1.objects_id
   AND f2.description = 'face'
WHERE f1.description = 'face';