如何在 PostgreSQL 中查找重复记录
How to find duplicate records in PostgreSQL
我有一个名为 "user_links" 的 PostgreSQL 数据库 table,它目前允许以下重复字段:
year, user_id, sid, cid
当前唯一约束只作用于第一个名为 "id" 的字段,但是我现在希望添加一个约束,以确保 year、user_id、sid 和 cid 的组合是唯一的。然而我无法应用该约束,因为已经存在违反此约束的重复值。
有没有办法找到所有重复项?
基本思路是使用带计数聚合的嵌套查询:
-- List every row whose sid value occurs more than once in the table.
SELECT ou.*
FROM yourTable AS ou
WHERE 1 < (
    -- number of rows sharing this row's sid (the row itself included)
    SELECT count(*)
    FROM yourTable AS inr
    WHERE ou.sid = inr.sid
)
您可以调整内部查询中的 where 子句以缩小搜索范围。
评论中提到的还有另一个很好的解决方案,(但不是每个人都读过):
-- Report each (Column1, Column2) combination that appears in more than one
-- row, together with the number of rows sharing it.
SELECT
    Column1,
    Column2,
    count(*)
FROM yourTable
GROUP BY
    Column1,
    Column2
HAVING 1 < count(*)
或更短:
-- Find rows duplicated in full: group on the whole-row value and keep the
-- groups that hold more than one row.
SELECT
    (yourTable.*)::text,
    count(*)
FROM yourTable
GROUP BY yourTable.*
HAVING 1 < count(*)
您可以在可能重复的字段上将同一个 table 与自身连接(join),同时在 id 字段上做反连接(anti-join,即要求 id 不相等)。从第一个 table 别名 (tn1) 中选择 id 字段,然后对第二个 table 别名的 id 字段使用 array_agg 函数。最后,为了使 array_agg 函数正常工作,您需要按 tn1.id 字段对结果进行分组。这将生成一个结果集,其中包含每条记录的 ID 以及符合连接条件的所有 ID 的数组。
-- For every row that has at least one duplicate on (year, sid, user_id, cid),
-- return its id together with the ids of all other rows sharing those values.
-- Note: "=" never matches NULL, so rows holding NULL in any of the four
-- columns are not reported by this query.
select
    tn1.id,
    array_agg(tn2.id) as duplicate_entries
from table_name tn1
join table_name tn2
    on tn1.year = tn2.year
    and tn1.sid = tn2.sid
    and tn1.user_id = tn2.user_id
    and tn1.cid = tn2.cid
    and tn1.id <> tn2.id  -- do not pair a row with itself
group by tn1.id;
显然,出现在某个 id 的 duplicate_entries 数组中的 id,在结果集中也会有自己的条目。您必须根据这个结果集来决定把哪个 ID 作为“真实来源”(source of truth),也就是不应被删除的那条记录。也许您可以这样做:
-- Return, for each group of duplicates on (year, sid, user_id, cid), the id
-- that is smaller than all of its duplicates' ids.
with dupe_set as (
    -- each duplicated row's id plus the ids of the other rows in its group
    select
        tn1.id,
        array_agg(tn2.id) as duplicate_entries
    from table_name tn1
    join table_name tn2
        on tn1.year = tn2.year
        and tn1.sid = tn2.sid
        and tn1.user_id = tn2.user_id
        and tn1.cid = tn2.cid
        and tn1.id <> tn2.id  -- do not pair a row with itself
    group by tn1.id
    order by tn1.id asc
)
select ds.id
from dupe_set ds
where not exists (
    -- a duplicate with a smaller id disqualifies this row
    select de
    from unnest(ds.duplicate_entries) as de
    where de < ds.id
)
这会选出每组重复记录中编号最小的 ID(假设 ID 是递增的整数主键)。这些就是您要保留下来的 ID。
来自“Find duplicate rows with PostgreSQL”的智能解决方案:
-- Number the rows inside each (column1, column2) group by ascending id and
-- return every row numbered after the first one.
SELECT *
FROM (
    SELECT
        id,
        ROW_NUMBER() OVER (
            PARTITION BY column1, column2
            ORDER BY id ASC
        ) AS Row
    FROM tbl
) AS dups
WHERE dups.Row > 1
为了方便起见,我假设您希望仅对列 year 应用唯一约束,并且主键是一个名为 id 的列。
为了找到重复值,您应该运行:
-- List each year that has more than one row with a non-NULL id, with that
-- count, smallest counts first.
SELECT
    year,
    COUNT(id)
FROM YOUR_TABLE
GROUP BY year
HAVING 1 < COUNT(id)
ORDER BY COUNT(id);
使用上面的 sql 语句,您会得到一个 table,其中包含 table 中所有重复的年份。为了删除除最新的重复项之外的所有重复项,您应该使用下面的 sql 语句。
-- Delete every row of A for which B holds a row with the same year and a
-- larger id.
-- NOTE(review): YOUR_TABLE_AGAIN presumably stands for the same table as
-- YOUR_TABLE (a self-join); confirm before running.
DELETE FROM YOUR_TABLE AS A
USING YOUR_TABLE_AGAIN AS B
WHERE A.year = B.year
  AND A.id < B.id;
在您的情况下,由于限制,您需要删除重复的记录。
- 找到重复的行
- 按 created_at 日期对它们排序——在这种情况下,我保留最旧的
- 使用 USING 删除记录,以过滤出正确的行
-- Remove duplicated product rows, keeping the oldest row of each id.
WITH duplicated AS (
    -- ids that occur in more than one row
    SELECT id,
           count(*)
    FROM products
    GROUP BY id
    HAVING count(*) > 1
),
ordered AS (
    -- rank the rows of each duplicated id from oldest to newest
    SELECT p.id,
           p.created_at,
           rank() OVER (PARTITION BY p.id ORDER BY p.created_at) AS rnk
    FROM products p
    JOIN duplicated d ON d.id = p.id
),
products_to_delete AS (
    -- every row ranked after the oldest one; rows tied on the oldest
    -- created_at all get rank 1 and are therefore kept
    SELECT id,
           created_at
    FROM ordered
    WHERE rnk > 1
)
DELETE
FROM products
USING products_to_delete
WHERE products.id = products_to_delete.id
  AND products.created_at = products_to_delete.created_at;
受 Sandro Wiggers 的启发,我做了类似的事情:
-- Delete duplicated user_links rows, keeping the first row (by id) of each
-- (year, user_id, sid, cid) group. The window PARTITION BY treats NULLs as
-- equal, so groups containing NULLs are de-duplicated too.
WITH ordered AS (
    -- rank the rows of each group by ascending id
    SELECT id, year, user_id, sid, cid,
           rank() OVER (PARTITION BY year, user_id, sid, cid ORDER BY id) AS rnk
    FROM user_links
),
to_delete AS (
    -- every row ranked after the first one of its group
    SELECT id
    FROM ordered
    WHERE rnk > 1
)
DELETE
FROM user_links
USING to_delete
WHERE user_links.id = to_delete.id;
如果要测试的话,稍微改一下:
-- Dry run: list the user_links rows ranked after the first one (by id) in
-- their (year, user_id, sid, cid) group.
WITH ordered AS (
    -- rank the rows of each group by ascending id
    SELECT
        id,
        year,
        user_id,
        sid,
        cid,
        rank() OVER (
            PARTITION BY year, user_id, sid, cid
            ORDER BY id
        ) AS rnk
    FROM user_links
),
to_delete AS (
    SELECT
        id,
        year,
        user_id,
        sid,
        cid
    FROM ordered
    WHERE rnk > 1
)
SELECT *
FROM to_delete;
这将概述将要删除的内容(运行删除时,在 to_delete 查询中保留 year、user_id、sid、cid 这些列也没有问题,只是并不需要它们)。
-- Fixture: create user_links and load ten rows covering all-NULL duplicates,
-- non-NULL duplicates, unique rows and partially-NULL duplicates.
BEGIN;

CREATE TABLE user_links (
    id      serial,
    year    bigint,
    user_id bigint,
    sid     bigint,
    cid     bigint
);

INSERT INTO user_links (year, user_id, sid, cid)
VALUES
    (NULL, NULL, NULL, NULL),
    (NULL, NULL, NULL, NULL),
    (NULL, NULL, NULL, NULL),
    (1, 2, 3, 4),
    (1, 2, 3, 4),
    (1, 2, 3, 4),
    (1, 1, 3, 8),
    (1, 1, 3, 9),
    (1, NULL, NULL, NULL),
    (1, NULL, NULL, NULL);

COMMIT;
使用 distinct 和 except 集合操作(set operation)。
-- All rows minus one representative row per (year, user_id, sid, cid) group,
-- i.e. the surplus duplicates.
-- The trailing ORDER BY sorts the result of the whole EXCEPT; the DISTINCT ON
-- branch has no ORDER BY of its own, so which row it keeps per group is not
-- specified.
(
    SELECT id, year, user_id, sid, cid
    FROM user_links
    ORDER BY id
)
EXCEPT
SELECT DISTINCT ON (year, user_id, sid, cid)
    id, year, user_id, sid, cid
FROM user_links
ORDER BY id;
except all 也有效,因为 id serial 使所有行都是唯一的。
-- Same as the EXCEPT form, using EXCEPT ALL: all rows minus one
-- representative row per (year, user_id, sid, cid) group.
-- The trailing ORDER BY sorts the result of the whole EXCEPT ALL.
(
    SELECT id, year, user_id, sid, cid
    FROM user_links
    ORDER BY id
)
EXCEPT ALL
SELECT DISTINCT ON (year, user_id, sid, cid)
    id, year, user_id, sid, cid
FROM user_links
ORDER BY id;
到目前为止,这对 null 和非 null 值都有效。
删除:
-- Delete the surplus duplicates from user_links and return the deleted rows.
WITH a AS (
    -- all rows minus one representative row per (year, user_id, sid, cid)
    (
        SELECT id, year, user_id, sid, cid
        FROM user_links
        ORDER BY id
    )
    EXCEPT ALL
    SELECT DISTINCT ON (year, user_id, sid, cid)
        id, year, user_id, sid, cid
    FROM user_links
    ORDER BY id
)
DELETE FROM user_links
USING a
WHERE user_links.id = a.id
RETURNING *;
我有一个名为 "user_links" 的 PostgreSQL 数据库 table,它目前允许以下重复字段:
year, user_id, sid, cid
当前唯一约束只作用于第一个名为 "id" 的字段,但是我现在希望添加一个约束,以确保 year、user_id、sid 和 cid 的组合是唯一的。然而我无法应用该约束,因为已经存在违反此约束的重复值。
有没有办法找到所有重复项?
基本思路是使用带计数聚合的嵌套查询:
-- List every row whose sid value occurs more than once in the table.
SELECT ou.*
FROM yourTable AS ou
WHERE 1 < (
    -- number of rows sharing this row's sid (the row itself included)
    SELECT count(*)
    FROM yourTable AS inr
    WHERE ou.sid = inr.sid
)
您可以调整内部查询中的 where 子句以缩小搜索范围。
评论中提到的还有另一个很好的解决方案,(但不是每个人都读过):
-- Report each (Column1, Column2) combination that appears in more than one
-- row, together with the number of rows sharing it.
SELECT
    Column1,
    Column2,
    count(*)
FROM yourTable
GROUP BY
    Column1,
    Column2
HAVING 1 < count(*)
或更短:
-- Find rows duplicated in full: group on the whole-row value and keep the
-- groups that hold more than one row.
SELECT
    (yourTable.*)::text,
    count(*)
FROM yourTable
GROUP BY yourTable.*
HAVING 1 < count(*)
您可以在可能重复的字段上将同一个 table 与自身连接(join),同时在 id 字段上做反连接(anti-join,即要求 id 不相等)。从第一个 table 别名 (tn1) 中选择 id 字段,然后对第二个 table 别名的 id 字段使用 array_agg 函数。最后,为了使 array_agg 函数正常工作,您需要按 tn1.id 字段对结果进行分组。这将生成一个结果集,其中包含每条记录的 ID 以及符合连接条件的所有 ID 的数组。
-- For every row that has at least one duplicate on (year, sid, user_id, cid),
-- return its id together with the ids of all other rows sharing those values.
-- Note: "=" never matches NULL, so rows holding NULL in any of the four
-- columns are not reported by this query.
select
    tn1.id,
    array_agg(tn2.id) as duplicate_entries
from table_name tn1
join table_name tn2
    on tn1.year = tn2.year
    and tn1.sid = tn2.sid
    and tn1.user_id = tn2.user_id
    and tn1.cid = tn2.cid
    and tn1.id <> tn2.id  -- do not pair a row with itself
group by tn1.id;
显然,出现在某个 id 的 duplicate_entries 数组中的 id,在结果集中也会有自己的条目。您必须根据这个结果集来决定把哪个 ID 作为“真实来源”(source of truth),也就是不应被删除的那条记录。也许您可以这样做:
-- Return, for each group of duplicates on (year, sid, user_id, cid), the id
-- that is smaller than all of its duplicates' ids.
with dupe_set as (
    -- each duplicated row's id plus the ids of the other rows in its group
    select
        tn1.id,
        array_agg(tn2.id) as duplicate_entries
    from table_name tn1
    join table_name tn2
        on tn1.year = tn2.year
        and tn1.sid = tn2.sid
        and tn1.user_id = tn2.user_id
        and tn1.cid = tn2.cid
        and tn1.id <> tn2.id  -- do not pair a row with itself
    group by tn1.id
    order by tn1.id asc
)
select ds.id
from dupe_set ds
where not exists (
    -- a duplicate with a smaller id disqualifies this row
    select de
    from unnest(ds.duplicate_entries) as de
    where de < ds.id
)
这会选出每组重复记录中编号最小的 ID(假设 ID 是递增的整数主键)。这些就是您要保留下来的 ID。
来自“Find duplicate rows with PostgreSQL”的智能解决方案:
-- Number the rows inside each (column1, column2) group by ascending id and
-- return every row numbered after the first one.
SELECT *
FROM (
    SELECT
        id,
        ROW_NUMBER() OVER (
            PARTITION BY column1, column2
            ORDER BY id ASC
        ) AS Row
    FROM tbl
) AS dups
WHERE dups.Row > 1
为了方便起见,我假设您希望仅对列 year 应用唯一约束,并且主键是一个名为 id 的列。
为了找到重复值,您应该运行:
-- List each year that has more than one row with a non-NULL id, with that
-- count, smallest counts first.
SELECT
    year,
    COUNT(id)
FROM YOUR_TABLE
GROUP BY year
HAVING 1 < COUNT(id)
ORDER BY COUNT(id);
使用上面的 sql 语句,您会得到一个 table,其中包含 table 中所有重复的年份。为了删除除最新的重复项之外的所有重复项,您应该使用下面的 sql 语句。
-- Delete every row of A for which B holds a row with the same year and a
-- larger id.
-- NOTE(review): YOUR_TABLE_AGAIN presumably stands for the same table as
-- YOUR_TABLE (a self-join); confirm before running.
DELETE FROM YOUR_TABLE AS A
USING YOUR_TABLE_AGAIN AS B
WHERE A.year = B.year
  AND A.id < B.id;
在您的情况下,由于限制,您需要删除重复的记录。
- 找到重复的行
- 按 created_at 日期对它们排序——在这种情况下,我保留最旧的
- 使用 USING 删除记录,以过滤出正确的行
-- Remove duplicated product rows, keeping the oldest row of each id.
WITH duplicated AS (
    -- ids that occur in more than one row
    SELECT id,
           count(*)
    FROM products
    GROUP BY id
    HAVING count(*) > 1
),
ordered AS (
    -- rank the rows of each duplicated id from oldest to newest
    SELECT p.id,
           p.created_at,
           rank() OVER (PARTITION BY p.id ORDER BY p.created_at) AS rnk
    FROM products p
    JOIN duplicated d ON d.id = p.id
),
products_to_delete AS (
    -- every row ranked after the oldest one; rows tied on the oldest
    -- created_at all get rank 1 and are therefore kept
    SELECT id,
           created_at
    FROM ordered
    WHERE rnk > 1
)
DELETE
FROM products
USING products_to_delete
WHERE products.id = products_to_delete.id
  AND products.created_at = products_to_delete.created_at;
受 Sandro Wiggers 的启发,我做了一些类似于
的事情
-- Delete duplicated user_links rows, keeping the first row (by id) of each
-- (year, user_id, sid, cid) group. The window PARTITION BY treats NULLs as
-- equal, so groups containing NULLs are de-duplicated too.
WITH ordered AS (
    -- rank the rows of each group by ascending id
    SELECT id, year, user_id, sid, cid,
           rank() OVER (PARTITION BY year, user_id, sid, cid ORDER BY id) AS rnk
    FROM user_links
),
to_delete AS (
    -- every row ranked after the first one of its group
    SELECT id
    FROM ordered
    WHERE rnk > 1
)
DELETE
FROM user_links
USING to_delete
WHERE user_links.id = to_delete.id;
如果要测试的话,稍微改一下:
-- Dry run: list the user_links rows ranked after the first one (by id) in
-- their (year, user_id, sid, cid) group.
WITH ordered AS (
    -- rank the rows of each group by ascending id
    SELECT
        id,
        year,
        user_id,
        sid,
        cid,
        rank() OVER (
            PARTITION BY year, user_id, sid, cid
            ORDER BY id
        ) AS rnk
    FROM user_links
),
to_delete AS (
    SELECT
        id,
        year,
        user_id,
        sid,
        cid
    FROM ordered
    WHERE rnk > 1
)
SELECT *
FROM to_delete;
这将概述将要删除的内容(运行删除时,在 to_delete 查询中保留 year、user_id、sid、cid 这些列也没有问题,只是并不需要它们)。
-- Fixture: create user_links and load ten rows covering all-NULL duplicates,
-- non-NULL duplicates, unique rows and partially-NULL duplicates.
BEGIN;

CREATE TABLE user_links (
    id      serial,
    year    bigint,
    user_id bigint,
    sid     bigint,
    cid     bigint
);

INSERT INTO user_links (year, user_id, sid, cid)
VALUES
    (NULL, NULL, NULL, NULL),
    (NULL, NULL, NULL, NULL),
    (NULL, NULL, NULL, NULL),
    (1, 2, 3, 4),
    (1, 2, 3, 4),
    (1, 2, 3, 4),
    (1, 1, 3, 8),
    (1, 1, 3, 9),
    (1, NULL, NULL, NULL),
    (1, NULL, NULL, NULL);

COMMIT;
使用 distinct 和 except 集合操作(set operation)。
-- All rows minus one representative row per (year, user_id, sid, cid) group,
-- i.e. the surplus duplicates.
-- The trailing ORDER BY sorts the result of the whole EXCEPT; the DISTINCT ON
-- branch has no ORDER BY of its own, so which row it keeps per group is not
-- specified.
(
    SELECT id, year, user_id, sid, cid
    FROM user_links
    ORDER BY id
)
EXCEPT
SELECT DISTINCT ON (year, user_id, sid, cid)
    id, year, user_id, sid, cid
FROM user_links
ORDER BY id;
except all 也有效,因为 id serial 使所有行都是唯一的。
-- Same as the EXCEPT form, using EXCEPT ALL: all rows minus one
-- representative row per (year, user_id, sid, cid) group.
-- The trailing ORDER BY sorts the result of the whole EXCEPT ALL.
(
    SELECT id, year, user_id, sid, cid
    FROM user_links
    ORDER BY id
)
EXCEPT ALL
SELECT DISTINCT ON (year, user_id, sid, cid)
    id, year, user_id, sid, cid
FROM user_links
ORDER BY id;
到目前为止,这对 null 和非 null 值都有效。
删除:
-- Delete the surplus duplicates from user_links and return the deleted rows.
WITH a AS (
    -- all rows minus one representative row per (year, user_id, sid, cid)
    (
        SELECT id, year, user_id, sid, cid
        FROM user_links
        ORDER BY id
    )
    EXCEPT ALL
    SELECT DISTINCT ON (year, user_id, sid, cid)
        id, year, user_id, sid, cid
    FROM user_links
    ORDER BY id
)
DELETE FROM user_links
USING a
WHERE user_links.id = a.id
RETURNING *;