复杂 SQL 用于在大型 table 上进行更新和删除
Complicated SQL for update and delete on a large table
我在 PostgreSQL 中遇到了一个复杂的 SQL 问题。
假设我有一个名为“selling_prices”的大型 table。它包含大约 1900 万行。我想删除一些重复的行并更新一些数据。这是table结构:
seq | customer_co_cd | item_sku | seliing_tanka_rate | updatedate |
---|---|---|---|---|
1 | 1414343 | sku001 | 0.4 | 2021-01-18 14:34:48 |
2 | 1414343 | sku001 | 0.4 | 2021-01-18 14:34:48 |
3 | 1414343 | sku001 | 0.4 | 2021-01-16 01:34:48 |
4 | 1512333 | sku002 | 0.2 | 2021-01-16 01:34:48 |
5 | 1512333 | sku002 | 0.5 | 2021-01-16 01:34:48 |
等等....
条件 1:如果 customer_co_cd、item_sku 和 selling_tanka_rate 都相同,则将最新一条记录的更新日期改为“2021/11/12”,并删除其余重复的行。
执行 SQL 之后,table 应该是这样的(删除 seq 为 2、3 的行,并更新 seq 为 1 的行):
seq | customer_co_cd | item_sku | seliing_tanka_rate | updatedate |
---|---|---|---|---|
1 | 1414343 | sku001 | 0.4 | 2021-11-12 00:00:00 |
条件 2:如果 customer_co_cd 和 item_sku 相同而 selling_tanka_rate 不同,则将这些数据按组取出:
customer_co_cd | item_sku | count |
---|---|---|
1512333 | sku002 | 2 |
我尝试了一些使用 group by 的查询,但速度很慢...
-- Count the rows of every (customer_co_cd, item_sku) pair and keep only the
-- pairs that occur more than once.
SELECT
    customer_co_cd,
    item_sku,
    COUNT(*)
FROM selling_prices
GROUP BY
    customer_co_cd,
    item_sku
HAVING COUNT(*) > 1
我不知道如何编写满足条件 1 的查询,也不知道获取条件 2 结果的高效方法是什么。请记住,这张表大约有 1900 万行数据。
我应该写一个脚本,还是有可以直接使用的高效查询?
这应该可以满足您的需求 (result here)
-- Inspect the table before de-duplicating.
select * from t1 order by seq;

-- Update first.
-- For every (customer_co_cd, item_sku, seliing_tanka_rate) group that holds
-- more than one row, stamp the row with the lowest seq (the row the delete
-- below keeps) with the new date.
-- A group is recognised as duplicated by counting its own rows. Comparing a
-- row with the next row by seq (lead) would miss groups whose duplicates are
-- separated by a row carrying a different rate.
-- NOTE(review): the join on seq assumes seq is unique in t1 -- confirm.
with t2 as (
    select
        seq,
        row_number() over (
            partition by customer_co_cd, item_sku, seliing_tanka_rate
            order by seq
        ) as rn,
        count(*) over (
            partition by customer_co_cd, item_sku, seliing_tanka_rate
        ) as group_size
    from t1
)
update t1
set updatedate = '20211112'
from t2
where t2.seq = t1.seq
  and t2.rn = 1
  and t2.group_size > 1;

-- Delete to keep the wanted records: within each
-- (customer_co_cd, item_sku, seliing_tanka_rate) group every row except the
-- one with the lowest seq is removed.
with t2 as (
    select
        seq,
        row_number() over (
            partition by customer_co_cd, item_sku, seliing_tanka_rate
            order by seq
        ) as rn
    from t1
)
delete from t1
using t2
where t1.seq = t2.seq
  and t2.rn > 1;

-- Inspect the result.
select * from t1 order by seq;
-- Condition 2
-- List every (customer_co_cd, item_sku) pair that carries more than one
-- distinct seliing_tanka_rate, together with the number of distinct rates.
-- count(distinct ...) does not depend on row order and gives the same answer
-- whether or not duplicate rows have already been removed.
select
    customer_co_cd,
    item_sku,
    count(distinct seliing_tanka_rate) as count
from t1
group by customer_co_cd, item_sku
having count(distinct seliing_tanka_rate) > 1;
我想 Philippe 已经回答了你的问题,但我会添加一些略有不同的方法。
-- Demo table with the sample rows from the question.
create temporary table orders (
    seq serial primary key,
    customer_co_cd int,
    item_sku varchar,
    selling_tanka_rate float,
    updated_at date
);

insert into orders (seq, customer_co_cd, item_sku, selling_tanka_rate, updated_at) values
    (1 , 1414343, 'sku001', 0.4, '2021-01-18'),
    (2 , 1414343, 'sku001', 0.4, '2021-01-18'),
    (3 , 1414343, 'sku001', 0.4, '2021-01-16'),
    (4 , 1512333, 'sku002', 0.2, '2021-01-16'),
    (5 , 1512333, 'sku002', 0.5, '2021-01-16')
;

-- Rank the rows of each (customer_co_cd, item_sku, selling_tanka_rate) group,
-- most recent updated_at first and lowest seq first on ties, then stamp the
-- top-ranked row of every group (groups of one row included) with the new date.
with ranked_orders as (
    select
        orders.*,
        row_number() over (
            partition by customer_co_cd, item_sku, selling_tanka_rate
            order by updated_at DESC, seq
        ) as recent_updated_at
    from orders
)
update orders
set updated_at = '2021-11-12'
from ranked_orders
where
    orders.seq = ranked_orders.seq AND
    ranked_orders.recent_updated_at = 1
;

select * from orders order by seq ASC;
/*
seq | customer_co_cd | item_sku | selling_tanka_rate | updated_at
-----+----------------+----------+--------------------+------------
1 | 1414343 | sku001 | 0.4 | 2021-11-12
2 | 1414343 | sku001 | 0.4 | 2021-01-18
3 | 1414343 | sku001 | 0.4 | 2021-01-16
4 | 1512333 | sku002 | 0.2 | 2021-11-12
5 | 1512333 | sku002 | 0.5 | 2021-11-12
*/

-- Remove every row that was not stamped by the update above.
-- NOTE(review): this keeps any row that already carried 2021-11-12 before the
-- update and any row whose updated_at is NULL -- confirm neither occurs in
-- the real data before running this against it.
delete from orders
where orders.updated_at <> '2021-11-12';

select * from orders order by seq ASC;
/*
seq | customer_co_cd | item_sku | selling_tanka_rate | updated_at
-----+----------------+----------+--------------------+------------
1 | 1414343 | sku001 | 0.4 | 2021-11-12
4 | 1512333 | sku002 | 0.2 | 2021-11-12
5 | 1512333 | sku002 | 0.5 | 2021-11-12
*/

-- Condition 2: number of distinct rates per (customer_co_cd, item_sku) pair.
select t.*
from (
    select
        customer_co_cd,
        item_sku,
        count(distinct selling_tanka_rate) as count
    from orders
    group by customer_co_cd, item_sku
) as t
where t.count > 1; -- you may want to remove this. Not sure of your exact requirements.
/*
customer_co_cd | item_sku | count
----------------+----------+-------
1512333 | sku002 | 2
*/
我在 PostgreSQL 中遇到了一个复杂的 SQL 问题。
假设我有一个名为“selling_prices”的大型 table。它包含大约 1900 万行。我想删除一些重复的行并更新一些数据。这是table结构:
seq | customer_co_cd | item_sku | seliing_tanka_rate | updatedate |
---|---|---|---|---|
1 | 1414343 | sku001 | 0.4 | 2021-01-18 14:34:48 |
2 | 1414343 | sku001 | 0.4 | 2021-01-18 14:34:48 |
3 | 1414343 | sku001 | 0.4 | 2021-01-16 01:34:48 |
4 | 1512333 | sku002 | 0.2 | 2021-01-16 01:34:48 |
5 | 1512333 | sku002 | 0.5 | 2021-01-16 01:34:48 |
等等....
条件 1:如果 customer_co_cd、item_sku 和 selling_tanka_rate 都相同,则将最新一条记录的更新日期改为“2021/11/12”,并删除其余重复的行。
执行 SQL 之后,table 应该是这样的(删除 seq 为 2、3 的行,并更新 seq 为 1 的行):
seq | customer_co_cd | item_sku | seliing_tanka_rate | updatedate |
---|---|---|---|---|
1 | 1414343 | sku001 | 0.4 | 2021-11-12 00:00:00 |
条件 2:如果 customer_co_cd 和 item_sku 相同而 selling_tanka_rate 不同,则将这些数据按组取出:
customer_co_cd | item_sku | count |
---|---|---|
1512333 | sku002 | 2 |
我尝试了一些使用 group by 的查询,但速度很慢...
-- Count the rows of every (customer_co_cd, item_sku) pair and keep only the
-- pairs that occur more than once.
SELECT
    customer_co_cd,
    item_sku,
    COUNT(*)
FROM selling_prices
GROUP BY
    customer_co_cd,
    item_sku
HAVING COUNT(*) > 1
我不知道如何编写满足条件 1 的查询,也不知道获取条件 2 结果的高效方法是什么。请记住,这张表大约有 1900 万行数据。
我应该写一个脚本,还是有可以直接使用的高效查询?
这应该可以满足您的需求 (result here)
-- Inspect the table before de-duplicating.
select * from t1 order by seq;

-- Update first.
-- For every (customer_co_cd, item_sku, seliing_tanka_rate) group that holds
-- more than one row, stamp the row with the lowest seq (the row the delete
-- below keeps) with the new date.
-- A group is recognised as duplicated by counting its own rows. Comparing a
-- row with the next row by seq (lead) would miss groups whose duplicates are
-- separated by a row carrying a different rate.
-- NOTE(review): the join on seq assumes seq is unique in t1 -- confirm.
with t2 as (
    select
        seq,
        row_number() over (
            partition by customer_co_cd, item_sku, seliing_tanka_rate
            order by seq
        ) as rn,
        count(*) over (
            partition by customer_co_cd, item_sku, seliing_tanka_rate
        ) as group_size
    from t1
)
update t1
set updatedate = '20211112'
from t2
where t2.seq = t1.seq
  and t2.rn = 1
  and t2.group_size > 1;

-- Delete to keep the wanted records: within each
-- (customer_co_cd, item_sku, seliing_tanka_rate) group every row except the
-- one with the lowest seq is removed.
with t2 as (
    select
        seq,
        row_number() over (
            partition by customer_co_cd, item_sku, seliing_tanka_rate
            order by seq
        ) as rn
    from t1
)
delete from t1
using t2
where t1.seq = t2.seq
  and t2.rn > 1;

-- Inspect the result.
select * from t1 order by seq;
-- Condition 2
-- List every (customer_co_cd, item_sku) pair that carries more than one
-- distinct seliing_tanka_rate, together with the number of distinct rates.
-- count(distinct ...) does not depend on row order and gives the same answer
-- whether or not duplicate rows have already been removed.
select
    customer_co_cd,
    item_sku,
    count(distinct seliing_tanka_rate) as count
from t1
group by customer_co_cd, item_sku
having count(distinct seliing_tanka_rate) > 1;
我想 Philippe 已经回答了你的问题,但我会添加一些略有不同的方法。
-- Demo table with the sample rows from the question.
create temporary table orders (
    seq serial primary key,
    customer_co_cd int,
    item_sku varchar,
    selling_tanka_rate float,
    updated_at date
);

insert into orders (seq, customer_co_cd, item_sku, selling_tanka_rate, updated_at) values
    (1 , 1414343, 'sku001', 0.4, '2021-01-18'),
    (2 , 1414343, 'sku001', 0.4, '2021-01-18'),
    (3 , 1414343, 'sku001', 0.4, '2021-01-16'),
    (4 , 1512333, 'sku002', 0.2, '2021-01-16'),
    (5 , 1512333, 'sku002', 0.5, '2021-01-16')
;

-- Rank the rows of each (customer_co_cd, item_sku, selling_tanka_rate) group,
-- most recent updated_at first and lowest seq first on ties, then stamp the
-- top-ranked row of every group (groups of one row included) with the new date.
with ranked_orders as (
    select
        orders.*,
        row_number() over (
            partition by customer_co_cd, item_sku, selling_tanka_rate
            order by updated_at DESC, seq
        ) as recent_updated_at
    from orders
)
update orders
set updated_at = '2021-11-12'
from ranked_orders
where
    orders.seq = ranked_orders.seq AND
    ranked_orders.recent_updated_at = 1
;

select * from orders order by seq ASC;
/*
seq | customer_co_cd | item_sku | selling_tanka_rate | updated_at
-----+----------------+----------+--------------------+------------
1 | 1414343 | sku001 | 0.4 | 2021-11-12
2 | 1414343 | sku001 | 0.4 | 2021-01-18
3 | 1414343 | sku001 | 0.4 | 2021-01-16
4 | 1512333 | sku002 | 0.2 | 2021-11-12
5 | 1512333 | sku002 | 0.5 | 2021-11-12
*/

-- Remove every row that was not stamped by the update above.
-- NOTE(review): this keeps any row that already carried 2021-11-12 before the
-- update and any row whose updated_at is NULL -- confirm neither occurs in
-- the real data before running this against it.
delete from orders
where orders.updated_at <> '2021-11-12';

select * from orders order by seq ASC;
/*
seq | customer_co_cd | item_sku | selling_tanka_rate | updated_at
-----+----------------+----------+--------------------+------------
1 | 1414343 | sku001 | 0.4 | 2021-11-12
4 | 1512333 | sku002 | 0.2 | 2021-11-12
5 | 1512333 | sku002 | 0.5 | 2021-11-12
*/

-- Condition 2: number of distinct rates per (customer_co_cd, item_sku) pair.
select t.*
from (
    select
        customer_co_cd,
        item_sku,
        count(distinct selling_tanka_rate) as count
    from orders
    group by customer_co_cd, item_sku
) as t
where t.count > 1; -- you may want to remove this. Not sure of your exact requirements.
/*
customer_co_cd | item_sku | count
----------------+----------+-------
1512333 | sku002 | 2
*/