SQL(Impala)- 查找包括空值在内的重复值
SQL(Impala)- Finding duplicates values including null values
我有一个名为 baseTable 的表,其中包含许多列,但我只使用其中 3 列:Material_Type、Material_Desc(含有空值)和 Material_Number,并使用 row_number() 和 partition by 来查找重复项。
注意:我需要根据 3 个条件过滤重复项。
- 当Material_Type = Material_Type
- 当Material_Desc = Material_Desc
- 当Material_Number <> Material_Number
样本Table:
Material_Type Material_Desc Material_Number
ABC XYZ 1
ABC XYZ 1
ABC XYZ 2
ABC XYZ 3
DEF IMM 1
LMN NULL 1
LMN NULL 2
我只想在 newTable 中保留重复的值,并去掉不重复(唯一)的值。
期望的输出:
Material_Type Material_Desc Material_Number new
ABC XYZ 1 1
ABC XYZ 1 2
ABC XYZ 2 3
ABC XYZ 3 4
LMN NULL 1 1
LMN NULL 2 2
我使用了下面的查询,但没有得到预期的输出:它没有包含 Material_Desc 列为空值(NULL)的行,没有把 NULL 作为一个分区来处理,还产生了不需要的重复记录。
使用的查询:
-- Builds newTable from baseTable rows that share MATERIAL_TYPE and Material_Desc
-- with a fully duplicated (number, type, description) triple that carries a
-- different MATERIAL_NUMBER, numbering the rows per (type, description).
-- NOTE: `=` on Material_Desc is never true when either side is NULL, so rows
-- with a NULL description drop out of the inner join. A base row is also
-- emitted once for every duplicated triple it matches, so rows can repeat.
create table newTable as
with mycte as (
    select
        src.MATERIAL_NUMBER,
        src.MATERIAL_TYPE,
        src.Material_Desc,
        row_number() over (
            partition by dup.MATERIAL_TYPE, dup.Material_Desc
            order by dup.MATERIAL_NUMBER
        ) as new
    from baseTable src
    inner join (
        -- (number, type, description) triples that occur more than once
        select
            MATERIAL_NUMBER,
            MATERIAL_TYPE,
            Material_Desc,
            count(*)
        from baseTable
        group by
            MATERIAL_NUMBER,
            MATERIAL_TYPE,
            Material_Desc
        having count(*) > 1
    ) dup
        on dup.MATERIAL_NUMBER <> src.MATERIAL_NUMBER
        and dup.MATERIAL_TYPE = src.MATERIAL_TYPE
        and dup.Material_Desc = src.Material_Desc
)
select * from mycte
如有任何帮助,我们将不胜感激。
只需将 row_number() 和 count(*) 用作窗口函数:
-- Keeps only the (Material_Type, Material_Desc) groups that hold more than one
-- row and numbers the rows of each group by Material_Number. partition by puts
-- NULL descriptions into one group, so they are counted together.
-- `t` is the answer's placeholder for the source table (presumably baseTable).
with counted as (
    select
        src.*,
        count(*) over (partition by Material_Type, Material_Desc) as cnt
    from t src
)
select
    Material_Type,
    Material_Desc,
    Material_Number,
    row_number() over (
        partition by Material_Type, Material_Desc
        order by Material_Number
    ) as new
from counted
where cnt > 1;
这适用于您提供的数据,它只是统计每种类型和描述组合的行数。如果您确实要求同一组内的 Material_Number 存在不同的取值,一种方法是使用 min() 和 max():
-- Keeps only the (Material_Type, Material_Desc) groups whose smallest and
-- largest Material_Number differ, i.e. groups with at least two distinct
-- numbers, and numbers the rows of each group by Material_Number.
-- `t` is the answer's placeholder for the source table (presumably baseTable).
with ranged as (
    select
        src.*,
        min(Material_Number) over (partition by Material_Type, Material_Desc) as min_Material_Number,
        max(Material_Number) over (partition by Material_Type, Material_Desc) as max_Material_Number
    from t src
)
select
    Material_Type,
    Material_Desc,
    Material_Number,
    row_number() over (
        partition by Material_Type, Material_Desc
        order by Material_Number
    ) as new
from ranged
where min_Material_Number <> max_Material_Number;
我有一个名为 baseTable 的表,其中包含许多列,但我只使用其中 3 列:Material_Type、Material_Desc(含有空值)和 Material_Number,并使用 row_number() 和 partition by 来查找重复项。注意:我需要根据 3 个条件过滤重复项。
- 当Material_Type = Material_Type
- 当Material_Desc = Material_Desc
- 当Material_Number <> Material_Number
样本Table:
Material_Type Material_Desc Material_Number
ABC XYZ 1
ABC XYZ 1
ABC XYZ 2
ABC XYZ 3
DEF IMM 1
LMN NULL 1
LMN NULL 2
我只想在 newTable 中保留重复的值,并去掉不重复(唯一)的值。
期望的输出:
Material_Type Material_Desc Material_Number new
ABC XYZ 1 1
ABC XYZ 1 2
ABC XYZ 2 3
ABC XYZ 3 4
LMN NULL 1 1
LMN NULL 2 2
我使用了下面的查询,但没有得到预期的输出:它没有包含 Material_Desc 列为空值(NULL)的行,没有把 NULL 作为一个分区来处理,还产生了不需要的重复记录。
使用的查询:
-- Builds newTable from baseTable rows that share MATERIAL_TYPE and Material_Desc
-- with a fully duplicated (number, type, description) triple that carries a
-- different MATERIAL_NUMBER, numbering the rows per (type, description).
-- NOTE: `=` on Material_Desc is never true when either side is NULL, so rows
-- with a NULL description drop out of the inner join. A base row is also
-- emitted once for every duplicated triple it matches, so rows can repeat.
create table newTable as
with mycte as (
    select
        src.MATERIAL_NUMBER,
        src.MATERIAL_TYPE,
        src.Material_Desc,
        row_number() over (
            partition by dup.MATERIAL_TYPE, dup.Material_Desc
            order by dup.MATERIAL_NUMBER
        ) as new
    from baseTable src
    inner join (
        -- (number, type, description) triples that occur more than once
        select
            MATERIAL_NUMBER,
            MATERIAL_TYPE,
            Material_Desc,
            count(*)
        from baseTable
        group by
            MATERIAL_NUMBER,
            MATERIAL_TYPE,
            Material_Desc
        having count(*) > 1
    ) dup
        on dup.MATERIAL_NUMBER <> src.MATERIAL_NUMBER
        and dup.MATERIAL_TYPE = src.MATERIAL_TYPE
        and dup.Material_Desc = src.Material_Desc
)
select * from mycte
如有任何帮助,我们将不胜感激。
只需将 row_number() 和 count(*) 用作窗口函数:
-- Keeps only the (Material_Type, Material_Desc) groups that hold more than one
-- row and numbers the rows of each group by Material_Number. partition by puts
-- NULL descriptions into one group, so they are counted together.
-- `t` is the answer's placeholder for the source table (presumably baseTable).
with counted as (
    select
        src.*,
        count(*) over (partition by Material_Type, Material_Desc) as cnt
    from t src
)
select
    Material_Type,
    Material_Desc,
    Material_Number,
    row_number() over (
        partition by Material_Type, Material_Desc
        order by Material_Number
    ) as new
from counted
where cnt > 1;
这适用于您提供的数据,它只是统计每种类型和描述组合的行数。如果您确实要求同一组内的 Material_Number 存在不同的取值,一种方法是使用 min() 和 max():
-- Keeps only the (Material_Type, Material_Desc) groups whose smallest and
-- largest Material_Number differ, i.e. groups with at least two distinct
-- numbers, and numbers the rows of each group by Material_Number.
-- `t` is the answer's placeholder for the source table (presumably baseTable).
with ranged as (
    select
        src.*,
        min(Material_Number) over (partition by Material_Type, Material_Desc) as min_Material_Number,
        max(Material_Number) over (partition by Material_Type, Material_Desc) as max_Material_Number
    from t src
)
select
    Material_Type,
    Material_Desc,
    Material_Number,
    row_number() over (
        partition by Material_Type, Material_Desc
        order by Material_Number
    ) as new
from ranged
where min_Material_Number <> max_Material_Number;