Snowflake 查询具有过滤条件的分区
Snowflake query a partition with filter condition
我在 Snowflake table 中有一个基于键值的记录,其中对于给定的 product_id 有几十个键值对记录。请参阅以下示例:
-- Sample key/value data set: each row is one (product_id, key, value) attribute.
WITH t1 (product_id, key, value) AS (
    SELECT column1, column2, column3
    FROM VALUES
        (101, 'grade',            'high'),
        (101, 'expense_cost',     'high'),
        (101, 'maintenance_cost', 'medium'),
        (102, 'grade',            'medium'),
        (102, 'expense_cost',     'high'),
        (103, 'expense_cost',     'high'),
        (103, 'maintenance_cost', 'medium')
)
-- Return every sample row unchanged.
SELECT
    product_id,
    key,
    value
FROM t1;
鉴于此数据模型,要求是获取 product_id 与键值过滤条件相匹配的数据。
示例 1:获取所有 product_id,其中 key=(grade) 具有 value=(high or medium) & key=(expense_cost)具有价值=(高)
示例 2:获取所有 product_id where key=(grade) has value=(high) & key=(maintenance_cost) has value= (高或中)
我可以使用 Snowflake PIVOT
函数解决此要求,该函数首先将键值数据结构转换为列数据结构,然后使用 [=14= 应用 Filter
条件] 条款。有没有更好的方法在不使用 PIVOT 的情况下解决这个问题,例如使用一些 Window 功能等?
示例 1 的基于 PIVOT 的解决方案:
-- Requirement 1 via PIVOT: turn the key/value rows into one row per product,
-- then filter on the resulting columns.
WITH t1 (product_id, key, value) AS (
    SELECT column1, column2, column3
    FROM VALUES
        (101, 'grade',            'high'),
        (101, 'expense_cost',     'high'),
        (101, 'maintenance_cost', 'medium'),
        (102, 'grade',            'medium'),
        (102, 'expense_cost',     'high'),
        (103, 'expense_cost',     'high'),
        (103, 'maintenance_cost', 'medium')
),
-- Only the keys that become pivot columns.
pivot_input AS (
    SELECT
        product_id,
        key,
        value
    FROM t1
    WHERE key IN ('grade', 'expense_cost', 'maintenance_cost')
)
SELECT
    p.product_id,
    p.grade,
    p.expense_cost,
    p.maintenance_cost
FROM pivot_input
    -- MIN(value) collapses the rows of each (product_id, key) pair into one cell.
    PIVOT (MIN(value) FOR key IN ('grade', 'expense_cost', 'maintenance_cost'))
    AS p (product_id, grade, expense_cost, maintenance_cost)
WHERE p.grade IN ('high', 'medium')
  AND p.expense_cost = 'high';
注意:在使用 Window 函数的情况下,输出必须仅包含符合条件或通过过滤条件的行。输出不得包含合格分区中的所有记录。
要使用的模式是 QUALIFY
结合 COUNT_IF
。如果以后需要,可以旋转它:
"Requirement 1: Fetch all product_id where grade is (high or medium) & expense_cost = high"
-- Requirement 1 via QUALIFY + COUNT_IF: a product qualifies when its partition
-- contains a grade row with value high/medium AND an expense_cost row with value high.
WITH t1 (product_id, key, value) AS (
    SELECT column1, column2, column3
    FROM VALUES
        (101, 'grade',            'high'),
        (101, 'expense_cost',     'high'),
        (101, 'maintenance_cost', 'medium'),
        (102, 'grade',            'medium'),
        (102, 'expense_cost',     'high'),
        (103, 'expense_cost',     'high'),
        (103, 'maintenance_cost', 'medium')
)
SELECT
    product_id,
    key,
    value
FROM t1
-- Both predicates are evaluated per product_id partition, so every row of a
-- qualifying product is returned (including keys that are not part of the filter).
QUALIFY
    COUNT_IF(key = 'grade' AND value IN ('medium', 'high'))
        OVER (PARTITION BY product_id) >= 1
    AND COUNT_IF(key = 'expense_cost' AND value = 'high')
        OVER (PARTITION BY product_id) >= 1;
输出:
Requirement 2: Fetch all product_id where key=(grade) has value=(high) & key=(maintenance_cost) has value=(high or medium)
-- Requirement 2 via QUALIFY + COUNT_IF: a product qualifies when its partition
-- contains a grade row with value high AND a maintenance_cost row with value
-- high/medium. Only the rows that themselves satisfy one of those conditions
-- are returned.
with t1 (product_id, key, value) as
(
select 101, 'grade', 'high' union all
select 101, 'expense_cost', 'high' union all
select 101, 'maintenance_cost', 'medium' union all
select 102, 'grade', 'medium' union all
select 102, 'expense_cost', 'high' union all
select 103, 'expense_cost', 'high' union all
select 103, 'maintenance_cost', 'medium'
)
select
    product_id,
    key,
    value
from t1
qualify
    -- Partition-level checks: the product must have both matching attributes.
    count_if(key = 'grade' and value in ('high'))
        over (partition by product_id) > 0
    and count_if(key = 'maintenance_cost' and value in ('medium', 'high'))
        over (partition by product_id) > 0
    -- Row-level check: drop the product's other attributes (e.g. expense_cost).
    -- A partition-level "count_if(key = 'expense_cost' ...) = 0" test must not be
    -- used for this: it rejects the whole product whenever such a row exists,
    -- which removes product 101 and leaves the result empty.
    and (
        (key = 'grade' and value in ('high'))
        or (key = 'maintenance_cost' and value in ('medium', 'high'))
    );
输出:
我认为 self-documenting 如果您 sub-select 使用 conditional-aggregation 排位赛 product_ids,然后过滤排位赛 product_ids 加上条件where 子句。下面是要求 #2 的演示,但您可以轻松地针对 #1 对其进行修改。
-- Requirement 2 via conditional aggregation: first find the qualifying
-- products, then return only their rows that match one of the conditions.
WITH t (product_id, keys, value) AS (
    SELECT column1, column2, column3
    FROM VALUES
        (101, 'grade',            'high'),
        (101, 'expense_cost',     'high'),
        (101, 'maintenance_cost', 'medium'),
        (102, 'grade',            'medium'),
        (102, 'expense_cost',     'high'),
        (103, 'expense_cost',     'high'),
        (103, 'maintenance_cost', 'medium')
),
-- Products that have at least one matching row for each of the two conditions.
qualifying_products AS (
    SELECT product_id
    FROM t
    GROUP BY product_id
    HAVING COUNT_IF(keys = 'grade' AND value = 'high') > 0
       AND COUNT_IF(keys = 'maintenance_cost' AND value IN ('high', 'medium')) > 0
)
SELECT
    product_id,
    keys,
    value
FROM t
WHERE product_id IN (SELECT product_id FROM qualifying_products)
  -- Row-level filter: keep only the attributes named in the requirement.
  AND (
      (keys = 'grade' AND value = 'high')
      OR (keys = 'maintenance_cost' AND value IN ('high', 'medium'))
  );
在下面添加我的解决方案,这是对@Lukasz Szozda 解决方案的改进,该解决方案存在以下两个问题:
一个。对于过滤器标准中的每个属性,他的代码需要添加一个
增加雪花计算的额外 Window 函数
线性时间。
b。有许多属性需要从结果集中删除,并且为每个属性添加这么多 Window 函数会使 SQL 查询非常臃肿。
这是我改进后的代码:
-- Requirement 1 with a single window function: flag each row that satisfies one
-- of the key/value conditions, then keep products for which every filtered key
-- has at least one flagged row.
with t1 (product_id, key, value) as
(
select 101, 'grade', 'high' union all
select 101, 'expense_cost', 'high' union all
select 101, 'maintenance_cost', 'medium' union all
select 102, 'grade', 'medium' union all
select 102, 'expense_cost', 'high' union all
select 103, 'expense_cost', 'high' union all
select 103, 'maintenance_cost', 'medium'
)
select
    product_id,
    key,
    value,
    -- TRUE when this row satisfies the condition defined for its key.
    case
        when key = 'grade' and value in ('medium', 'high') then true
        when key = 'expense_cost' and value = 'high' then true
        else false
    end as is_allowed
from t1
-- Only the keys that take part in the filter are considered at all.
where key in ('grade', 'expense_cost')
qualify
    -- Return only rows that pass their own condition.
    is_allowed
    -- 2 = number of keys in the filter. Counting DISTINCT allowed keys instead of
    -- allowed rows keeps the test correct when a product has several rows for the
    -- same key (a plain row count would then be != 2, or reach 2 with one key missing).
    and count(distinct case when is_allowed then key end)
        over (partition by product_id) = 2;
我在 Snowflake table 中有一个基于键值的记录,其中对于给定的 product_id 有几十个键值对记录。请参阅以下示例:
-- Sample key/value data set: each row is one (product_id, key, value) attribute.
WITH t1 (product_id, key, value) AS (
    SELECT column1, column2, column3
    FROM VALUES
        (101, 'grade',            'high'),
        (101, 'expense_cost',     'high'),
        (101, 'maintenance_cost', 'medium'),
        (102, 'grade',            'medium'),
        (102, 'expense_cost',     'high'),
        (103, 'expense_cost',     'high'),
        (103, 'maintenance_cost', 'medium')
)
-- Return every sample row unchanged.
SELECT
    product_id,
    key,
    value
FROM t1;
鉴于此数据模型,要求是获取 product_id 与键值过滤条件相匹配的数据。
示例 1:获取所有 product_id,其中 key=(grade) 具有 value=(high or medium) & key=(expense_cost)具有价值=(高)
示例 2:获取所有 product_id where key=(grade) has value=(high) & key=(maintenance_cost) has value= (高或中)
我可以使用 Snowflake PIVOT
函数解决此要求,该函数首先将键值数据结构转换为列数据结构,然后使用 [=14= 应用 Filter
条件] 条款。有没有更好的方法在不使用 PIVOT 的情况下解决这个问题,例如使用一些 Window 功能等?
示例 1 的基于 PIVOT 的解决方案:
-- Requirement 1 via PIVOT: turn the key/value rows into one row per product,
-- then filter on the resulting columns.
WITH t1 (product_id, key, value) AS (
    SELECT column1, column2, column3
    FROM VALUES
        (101, 'grade',            'high'),
        (101, 'expense_cost',     'high'),
        (101, 'maintenance_cost', 'medium'),
        (102, 'grade',            'medium'),
        (102, 'expense_cost',     'high'),
        (103, 'expense_cost',     'high'),
        (103, 'maintenance_cost', 'medium')
),
-- Only the keys that become pivot columns.
pivot_input AS (
    SELECT
        product_id,
        key,
        value
    FROM t1
    WHERE key IN ('grade', 'expense_cost', 'maintenance_cost')
)
SELECT
    p.product_id,
    p.grade,
    p.expense_cost,
    p.maintenance_cost
FROM pivot_input
    -- MIN(value) collapses the rows of each (product_id, key) pair into one cell.
    PIVOT (MIN(value) FOR key IN ('grade', 'expense_cost', 'maintenance_cost'))
    AS p (product_id, grade, expense_cost, maintenance_cost)
WHERE p.grade IN ('high', 'medium')
  AND p.expense_cost = 'high';
注意:在使用 Window 函数的情况下,输出必须仅包含符合条件或通过过滤条件的行。输出不得包含合格分区中的所有记录。
要使用的模式是 QUALIFY
结合 COUNT_IF
。如果以后需要,可以旋转它:
"Requirement 1: Fetch all product_id where grade is (high or medium) & expense_cost = high"
-- Requirement 1 via QUALIFY + COUNT_IF: a product qualifies when its partition
-- contains a grade row with value high/medium AND an expense_cost row with value high.
WITH t1 (product_id, key, value) AS (
    SELECT column1, column2, column3
    FROM VALUES
        (101, 'grade',            'high'),
        (101, 'expense_cost',     'high'),
        (101, 'maintenance_cost', 'medium'),
        (102, 'grade',            'medium'),
        (102, 'expense_cost',     'high'),
        (103, 'expense_cost',     'high'),
        (103, 'maintenance_cost', 'medium')
)
SELECT
    product_id,
    key,
    value
FROM t1
-- Both predicates are evaluated per product_id partition, so every row of a
-- qualifying product is returned (including keys that are not part of the filter).
QUALIFY
    COUNT_IF(key = 'grade' AND value IN ('medium', 'high'))
        OVER (PARTITION BY product_id) >= 1
    AND COUNT_IF(key = 'expense_cost' AND value = 'high')
        OVER (PARTITION BY product_id) >= 1;
输出:
Requirement 2: Fetch all product_id where key=(grade) has value=(high) & key=(maintenance_cost) has value=(high or medium)
-- Requirement 2 via QUALIFY + COUNT_IF: a product qualifies when its partition
-- contains a grade row with value high AND a maintenance_cost row with value
-- high/medium. Only the rows that themselves satisfy one of those conditions
-- are returned.
with t1 (product_id, key, value) as
(
select 101, 'grade', 'high' union all
select 101, 'expense_cost', 'high' union all
select 101, 'maintenance_cost', 'medium' union all
select 102, 'grade', 'medium' union all
select 102, 'expense_cost', 'high' union all
select 103, 'expense_cost', 'high' union all
select 103, 'maintenance_cost', 'medium'
)
select
    product_id,
    key,
    value
from t1
qualify
    -- Partition-level checks: the product must have both matching attributes.
    count_if(key = 'grade' and value in ('high'))
        over (partition by product_id) > 0
    and count_if(key = 'maintenance_cost' and value in ('medium', 'high'))
        over (partition by product_id) > 0
    -- Row-level check: drop the product's other attributes (e.g. expense_cost).
    -- A partition-level "count_if(key = 'expense_cost' ...) = 0" test must not be
    -- used for this: it rejects the whole product whenever such a row exists,
    -- which removes product 101 and leaves the result empty.
    and (
        (key = 'grade' and value in ('high'))
        or (key = 'maintenance_cost' and value in ('medium', 'high'))
    );
输出:
我认为 self-documenting 如果您 sub-select 使用 conditional-aggregation 排位赛 product_ids,然后过滤排位赛 product_ids 加上条件where 子句。下面是要求 #2 的演示,但您可以轻松地针对 #1 对其进行修改。
-- Requirement 2 via conditional aggregation: first find the qualifying
-- products, then return only their rows that match one of the conditions.
WITH t (product_id, keys, value) AS (
    SELECT column1, column2, column3
    FROM VALUES
        (101, 'grade',            'high'),
        (101, 'expense_cost',     'high'),
        (101, 'maintenance_cost', 'medium'),
        (102, 'grade',            'medium'),
        (102, 'expense_cost',     'high'),
        (103, 'expense_cost',     'high'),
        (103, 'maintenance_cost', 'medium')
),
-- Products that have at least one matching row for each of the two conditions.
qualifying_products AS (
    SELECT product_id
    FROM t
    GROUP BY product_id
    HAVING COUNT_IF(keys = 'grade' AND value = 'high') > 0
       AND COUNT_IF(keys = 'maintenance_cost' AND value IN ('high', 'medium')) > 0
)
SELECT
    product_id,
    keys,
    value
FROM t
WHERE product_id IN (SELECT product_id FROM qualifying_products)
  -- Row-level filter: keep only the attributes named in the requirement.
  AND (
      (keys = 'grade' AND value = 'high')
      OR (keys = 'maintenance_cost' AND value IN ('high', 'medium'))
  );
在下面添加我的解决方案,这是对@Lukasz Szozda 解决方案的改进,该解决方案存在以下两个问题:
一个。对于过滤器标准中的每个属性,他的代码需要添加一个 增加雪花计算的额外 Window 函数 线性时间。
b。有许多属性需要从结果集中删除,并且为每个属性添加这么多 Window 函数会使 SQL 查询非常臃肿。
这是我改进后的代码:
-- Requirement 1 with a single window function: flag each row that satisfies one
-- of the key/value conditions, then keep products for which every filtered key
-- has at least one flagged row.
with t1 (product_id, key, value) as
(
select 101, 'grade', 'high' union all
select 101, 'expense_cost', 'high' union all
select 101, 'maintenance_cost', 'medium' union all
select 102, 'grade', 'medium' union all
select 102, 'expense_cost', 'high' union all
select 103, 'expense_cost', 'high' union all
select 103, 'maintenance_cost', 'medium'
)
select
    product_id,
    key,
    value,
    -- TRUE when this row satisfies the condition defined for its key.
    case
        when key = 'grade' and value in ('medium', 'high') then true
        when key = 'expense_cost' and value = 'high' then true
        else false
    end as is_allowed
from t1
-- Only the keys that take part in the filter are considered at all.
where key in ('grade', 'expense_cost')
qualify
    -- Return only rows that pass their own condition.
    is_allowed
    -- 2 = number of keys in the filter. Counting DISTINCT allowed keys instead of
    -- allowed rows keeps the test correct when a product has several rows for the
    -- same key (a plain row count would then be != 2, or reach 2 with one key missing).
    and count(distinct case when is_allowed then key end)
        over (partition by product_id) = 2;