在 BigQuery 中按数组循环和合并循环结果
Loop by array and union looped result in BigQuery
我编写了一个脚本来执行以下操作
- 创建数组 day_event,其中包含满足条件 1、需要用清理方法 1 处理的事件的 eventId
- 为满足条件 2、需要用清理方法 2 处理的事件创建数组 night_event
- 为满足条件 3、需要用清理方法 3 处理的事件创建数组 cross_day_event
- 遍历 day_event,用方法 1 清理其中的每个事件
- 遍历 night_event,用方法 2 清理其中的每个事件
- 遍历 cross_day_event,用方法 3 清理其中的每个事件
-- 注意:清理后,一行会拆分成多行
- 查询原始数据,但去掉已被清理的原始条目,并合并所有清理后的数据
我相信脚本本来可以更简洁,但我不知道该怎么做。有什么建议么?谢谢!
-- Script variables: one eventId array per cleaning method, plus one counter per array.
declare day_event array<int64>;
declare night_event array<int64>;
declare cross_day_event array<int64>;
-- Counters start at 0; the loops later in this script increment them before
-- using them as 1-based positions via ordinal().
declare i int64 default 0;
declare j int64 default 0;
declare k int64 default 0;
-- query three arrays based on different conditions.
-- each condition needs a slightly different cleaning method.
-- Every SET below runs its own subquery over `event_raw_data`, shifting the start/end
-- timestamps back by 6 hours before the filter is applied.
-- condition_1 / condition_2 / condition_3 are placeholders for the real predicates
-- (presumably written against startTime_adj / endTime_adj — confirm).
set day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_1
);
set night_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_2
);
set cross_day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_3
);
-- array 1 and condition 1
-- Seed day_event_clean with a single all-NULL row so that the
-- "union all select * from day_event_clean" inside the loop already has a table with
-- the right column types on the first iteration. The final query of this script
-- drops the seed row with "eventId is not null".
create or replace temp table day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
-- One iteration per element of day_event: i is incremented first, then used as a 1-based ordinal.
loop
set i = i+1;
if i > array_length(day_event) then leave;
end if;
-- Replace the table with the cleaned rows of the current event plus everything accumulated so far.
create or replace temp table day_event_clean as
-- here is the cleaning I need to do
-- The event (timestamps shifted back 6 hours) is cross-joined with 12-hour boundaries dt
-- running from 00:00 of its start date to 12:00 of its end date; each output row is the
-- window [dt, dt + 12h] clipped to the event through greatest()/least().
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = day_event[ordinal(i)]) t cross join
unnest(generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from day_event_clean;
-- end of cleaning
end loop;
-- array 2 and condition 2
-- Seed night_event_clean with a single all-NULL row so that the
-- "union all select * from night_event_clean" inside the loop already has a table with
-- the right column types on the first iteration. The final query of this script
-- drops the seed row with "eventId is not null".
create or replace temp table night_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
-- One iteration per element of night_event: j is incremented first, then used as a 1-based ordinal.
loop
set j = j+1;
if j > array_length(night_event) then leave;
end if;
-- Replace the table with the cleaned rows of the current event plus everything accumulated so far.
create or replace temp table night_event_clean as
-- The event (timestamps shifted back 6 hours) is cross-joined with 12-hour boundaries dt
-- running from 12:00 of its start date to 12:00 of its end date; each output row is the
-- window [dt, dt + 12h] clipped to the event through greatest()/least().
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = night_event[ordinal(j)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from night_event_clean;
end loop;
-- array 3 and condition 3
-- Seed cross_day_event_clean with a single all-NULL row so that the
-- "union all select * from cross_day_event_clean" inside the loop already has a table with
-- the right column types on the first iteration. The final query of this script
-- drops the seed row with "eventId is not null".
create or replace temp table cross_day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
-- One iteration per element of cross_day_event: k is incremented first, then used as a 1-based ordinal.
loop
set k = k+1;
if k > array_length(cross_day_event) then leave;
end if;
-- Replace the table with the cleaned rows of the current event plus everything accumulated so far.
create or replace temp table cross_day_event_clean as
-- The event (timestamps shifted back 6 hours) is cross-joined with 12-hour boundaries dt
-- running from 12:00 of its start date to 00:00 of its end date; each output row is the
-- window [dt, dt + 12h] clipped to the event through greatest()/least().
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = cross_day_event[ordinal(k)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp(date(endTime_adj)), interval 12 hour)) dt
union all select * from cross_day_event_clean;
end loop;
-- Final result: events that needed no cleaning, plus the cleaned (split) rows of the
-- three temp tables, ordered by adjusted start time.
-- The eventId arrays built at the top of the script already hold exactly the events that
-- matched condition_1 / condition_2 / condition_3, so they are reused here to exclude the
-- original rows of cleaned events (an id cannot be compared against a boolean expression).
select
  eventId,
  timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj,
  timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`
where eventId not in unnest(array_concat(day_event, night_event, cross_day_event))
union all
-- "eventId is not null" removes the all-NULL seed row each temp table was created with.
select eventId, startTime_adj, endTime_adj from day_event_clean where eventId is not null
union all
select eventId, startTime_adj, endTime_adj from night_event_clean where eventId is not null
union all
select eventId, startTime_adj, endTime_adj from cross_day_event_clean where eventId is not null
order by startTime_adj;
快速建议 #1(来自对脚本开头部分的快速浏览)
下面的片段过于冗长,更重要的是它对同一张表查询了三次——因此成本是应有成本的 3 倍!!!
-- query three arrays based on different conditions.
-- each condition needs a slightly different cleaning method.
-- Version from the question, quoted unchanged: three separate SET statements, each running
-- its own subquery over `event_raw_data`.
set day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_1
);
set night_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_2
);
set cross_day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_3
);
与其使用上面的写法,你应该使用下面这个更简洁、且只扫描一次表的脚本
-- Populate all three eventId arrays in a single pass over `event_raw_data`.
-- For each condition, rows that do not satisfy it yield NULL and are discarded by
-- IGNORE NULLS, so every array keeps only the eventIds of its own condition.
-- array_agg returns NULL (not an empty array) when no row qualifies; ifnull(..., [])
-- keeps the variable an empty array, as array(select ...) would, so that
-- array_length() and "in unnest(...)" behave the same for conditions with no matches.
set (day_event, night_event, cross_day_event) = (
  select as struct
    ifnull(array_agg(if(condition_1, eventId, null) ignore nulls), []) as day_ids,
    ifnull(array_agg(if(condition_2, eventId, null) ignore nulls), []) as night_ids,
    ifnull(array_agg(if(condition_3, eventId, null) ignore nulls), []) as cross_day_ids
  from (
    -- Timestamps are shifted back 6 hours so the conditions can use the adjusted values.
    select
      eventId,
      timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj,
      timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
    from `event_raw_data`
  )
);
我希望上面的例子能为你改写脚本的其余部分指明方向:仅仅因为条件不同,你其实并不需要把同样的代码重复多遍。你只需要应用与上面类似的逻辑——这并不总是那么简单,但在大多数情况下是完全可行的
另一个建议:在你的场景中使用循环完全没有必要——你不需要仅仅为了遍历数组元素而使用循环,而应该借助 unnest 函数,以基于集合的方式(即 SQL 的方式,在一个查询中)来完成
最后,如果你仍然需要帮助,我建议你把循环逻辑简化成一个示例,并作为单独的问题发布;否则当前这个问题过于宽泛,难以逐一回答并涵盖所有可能的改进(这里可改进的地方很多)
Update
我又有了几分钟时间,来跟进一下你的循环。
下面我们来改造你的第一个循环。
与其使用下面的脚本片段
-- array 1 and condition 1
-- Version from the question, quoted unchanged.
-- The table is seeded with one all-NULL row so the "union all select * from day_event_clean"
-- in the loop has a table to read on the first iteration.
create or replace temp table day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
-- One iteration per element of day_event; every iteration queries `event_raw_data` again
-- for a single eventId and rewrites the whole temp table.
loop
set i = i+1;
if i > array_length(day_event) then leave;
end if;
create or replace temp table day_event_clean as
-- here is the cleaning I need to do
-- 12-hour boundaries dt run from 00:00 of the start date to 12:00 of the end date; each
-- output row is the window [dt, dt + 12h] clipped to the event through greatest()/least().
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = day_event[ordinal(i)]) t cross join
unnest(generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from day_event_clean;
-- end of cleaning
end loop;
您可以只使用一个简单的查询
-- Clean every day event in one set-based statement: "in unnest(day_event)" selects all
-- events of the array at once, so `event_raw_data` is read a single time.
create or replace temp table day_event_clean as
with adjusted as (
  -- Day events with start/end timestamps shifted back 6 hours.
  select
    eventId,
    timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj,
    timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
  from `event_raw_data`
  where eventId in unnest(day_event)
)
select
  t.eventId,
  -- Clip each 12-hour window [dt, dt + 12h] to the event's own interval.
  greatest(t.startTime_adj, dt) as startTime_adj,
  least(timestamp_add(dt, interval 12 hour), t.endTime_adj) as endTime_adj
from adjusted as t
-- One row per 12-hour boundary from 00:00 of the start date to 12:00 of the end date.
cross join unnest(
  generate_timestamp_array(
    timestamp(date(t.startTime_adj)),
    timestamp_add(timestamp(date(t.endTime_adj)), interval 12 hour),
    interval 12 hour
  )
) as dt;
它不仅更简洁、更易于维护,还能为你节省 $$$:循环版本对 event_raw_data 表的扫描次数与相应数组中的元素个数一样多,而上面的查询只扫描一次!!!
同样的方法适用于其余两个循环
最后,在你应用以上所有内容、得到一个紧凑且易于管理的脚本之后,我真的希望你会发现,整个原始的 [庞大] 脚本可以用一个相对简单的查询来实现——只需一个。正如我之前提到的,如果你做到了这一步但仍然需要帮助,请带着届时整理出的脚本发布一个新问题
祝你好运:o)
我编写了一个脚本来执行以下操作
- 创建数组 day_event,其中包含满足条件 1、需要用清理方法 1 处理的事件的 eventId
- 为满足条件 2、需要用清理方法 2 处理的事件创建数组 night_event
- 为满足条件 3、需要用清理方法 3 处理的事件创建数组 cross_day_event
- 遍历 day_event,用方法 1 清理其中的每个事件
- 遍历 night_event,用方法 2 清理其中的每个事件
- 遍历 cross_day_event,用方法 3 清理其中的每个事件
-- 注意:清理后,一行会拆分成多行
- 查询原始数据,但去掉已被清理的原始条目,并合并所有清理后的数据
我相信脚本本来可以更简洁,但我不知道该怎么做。有什么建议么?谢谢!
-- Script variables: one eventId array per cleaning method, plus one counter per array.
declare day_event array<int64>;
declare night_event array<int64>;
declare cross_day_event array<int64>;
-- Counters start at 0; the loops later in this script increment them before
-- using them as 1-based positions via ordinal().
declare i int64 default 0;
declare j int64 default 0;
declare k int64 default 0;
-- query three arrays based on different conditions.
-- each condition needs a slightly different cleaning method.
-- Every SET below runs its own subquery over `event_raw_data`, shifting the start/end
-- timestamps back by 6 hours before the filter is applied.
-- condition_1 / condition_2 / condition_3 are placeholders for the real predicates
-- (presumably written against startTime_adj / endTime_adj — confirm).
set day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_1
);
set night_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_2
);
set cross_day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_3
);
-- array 1 and condition 1
-- Seed day_event_clean with a single all-NULL row so that the
-- "union all select * from day_event_clean" inside the loop already has a table with
-- the right column types on the first iteration. The final query of this script
-- drops the seed row with "eventId is not null".
create or replace temp table day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
-- One iteration per element of day_event: i is incremented first, then used as a 1-based ordinal.
loop
set i = i+1;
if i > array_length(day_event) then leave;
end if;
-- Replace the table with the cleaned rows of the current event plus everything accumulated so far.
create or replace temp table day_event_clean as
-- here is the cleaning I need to do
-- The event (timestamps shifted back 6 hours) is cross-joined with 12-hour boundaries dt
-- running from 00:00 of its start date to 12:00 of its end date; each output row is the
-- window [dt, dt + 12h] clipped to the event through greatest()/least().
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = day_event[ordinal(i)]) t cross join
unnest(generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from day_event_clean;
-- end of cleaning
end loop;
-- array 2 and condition 2
-- Seed night_event_clean with a single all-NULL row so that the
-- "union all select * from night_event_clean" inside the loop already has a table with
-- the right column types on the first iteration. The final query of this script
-- drops the seed row with "eventId is not null".
create or replace temp table night_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
-- One iteration per element of night_event: j is incremented first, then used as a 1-based ordinal.
loop
set j = j+1;
if j > array_length(night_event) then leave;
end if;
-- Replace the table with the cleaned rows of the current event plus everything accumulated so far.
create or replace temp table night_event_clean as
-- The event (timestamps shifted back 6 hours) is cross-joined with 12-hour boundaries dt
-- running from 12:00 of its start date to 12:00 of its end date; each output row is the
-- window [dt, dt + 12h] clipped to the event through greatest()/least().
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = night_event[ordinal(j)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from night_event_clean;
end loop;
-- array 3 and condition 3
-- Seed cross_day_event_clean with a single all-NULL row so that the
-- "union all select * from cross_day_event_clean" inside the loop already has a table with
-- the right column types on the first iteration. The final query of this script
-- drops the seed row with "eventId is not null".
create or replace temp table cross_day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
-- One iteration per element of cross_day_event: k is incremented first, then used as a 1-based ordinal.
loop
set k = k+1;
if k > array_length(cross_day_event) then leave;
end if;
-- Replace the table with the cleaned rows of the current event plus everything accumulated so far.
create or replace temp table cross_day_event_clean as
-- The event (timestamps shifted back 6 hours) is cross-joined with 12-hour boundaries dt
-- running from 12:00 of its start date to 00:00 of its end date; each output row is the
-- window [dt, dt + 12h] clipped to the event through greatest()/least().
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = cross_day_event[ordinal(k)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp(date(endTime_adj)), interval 12 hour)) dt
union all select * from cross_day_event_clean;
end loop;
-- Final result: events that needed no cleaning, plus the cleaned (split) rows of the
-- three temp tables, ordered by adjusted start time.
-- The eventId arrays built at the top of the script already hold exactly the events that
-- matched condition_1 / condition_2 / condition_3, so they are reused here to exclude the
-- original rows of cleaned events (an id cannot be compared against a boolean expression).
select
  eventId,
  timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj,
  timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`
where eventId not in unnest(array_concat(day_event, night_event, cross_day_event))
union all
-- "eventId is not null" removes the all-NULL seed row each temp table was created with.
select eventId, startTime_adj, endTime_adj from day_event_clean where eventId is not null
union all
select eventId, startTime_adj, endTime_adj from night_event_clean where eventId is not null
union all
select eventId, startTime_adj, endTime_adj from cross_day_event_clean where eventId is not null
order by startTime_adj;
快速建议 #1(来自对脚本开头部分的快速浏览)
下面的片段过于冗长,更重要的是它对同一张表查询了三次——因此成本是应有成本的 3 倍!!!
-- query three arrays based on different conditions.
-- each condition needs a slightly different cleaning method.
-- Version from the question, quoted unchanged: three separate SET statements, each running
-- its own subquery over `event_raw_data`.
set day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_1
);
set night_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_2
);
set cross_day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_3
);
与其使用上面的写法,你应该使用下面这个更简洁、且只扫描一次表的脚本
-- Populate all three eventId arrays in a single pass over `event_raw_data`.
-- For each condition, rows that do not satisfy it yield NULL and are discarded by
-- IGNORE NULLS, so every array keeps only the eventIds of its own condition.
-- array_agg returns NULL (not an empty array) when no row qualifies; ifnull(..., [])
-- keeps the variable an empty array, as array(select ...) would, so that
-- array_length() and "in unnest(...)" behave the same for conditions with no matches.
set (day_event, night_event, cross_day_event) = (
  select as struct
    ifnull(array_agg(if(condition_1, eventId, null) ignore nulls), []) as day_ids,
    ifnull(array_agg(if(condition_2, eventId, null) ignore nulls), []) as night_ids,
    ifnull(array_agg(if(condition_3, eventId, null) ignore nulls), []) as cross_day_ids
  from (
    -- Timestamps are shifted back 6 hours so the conditions can use the adjusted values.
    select
      eventId,
      timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj,
      timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
    from `event_raw_data`
  )
);
我希望上面的例子能为你改写脚本的其余部分指明方向:仅仅因为条件不同,你其实并不需要把同样的代码重复多遍。你只需要应用与上面类似的逻辑——这并不总是那么简单,但在大多数情况下是完全可行的
另一个建议:在你的场景中使用循环完全没有必要——你不需要仅仅为了遍历数组元素而使用循环,而应该借助 unnest 函数,以基于集合的方式(即 SQL 的方式,在一个查询中)来完成
最后,如果你仍然需要帮助,我建议你把循环逻辑简化成一个示例,并作为单独的问题发布;否则当前这个问题过于宽泛,难以逐一回答并涵盖所有可能的改进(这里可改进的地方很多)
Update
我又有了几分钟时间,来跟进一下你的循环。
下面我们来改造你的第一个循环。
与其使用下面的脚本片段
-- array 1 and condition 1
-- Version from the question, quoted unchanged.
-- The table is seeded with one all-NULL row so the "union all select * from day_event_clean"
-- in the loop has a table to read on the first iteration.
create or replace temp table day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
-- One iteration per element of day_event; every iteration queries `event_raw_data` again
-- for a single eventId and rewrites the whole temp table.
loop
set i = i+1;
if i > array_length(day_event) then leave;
end if;
create or replace temp table day_event_clean as
-- here is the cleaning I need to do
-- 12-hour boundaries dt run from 00:00 of the start date to 12:00 of the end date; each
-- output row is the window [dt, dt + 12h] clipped to the event through greatest()/least().
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = day_event[ordinal(i)]) t cross join
unnest(generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from day_event_clean;
-- end of cleaning
end loop;
您可以只使用一个简单的查询
-- Clean every day event in one set-based statement: "in unnest(day_event)" selects all
-- events of the array at once, so `event_raw_data` is read a single time.
create or replace temp table day_event_clean as
with adjusted as (
  -- Day events with start/end timestamps shifted back 6 hours.
  select
    eventId,
    timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj,
    timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
  from `event_raw_data`
  where eventId in unnest(day_event)
)
select
  t.eventId,
  -- Clip each 12-hour window [dt, dt + 12h] to the event's own interval.
  greatest(t.startTime_adj, dt) as startTime_adj,
  least(timestamp_add(dt, interval 12 hour), t.endTime_adj) as endTime_adj
from adjusted as t
-- One row per 12-hour boundary from 00:00 of the start date to 12:00 of the end date.
cross join unnest(
  generate_timestamp_array(
    timestamp(date(t.startTime_adj)),
    timestamp_add(timestamp(date(t.endTime_adj)), interval 12 hour),
    interval 12 hour
  )
) as dt;
它不仅更简洁、更易于维护,还能为你节省 $$$:循环版本对 event_raw_data 表的扫描次数与相应数组中的元素个数一样多,而上面的查询只扫描一次!!!
同样的方法适用于其余两个循环
最后,在你应用以上所有内容、得到一个紧凑且易于管理的脚本之后,我真的希望你会发现,整个原始的 [庞大] 脚本可以用一个相对简单的查询来实现——只需一个。正如我之前提到的,如果你做到了这一步但仍然需要帮助,请带着届时整理出的脚本发布一个新问题
祝你好运:o)