在 BigQuery 中按数组循环和合并循环结果

Loop by array and union looped result in BigQuery



declare day_event array<int64>;
declare night_event array<int64>;
declare cross_day_event array<int64>;
declare i int64 default 0;
declare j int64 default 0;
declare k int64 default 0;

-- query three arrays based on different conditions. 
-- each condition needs a slightly different cleaning method.
set day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_1
set night_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_2
set cross_day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_3

-- array 1 and condition 1
create or replace temp table day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;

set i = i+1;
if i > array_length(day_event) then leave;
end if;
create or replace temp table day_event_clean as
-- here is the cleaning I need to do
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from 
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = day_event[ordinal(i)]) t cross join
unnest(generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from day_event_clean;
-- end of cleaning
end loop;

-- array 2 and condition 2
create or replace temp table night_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;

set j = j+1;
if j > array_length(night_event) then leave;
end if;
create or replace temp table night_event_clean as
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from 
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = night_event[ordinal(j)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from night_event_clean;
end loop;

-- array 3 and condition 3
create or replace temp table cross_day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;

set k = k+1;
if k > array_length(cross_day_event) then leave;
end if;
create or replace temp table cross_day_event_clean as 
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from 
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = cross_day_event[ordinal(k)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp(date(endTime_adj)), interval 12 hour)) dt
union all select * from cross_day_event_clean;
end loop;

--query the original data and union with all the cleaned data
select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId not in
(condition_1 or condition_2 or conditon_3) union all
select * from day_event_clean where eventId is not null union all
select * from night_event_clean where eventId is not null union all
select * from cross_day_event_clean where eventId is not null
order by startTime_adj;

快速浏览脚本顶部的快速推荐 #1

所以下面的片段太冗长了,最重要的是查询同一个 table 三次 - 所以成本是应该的 3 倍!!!

set (day_event, night_event, cross_day_event) = (
  select as struct
    array_agg(if(condition_1, id, null) ignore nulls ) array1,
    array_agg(if(condition_2, id, null) ignore nulls ) array2,
    array_agg(if(condition_3, id, null) ignore nulls ) array3
  from (
    select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
    from `event_raw_data` 


另一个建议 - 在你的情况下使用循环根本没有道理 - 你不需要循环只是为了遍历数组元素 - 你宁愿以设定的方式进行(sql 方式- 在一个查询中)使用 unnest 函数

最后,如果您仍然需要帮助 - 我建议您简化循环逻辑示例并将其 post 作为一个单独的问题 - 否则它(当前问题)太宽泛而无法回答和解决所有可能的改进(这里要做的很多)



create or replace temp table day_event_clean as
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj 
from (
  select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj 
  from `event_raw_data` 
  where eventId in unnest(day_event)
) t 
cross join unnest(
  generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)
) dt     

不仅不那么冗长而且更易于维护 - 它可以为您节省 $$$ 因为它不需要扫描 event_raw_data table 与相应数组中的元素一样多的次数 - 上面的查询只是一个!!!


最后,在你应用以上所有内容并最终得到紧凑且易于管理的脚本之后 - 我真的希望你会看到整个原始 [巨大] 脚本可以作为一个相对简单的查询来实现 - 只是一。正如我之前提到的 - 如果你到了这一点并且仍然需要帮助 - post 使用脚本的新问题,到那时你将能够得到
