Loop by array and union looped result in BigQuery

2.2k Views Asked by At

I wrote a script to do the following

  • create array day_event that contains eventId for events under condition 1 that needs cleaning method 1
  • create array night_event for condition 2 that needs cleaning method 2
  • create array cross_day_event for condition 3 that needs cleaning method 3
  • query loop thru day_event and clean each event using method 1
  • query loop thru night_event and clean each event using method 2
  • query loop thru cross_day_event and clean each event using method 3
    -- NOTE: after cleaning, one row will be broken down to several rows
  • query original data but removing the original entry, and union all the cleaned data

I believe the script could have been a lot cleaner but I don't know how. Any suggestions? Thanks!

declare day_event array<int64>;
declare night_event array<int64>;
declare cross_day_event array<int64>;
declare i int64 default 0;
declare j int64 default 0;
declare k int64 default 0;

-- query three arrays based on different conditions. 
-- each condition needs a slightly different cleaning method.
set day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_1
);
set night_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_2
);
set cross_day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_3
);

-- array 1 and condition 1
create or replace temp table day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;

loop 
set i = i+1;
if i > array_length(day_event) then leave;
end if;
create or replace temp table day_event_clean as
-- here is the cleaning I need to do
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from 
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = day_event[ordinal(i)]) t cross join
unnest(generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from day_event_clean;
-- end of cleaning
end loop;

-- array 2 and condition 2
create or replace temp table night_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;

loop
set j = j+1;
if j > array_length(night_event) then leave;
end if;
create or replace temp table night_event_clean as
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from 
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = night_event[ordinal(j)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from night_event_clean;
end loop;

-- array 3 and condition 3
create or replace temp table cross_day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;

loop
set k = k+1;
if k > array_length(cross_day_event) then leave;
end if;
create or replace temp table cross_day_event_clean as 
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from 
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = cross_day_event[ordinal(k)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp(date(endTime_adj)), interval 12 hour)) dt
union all select * from cross_day_event_clean;
end loop;

--query the original data and union with all the cleaned data
select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId not in
(condition_1 or condition_2 or conditon_3) union all
select * from day_event_clean where eventId is not null union all
select * from night_event_clean where eventId is not null union all
select * from cross_day_event_clean where eventId is not null
order by startTime_adj;
1

There are 1 best solutions below

1
On BEST ANSWER

Quick recommendation #1 from quick glance of top part of your script

So below fragment is too verbose and most importantly queries the very same table three times - so cost is 3 times more than it should be!!!

-- query three arrays based on different conditions. 
-- each condition needs a slightly different cleaning method.
set day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_1
);
set night_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_2
);
set cross_day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_3
);    

instead of above you should use below simple and with just one scan of your table script

set (day_event, night_event, cross_day_event) = (
  select as struct
    array_agg(if(condition_1, id, null) ignore nulls ) array1,
    array_agg(if(condition_2, id, null) ignore nulls ) array2,
    array_agg(if(condition_3, id, null) ignore nulls ) array3
  from (
    select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
    from `event_raw_data` 
  )
);   

I hope above example will put you into right direction with rest of your script as you do not really need to repeat you stuff as many times as conditions you do have. you just need to apply similar to above logic - which is not that simple all the times - but in most cases is quite doable

Yet another advice - use of loops in your case is not justified at all - you don't need to do loop just to iterate through arrays elements - you rather do it in set way (sql way - in one query) using unnest function

Finally, if you still will need a help - I recommend you to simplify example of your loop logic and post it as a separate question - otherwise it (current question) is too broad to answer and address all possible improvements (which a quite a tons here to be made)

Update

Got some few extra minutes to follow up on your loops.
So let's transform your first loop
Instead of below fragment of your script

-- array 1 and condition 1
create or replace temp table day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;

loop 
set i = i+1;
if i > array_length(day_event) then leave;
end if;
create or replace temp table day_event_clean as
-- here is the cleaning I need to do
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from 
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = day_event[ordinal(i)]) t cross join
unnest(generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from day_event_clean;
-- end of cleaning
end loop;   

you can simply use just one simple query

create or replace temp table day_event_clean as
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj 
from (
  select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj 
  from `event_raw_data` 
  where eventId in unnest(day_event)
) t 
cross join unnest(
  generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)
) dt     

not only it is less verbose and easier to maintain - it saves you $$$ because instead of scanning event_raw_data table as many times as elements in respective array - above query does it just ones!!!

same approach applicable to the rest two loops

and finally, after you will apply all above and will end up with compact and manageable script - I really hope that you will see that the whole your original [huge] script can be implemented as one relatively simple query - just one. as I mentioned before - if you will get to this point and still will need help - post new question using script that you wil be able to get to by that moment

good luck :o)