I wrote a script to do the following
- create array
day_event
that contains eventId for events under condition 1 that needs cleaning method 1 - create array
night_event
for condition 2 that needs cleaning method 2 - create array
cross_day_event
for condition 3 that needs cleaning method 3 - query loop thru
day_event
and clean each event using method 1 - query loop thru
night_event
and clean each event using method 2 - query loop thru
cross_day_event
and clean each event using method 3
-- NOTE: after cleaning, one row will be broken down to several rows - query original data but removing the original entry, and union all the cleaned data
I believe the script could have been a lot cleaner but I don't know how. Any suggestions? Thanks!
declare day_event array<int64>;
declare night_event array<int64>;
declare cross_day_event array<int64>;
declare i int64 default 0;
declare j int64 default 0;
declare k int64 default 0;
-- query three arrays based on different conditions.
-- each condition needs a slightly different cleaning method.
set day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_1
);
set night_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_2
);
set cross_day_event = array(
select eventId from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj
from `event_raw_data`)
where condition_3
);
-- array 1 and condition 1
create or replace temp table day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
loop
set i = i+1;
if i > array_length(day_event) then leave;
end if;
create or replace temp table day_event_clean as
-- here is the cleaning I need to do
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = day_event[ordinal(i)]) t cross join
unnest(generate_timestamp_array(timestamp(date(startTime_adj)), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from day_event_clean;
-- end of cleaning
end loop;
-- array 2 and condition 2
create or replace temp table night_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
loop
set j = j+1;
if j > array_length(night_event) then leave;
end if;
create or replace temp table night_event_clean as
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = night_event[ordinal(j)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp_add(timestamp(date(endTime_adj)), interval 12 hour), interval 12 hour)) dt
union all select * from night_event_clean;
end loop;
-- array 3 and condition 3
create or replace temp table cross_day_event_clean as
select cast(null as int64) as eventId, cast(null as timestamp) as startTime_adj, cast(null as timestamp) as endTime_adj;
loop
set k = k+1;
if k > array_length(cross_day_event) then leave;
end if;
create or replace temp table cross_day_event_clean as
select t.eventId, greatest(startTime_adj, dt) as startTime_adj, least(timestamp_add(dt, interval 12 hour), endTime_adj) as endTime_adj from
(select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId = cross_day_event[ordinal(k)]) t cross join
unnest(generate_timestamp_array(timestamp_add(timestamp(date(startTime_adj)), interval 12 hour), timestamp(date(endTime_adj)), interval 12 hour)) dt
union all select * from cross_day_event_clean;
end loop;
--query the original data and union with all the cleaned data
select eventId, timestamp_sub(startTime_CST, interval +6 hour) as startTime_adj, timestamp_sub(endTime_CST, interval +6 hour) as endTime_adj from `event_raw_data` where eventId not in
(condition_1 or condition_2 or conditon_3) union all
select * from day_event_clean where eventId is not null union all
select * from night_event_clean where eventId is not null union all
select * from cross_day_event_clean where eventId is not null
order by startTime_adj;
Quick recommendation #1 from quick glance of top part of your script
So below fragment is too verbose and most importantly queries the very same table three times - so cost is 3 times more than it should be!!!
instead of above you should use below simple and with just one scan of your table script
I hope above example will put you into right direction with rest of your script as you do not really need to repeat you stuff as many times as conditions you do have. you just need to apply similar to above logic - which is not that simple all the times - but in most cases is quite doable
Yet another advice - use of loops in your case is not justified at all - you don't need to do loop just to iterate through arrays elements - you rather do it in set way (sql way - in one query) using unnest function
Finally, if you still will need a help - I recommend you to simplify example of your loop logic and post it as a separate question - otherwise it (current question) is too broad to answer and address all possible improvements (which a quite a tons here to be made)
Got some few extra minutes to follow up on your loops.
So let's transform your first loop
Instead of below fragment of your script
you can simply use just one simple query
not only it is less verbose and easier to maintain - it saves you $$$ because instead of scanning
event_raw_data
table as many times as elements in respective array - above query does it just ones!!!same approach applicable to the rest two loops
and finally, after you will apply all above and will end up with compact and manageable script - I really hope that you will see that the whole your original [huge] script can be implemented as one relatively simple query - just one. as I mentioned before - if you will get to this point and still will need help - post new question using script that you wil be able to get to by that moment
good luck :o)