We store historic data from the BigQuery INFORMATION_SCHEMA.JOBS in order to be able to analyse historic jobs for longer than the 6 months after which BigQuery deletes them.
However on 2024-01-16 the schema changed, adding an additional field bi_engine_reasons into the query_info STRUCT at the column path query_info.performance_insights.stage_performance_standalone_insights.bi_engine_reasons and causing the insert job to fail due to schema mismatch.
After a huge amount of trial and error, I finally managed to manually construct a SQL query which adds a NULL version of this ARRAY in the correct position, however it was extremely laborious and time-consuming to identify and implement. It is also verbose and not reusable.
Is there a better way to achieve this in a cleaner, more concise and reusable manner?
Posting my (ugly) solution here, it's not pretty but it does work.
CREATE OR REPLACE TABLE `[NEW_SCHEMA_TABLE_ID]`
PARTITION BY DATE(creation_time)
AS
WITH
get_historic_data AS (
SELECT *
FROM `[OLD_SCHEMA_TABLE_ID]`
),
replace_query_info AS (
SELECT
creation_time,
project_id,
project_number,
user_email,
job_id,
job_type,
statement_type,
priority,
start_time,
end_time,
query,
state,
reservation_id,
total_bytes_processed,
total_slot_ms,
error_result,
cache_hit,
destination_table,
referenced_tables,
labels,
timeline,
job_stages,
total_bytes_billed,
transaction_id,
parent_job_id,
session_info,
dml_statistics,
total_modified_partitions,
bi_engine_statistics,
--query_info
STRUCT (
query_info.resource_warning,
query_info.optimization_details,
--query_hashes
STRUCT (
query_info.query_hashes.normalized_literals
) AS query_hashes,
--performance_insights
STRUCT (
query_info.performance_insights.avg_previous_execution_ms,
--stage_performance_standalone_insights
ARRAY( SELECT AS STRUCT
stage_performance_standalone_insight.stage_id AS stage_id,
stage_performance_standalone_insight.slot_contention,
stage_performance_standalone_insight.insufficient_shuffle_quota,
--bi_engine_reasons
ARRAY (SELECT AS STRUCT
CAST(NULL AS STRING) AS code,
CAST(NULL AS STRING) AS message
) AS bi_engine_reasons
) AS stage_performance_standalone_insights,
--stage_performance_change_insights
ARRAY( SELECT AS STRUCT
stage_performance_change_insight.stage_id AS stage_id,
STRUCT(stage_performance_change_insight.input_data_change.records_read_diff_percentage) AS input_data_change
) AS stage_performance_change_insights
) AS performance_insights
) AS query_info,
transferred_bytes,
materialized_view_statistics
FROM get_historic_data
LEFT JOIN UNNEST(query_info.performance_insights.stage_performance_standalone_insights) AS stage_performance_standalone_insight
LEFT JOIN UNNEST(query_info.performance_insights.stage_performance_change_insights) AS stage_performance_change_insight
)
SELECT *
FROM replace_query_info