I am using the below query to execute on AWS Athena. It gives error 'Query exhausted resources at this scale factor. '. In the query stats tab, I see that the data scanned is approximately 36 GB.
WITH session_dataset AS (
SELECT user_id,
max(medium) as medium,
max(event_date) as event_date,
session_id
FROM view_session
where date(event_date) <= date_add('day', - 1, current_date)
and date(event_date) >= date_add('day', - 90, current_date)
and category not in ('Offline Sources')
GROUP BY user_id,
session_id
),
user_conversion AS (
select user_id,
session_id,
name,
event_date,
has_crm,
customer_retention_type
from view_session
where cohort_type = 'conversion'
and name is not null
and date(event_date) <= date_add('day', - 1, current_date)
and date(event_date) >= date_add('day', - 90, current_date)
),
dataset_yesterday AS (
SELECT uc.user_id,
uc.name,
max(uc.has_crm) as has_crm,
max(uc.customer_retention_type) as customer_retention_type,
count(sd.session_id) as view_count,
date_diff(
'day',
date(min(sd.event_date)),
date(max(uc.event_date))
) AS days_convert,
array_agg(
sd.medium
order by sd.event_date,
split(sd.session_id, '_') [ 1 ] asc
) as medium_list
FROM session_dataset sd,
user_conversion uc
where date(sd.event_date) <= date(uc.event_date)
and date(sd.event_date) >= date_add('day', - 1, current_date)
and uc.user_id = sd.user_id
and split(uc.session_id, '_') [ 1 ] >= split(sd.session_id, '_') [ 1 ]
GROUP BY uc.user_id,
uc.session_id,
uc.name
),
dataset_week AS (
SELECT uc.user_id,
uc.name,
max(uc.has_crm) as has_crm,
max(uc.customer_retention_type) as customer_retention_type,
count(sd.session_id) as view_count,
date_diff(
'day',
date(min(sd.event_date)),
date(max(uc.event_date))
) AS days_convert,
array_agg(
sd.medium
order by sd.event_date,
split(sd.session_id, '_') [ 1 ] asc
) as medium_list
FROM session_dataset sd,
user_conversion uc
where date(sd.event_date) <= date(uc.event_date)
and date(sd.event_date) >= date_add('day', - 7, current_date)
and uc.user_id = sd.user_id
and split(uc.session_id, '_') [ 1 ] >= split(sd.session_id, '_') [ 1 ]
GROUP BY uc.user_id,
uc.session_id,
uc.name
),
dataset_month AS (
SELECT uc.user_id,
uc.name,
max(uc.has_crm) as has_crm,
max(uc.customer_retention_type) as customer_retention_type,
count(sd.session_id) as view_count,
date_diff(
'day',
date(min(sd.event_date)),
date(max(uc.event_date))
) AS days_convert,
array_agg(
sd.medium
order by sd.event_date,
split(sd.session_id, '_') [ 1 ] asc
) as medium_list
FROM session_dataset sd,
user_conversion uc
where date(sd.event_date) <= date(uc.event_date)
and date(sd.event_date) >= date_add('day', - 30, current_date)
and uc.user_id = sd.user_id
and split(uc.session_id, '_') [ 1 ] >= split(sd.session_id, '_') [ 1 ]
GROUP BY uc.user_id,
uc.session_id,
uc.name
),
dataset_quarter AS (
SELECT uc.user_id,
uc.name,
max(uc.has_crm) as has_crm,
max(uc.customer_retention_type) as customer_retention_type,
count(sd.session_id) as view_count,
date_diff(
'day',
date(min(sd.event_date)),
date(max(uc.event_date))
) AS days_convert,
array_agg(
sd.medium
order by sd.event_date,
split(sd.session_id, '_') [ 1 ] asc
) as medium_list
FROM session_dataset sd,
user_conversion uc
where date(sd.event_date) <= date(uc.event_date)
and date(sd.event_date) >= date_add('day', - 90, current_date)
and uc.user_id = sd.user_id
and split(uc.session_id, '_') [ 1 ] >= split(sd.session_id, '_') [ 1 ]
GROUP BY uc.user_id,
uc.session_id,
uc.name
)
select 'yesterday' as window,
name,
sum(days_convert) as days_convert,
count(name) as total_conversion,
sum(view_count) as total_view,
count(
distinct IF(has_crm = '1', user_id, NULL)
) AS customer_count,
count(distinct IF(has_crm != '1' or has_crm is null, user_id, NULL)) AS anonymous_customer_count,
count(
distinct IF(
lower(customer_retention_type) = 'returning',
user_id,
NULL
)
) AS returning_customer_count,
count(
distinct IF(
lower(customer_retention_type) = 'new',
user_id,
NULL
)
) AS new_customer_count,
medium_list [ 1 ] as first_click,
medium_list [ cardinality(medium_list) ] as last_click,
medium_list
from dataset_yesterday
group by name,
medium_list
union all
select 'month' as window,
name,
sum(days_convert) as days_convert,
count(name) as total_conversion,
sum(view_count) as total_view,
count(
distinct IF(has_crm = '1', user_id, NULL)
) AS customer_count,
count(distinct IF(has_crm != '1' or has_crm is null, user_id, NULL)) AS anonymous_customer_count,
count(
distinct IF(
lower(customer_retention_type) = 'returning',
user_id,
NULL
)
) AS returning_customer_count,
count(
distinct IF(
lower(customer_retention_type) = 'new',
user_id,
NULL
)
) AS new_customer_count,
medium_list [ 1 ] as first_click,
medium_list [ cardinality(medium_list) ] as last_click,
medium_list
from dataset_month
group by name,
medium_list
union all
union all
select 'quarter' as window,
name,
sum(days_convert) as days_convert,
count(name) as total_conversion,
sum(view_count) as total_view,
count(
distinct IF(has_crm = '1', user_id, NULL)
) AS customer_count,
count(distinct IF(has_crm != '1' or has_crm is null, user_id, NULL)) AS anonymous_customer_count,
count(
distinct IF(
lower(customer_retention_type) = 'returning',
user_id,
NULL
)
) AS returning_customer_count,
count(
distinct IF(
lower(customer_retention_type) = 'new',
user_id,
NULL
)
) AS new_customer_count,
medium_list [ 1 ] as first_click,
medium_list [ cardinality(medium_list) ] as last_click,
from dataset_quarter
group by name,
medium_list
I found similar queries in Stack Overflow. In one post, they asked to remove the order by clause.
How can I do that in the above query?
We were able to resolve this by using CTAS temporary tables. In the query given in the question, everything is in cache calculation and hence it was consuming more DPU.
We created temporary tables at runtime and stored data for session_dataset, user_conversion and then finally executed the select statement on these temporary tables.
We are using DBT so all the above things can be handled programmatically. Please find the solution below:
===============