Trend calculation using snowflake sql

78 Views Asked by At

I have a request to get the trend for Nov-22 and Dec-22 based on the below data for x 9 & 10. Is it possible to use any of the sql linear regression function to accomplish this request ?

below is a sample, I am trying to get the trend for the unknown (Nov & Dec)

with
data_table(month, x, y) as
(
    select * from values
      ('Mar-22',1 ,7894),
      ('Apr-22',2 ,7964),
      ('May-22',3 ,8016),
      ('Jun-22', 4, 8005),
      ('Jul-22', 5, 8063),
      ('Aug-22', 6, 8101),
      ('Sep-22', 7, 8101),
      ('Oct-22', 8, 8204)
)
SELECT * FROM data_table;

I would like the output to be like:

Month Order Amount Trend
Mar-22 1 7894 7937.622222
Apr-22 2 7964 7983.272222
May-22 3 8016 8028.922222
Jun-22 4 8005 8074.572222
Jul-22 5 8063 8120.222222
Aug-22 6 8101 8165.872222
Sep-22 7 8101 8211.522222
Oct-22 8 8204 8257.172222
Nov-22 9 8305 8302.822222
2

There are 2 best solutions below

7
Simeon Pilgrim On BEST ANSWER
with data_table(month, x, y) as
(
    select * from values
      ('Mar-22',1 ,7894),
      ('Apr-22',2 ,7964),
      ('May-22',3 ,8016),
      ('Jun-22',4, 8005),
      ('Jul-22',5, 8063),
      ('Aug-22',6, 8101),
      ('Sep-22',7 , 8101),
      ('Oct-22',8 , 8204)
)
select
    REGR_SLOPE(y, x) as rs
    ,REGR_INTERCEPT(y,x) as ri
from data_table;

works as expected. And thus the extra values can be found:

with data_table(month, x, y) as
(
    select * from values
      ('Mar-22',1 ,7894),
      ('Apr-22',2 ,7964),
      ('May-22',3 ,8016),
      ('Jun-22',4, 8005),
      ('Jul-22',5, 8063),
      ('Aug-22',6, 8101),
      ('Sep-22',7 , 8101),
      ('Oct-22',8 , 8204)
)
select 
    nx,
    round(ri + (nx*rs),1) as ny
from values (9),(10) as n(nx)
cross join (    
    select
        REGR_SLOPE(y, x) as rs
        ,REGR_INTERCEPT(y,x) as ri
    from data_table
)
NX NY
9 8213.2
10 8250.9

with estimated values:

with data_table(month, x, y) as
(
    select * from values
      ('Mar-22',1 ,7894),
      ('Apr-22',2 ,7964),
      ('May-22',3 ,8016),
      ('Jun-22',4, 8005),
      ('Jul-22',5, 8063),
      ('Aug-22',6, 8101),
      ('Sep-22',7 , 8101),
      ('Oct-22',8 , 8204)
), output as (
    select * from data_table
    union all 
    select * from values
     ('Nov-22',9 ,null),
     ('Dec-22',10 ,null)
)
select 
    n.*,
    round(ri + (n.x*rs),1) as ny
from output as n
cross join (    
    select
        REGR_SLOPE(y, x) as rs
        ,REGR_INTERCEPT(y,x) as ri
    from data_table
)
order by n.x
MONTH X Y NY
Mar-22 1 7894 7911.5
Apr-22 2 7964 7949.2
May-22 3 8016 7986.9
Jun-22 4 8005 8024.6
Jul-22 5 8063 8062.4
Aug-22 6 8101 8100.1
Sep-22 7 8101 8137.8
Oct-22 8 8204 8175.5
Nov-22 9 8213.2
Dec-22 10 8250.9
2
Racoding On

You inspired me to go and implement linear regression in SQL Server. This could be modified for MySQL/Oracle/Whatever without too much trouble. It's the mathematically best way of determining the trend over the hour for each entity_id and it will select out only the ones with a positive trend.

It implements the formula for calculating B1hat listed here: https://en.wikipedia.org/wiki/Regression_analysis#Linear_regression

create table #temp
(
    entity_id int,
    value int,
    [date] datetime
)

insert into #temp (entity_id, value, [date])
values
(1,10,'20140102 07:00:00 AM'),
(1,20,'20140102 07:15:00 AM'),
(1,30,'20140102 07:30:00 AM'),
(2,50,'20140102 07:00:00 AM'),
(2,20,'20140102 07:47:00 AM'),
(3,40,'20140102 07:00:00 AM'),
(3,40,'20140102 07:52:00 AM')

select entity_id, 1.0*sum((x-xbar)*(y-ybar))/sum((x-xbar)*(x-xbar)) as Beta
from
(
    select entity_id,
        avg(value) over(partition by entity_id) as ybar,
        value as y,
        avg(datediff(second,'20140102 07:00:00 AM',[date])) over(partition by entity_id) as xbar,
        datediff(second,'20140102 07:00:00 AM',[date]) as x
    from #temp
    where [date]>='20140102 07:00:00 AM' and [date]<'20140102 08:00:00 AM'
) as Calcs
group by entity_id
having 1.0*sum((x-xbar)*(y-ybar))/sum((x-xbar)*(x-xbar))>0