Python Pandas resample year data using day time frame data

434 Views Asked by At

This is small sample of the daily ohlc data I have stored in df1.

date                open    close   high    low
2019-01-01 00:00:00 3700    3800    3806    3646
2019-01-02 00:00:00 3800    3857    3880    3750
2019-01-03 00:00:00 3858    3766    3863    3729
2019-01-04 00:00:00 3768    3791    3821    3706
2019-01-05 00:00:00 3789    3772    3839    3756
2019-01-06 00:00:00 3776    3988    4023    3747
2019-01-07 00:00:00 3985    3972    4018    3928

I want to create a dataframe (df2) that represents what the active year candle looks like as it progresses. The close is based on the current days close, the high is the max from Jan1 to current day, the low is the min from Jan1 to current day and the open is based on the open of the year. Which should look something like this:

date                open    close   high    low
2019-01-01 00:00:00 3700    3800    3806    3646
2019-01-02 00:00:00 3700    3857    3880    3646
2019-01-03 00:00:00 3700    3766    3880    3646
2019-01-04 00:00:00 3700    3791    3880    3646
2019-01-05 00:00:00 3700    3772    3880    3646
2019-01-06 00:00:00 3700    3988    4023    3646
2019-01-07 00:00:00 3700    3972    4023    3646

Would love to put some code but I'm lost here, I thought resample would help me but it just summarizes the entire year into one row of data. I also want think I can figure this out iterating through everyday and resampling but I know that slows computation a lot so I'm hoping to see if this is possible with vectorization. This is my first time posting so let me know if there is any guidelines I need to improve on.

---------------EDIT------------------

Here is my full code with year working but other time frames not working, hopefully it will be easier to replicate the bad results as im pulling data from public source yfinance.

import pandas as pd
import yfinance as yf

#not working
def resample_active_week(df):
    df2 = pd.DataFrame()

    # high is the max from Jan1 to current day
    df2['high'] = df.groupby(df.index.isocalendar().week)['high'].cummax()

    # low is the min from Jan1 to current day 
    df2['low'] = df.groupby(df.index.isocalendar().week)['low'].cummin()

    #close
    df2['close'] = df['close']

    # open is based on the open of the current week
    df2['open'] = df.groupby(df.index.isocalendar().week)['open'].head(1)
    df2=df2.fillna(method='ffill')

    return df2
#not working    
def resample_active_month(df):
    df2 = pd.DataFrame()

    # high is the max from Jan1 to current day
    df2['high'] = df.groupby(df.index.month)['high'].cummax()

    # low is the min from Jan1 to current day 
    df2['low'] = df.groupby(df.index.month)['low'].cummin()

    #close
    df2['close'] = df['close']

    # open is based on the open of the current month
    df2['open'] = df.groupby(df.index.month)['open'].head(1)
    df2=df2.fillna(method='ffill')

    return df2

#not working
def resample_active_quarter(df):
    df2 = pd.DataFrame()

    # high is the max from Jan1 to current day
    df2['high'] = df.groupby(df.index.quarter)['high'].cummax()

    # low is the min from Jan1 to current day 
    df2['low'] = df.groupby(df.index.quarter)['low'].cummin()

    #close
    df2['close'] = df['close']

    # open is based on the open of the current quarter
    df2['open'] = df.groupby(df.index.quarter)['open'].head(1)
    df2=df2.fillna(method='ffill')

    return df2
#working
def resample_active_year(df):
    df2 = pd.DataFrame()
    
    # high is the max from Jan1 to current day
    df2['high'] = df.groupby(df.index.year)['high'].cummax()

    # low is the min from Jan1 to current day 
    df2['low'] = df.groupby(df.index.year)['low'].cummin()

    #close
    df2['close'] = df['close']

    # open is based on the open of the current year
    df2['open'] = df.groupby(df.index.year)['open'].head(1)
    df2=df2.fillna(method='ffill')

    return df2

df = yf.download(tickers='BTC-USD', period = 'max', interval = '1d',auto_adjust = True)
df.rename(columns={'Open':'open', 'High':'high','Low':'low','Close':'close'}, inplace=True)
df = df.drop(['Volume'],axis=1)

df2 = resample_active_week(df)
df3 = resample_active_month(df)
df4 = resample_active_quarter(df)
df5 = resample_active_year(df)

with pd.ExcelWriter('ResampleOut.xlsx', engine="openpyxl", mode="w") as writer:
            df.to_excel(writer, sheet_name='df_original')
            df2.to_excel(writer, sheet_name='df2_week')
            df3.to_excel(writer, sheet_name='df3_month')
            df4.to_excel(writer, sheet_name='df4_quarter')
            df5.to_excel(writer, sheet_name='df5_year')
1

There are 1 best solutions below

5
On BEST ANSWER
# set date as the index
df = df.set_index('date')

# high is the max from Jan1 to current day
df['max'] = df.groupby(df.index.year)['max'].cummax()

# low is the min from Jan1 to current day 
df['min'] = df.groupby(df.index.year)['min'].cummin()

# open is based on the open of the year
for ind, row in df.iterrows():
    row['open'] = df.loc[ind.replace(month=1, day=1), 'open']

# OPTIONAL: reset index
df = df.reset_index()