Python : Extract specific string pattern from a pandas columns and storing into new columns

289 Views Asked by At

I have a Dataframe in the below format:

Ticker
ABAN10OCTFUT
ABAN10SEP700PA
ABAN10SEP720PA
ABAN10SEP740PA
ABAN10SEP760PA

I am trying to extract instrument_name, year, month,strike_price and instrument_type from the ticker column and storing the same in the existing dataframe. I'm able to extract the values but I couldn't able to save the values in the existing dataframe.

The below code is used to extract the values from a dataframe.

for index,row in all_files_tickers.iterrows():
    ticker = row['ticker']
    print(index)
    print(ticker)
    try:
        #Grouped FUT tickers here# ABAN10OCTFUT AREVAT&D10SEPFUT MCDOWELL-N10NOVFUT TV-1810OCTFUT NFTYMCAP5011FEBFUT FTSE10012AUGFUT
        instrument_name,year,month,instrument_type = re.findall(r'([A-Z]{2,10}|[A-Z]{1,6}&[A-Z]{1,6}|[A-Z]{1,6}-[A-Z]{1,6}|[A-Z]{1,6}-[0-9]{1,5}|[A-Z]{1,9}[0-9]{1,2}|[A-Z]{1,6}&[A-Z]{1,6}[0-9]{1,3}|[A-Z]{1,6}[0-9]{1,3})(09|10|11|12|13|14|15|16|17|18|19|20|21)(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)(FUT)',ticker)[0]
    except:
        try:
            #Grouped other tickers here# ABAN10SEP700PA ABAN10SEP780CA AREVAT&D10SEP300CA DISHTV10SEP52.5CA MCDOWELL-N10SEP1550CA TV-1810SEP90CA
            instrument_name,year,month,strike_price,instrument_type = re.findall(r'([A-Z]{2,10}|[A-Z]{1,6}&[A-Z]{1,6}|[A-Z]{1,6}-[A-Z]{1,6}|[A-Z]{1,6}-[0-9]{1,5}|[A-Z]{1,9}[0-9]{1,2}|[A-Z]{1,6}&[A-Z]{1,6}[0-9]{1,3}|[A-Z]{1,6}[0-9]{1,3})(09|10|11|12|13|14|15|16|17|18|19|20|21)(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)([0-9]{1,5}|[0-9]{1,5}.[0-9]{1,3})(PA|CA)',ticker)[0]
        except:
            try:
                #Grouped option tickers here# BANKNIFTY10OCT10500PE BANKNIFTY10OCT11000CE ISPATIND11JAN22.5CE NFTYMCAP5010DEC2600PE S&P50011SEP1100PE
                instrument_name,year,month,strike_price,instrument_type = re.findall(r'([A-Z]{2,10}|[A-Z]{1,6}&[A-Z]{1,6}|[A-Z]{1,6}-[A-Z]{1,6}|[A-Z]{1,6}-[0-9]{1,5}|[A-Z]{1,9}[0-9]{1,2}|[A-Z]{1,6}&[A-Z]{1,6}[0-9]{1,3}|[A-Z]{1,6}[0-9]{1,3})(09|10|11|12|13|14|15|16|17|18|19|20|21)(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)([0-9]{1,5}|[0-9]{1,5}.[0-9]{1,3})(PE|CE)',ticker)[0]
            except:
                try:
                    #ss
                    instrument_name = re.findall(r'([a-z]{1,3})',ticker)[0]
                except:
                    try:
                        #S&P5001SEPFUT
                        instrument_name,year,month,instrument_type = re.findall(r'([A-Z]{1,6}&[A-Z]{1,6}[0-9]{1,3})(1)(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)(FUT)',ticker)[0]
                    except:
                        try:
                            #COALINDIA14130265CE
                            instrument_name,year,month,strike_price,instrument_type = re.findall(r'([A-Z]{2,10})(09|10|11|12|13|14|15|16|17|18|19|20|21)([0-9]{1,3})([0-9]{1,5})(PE|CE)',ticker)[0]
                        except:
                            try:
                                #FTSE100FEBFUT
                                instrument_name,month,instrument_type = re.findall(r'([A-Z]{1,6}[0-9]{1,3})(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)(FUT)',ticker)[0]
                            except:
                                #ADANIPOWER28FEB1927.5PE
                                instrument_name,date,month,strike_price,instrument_type = re.findall(r'([A-Z]{1,10})([0-9]{1,2})(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)([0-9]{1,5}.[0-9]{1,3})(CE|PE)',ticker)[0]

I have tried another method to extract values and save into the datafram but I didn't get the desired output.

def get_instrument_params(ticker):
    instrument_name= re.findall(r'([A-Z]{2,10}|[A-Z]{1,6}&[A-Z]{1,6}|[A-Z]{1,6}-[A-Z]{1,6}|[A-Z]{1,6}-[0-9]{1,5}|[A-Z]{1,9}[0-9]{1,2}|[A-Z]{1,6}&[A-Z]{1,6}[0-9]{1,3}|[A-Z]{1,6}[0-9]{1,3})',ticker)
    year= re.findall(r'(09|10|11|12|13|14|15|16|17|18|19|20|21)',ticker)
    month= re.findall(r'(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)',ticker)
    strike_price = re.findall(r'([0-9]{1,5}|[0-9]{1,5}.[0-9]{1,3})',ticker)
    instrument_type = re.findall(r'(CA|PA|PE|CE)',ticker)
    return " ".join(instrument_name,year,month,strike_price,instrument_type)
all_files_tickers['instrument_name','year','month','strike_price','instrument_type']=all_files_tickers['ticker'].apply(lambda x: get_instrument_params(x))
print(all_files_tickers)

I need the desired output like below:

Ticker instrument_name year month strike_price instrument_type
ABAN10OCTFUT ABAN 10 OCT - FUT
ABAN10SEP700PA ABAN 10 SEP 700 PA
ABAN10SEP720PA ABAN 10 SEP 720 PA
ABAN10SEP740PA ABAN 10 SEP 740 PA
ABAN10SEP760PA ABAN 10 SEP 760 PA

Thanks

0

There are 0 best solutions below