break down pandas dataframe column into multiple columns in a single df

1.2k Views Asked by At

I have a dataframe with index, features and time data, but the time data is in one column like this:

id date feature

1 date1 feature1

2 date2 feature2

1 date2 feature3

I want to transform it into this:

id date feature

1 date1 feature1 date2 feature3

2 date2 feature2 NaN NaN

Already did this by explicitly defining dataframes, queries and joins, but failed to find a dynamic way. What I wrote:

df = pd.read_excel('some path')

import pandas as pd

list1 = []
list2 = []
list3 = []

def placeholder_lists():
    for i in range(7):
        if len(str(i)) == 1:
            if i not in [8,9]:
                i = "0"+str(i+3)
            else:
                i = str(i+3)
        else:
            i = str(i+3)
        list1.append(i)

    for l in range(7):
        if len(str(l)) == 1:
            if l not in [10,9]:
                l = "0"+str(l+2)
            else:
                l = str(l+2)
        else:
            l = str(l+2)
        list2.append(l)

    for g in range(7):
        if len(str(g)) == 1:
            if g not in [9,8]:
                g = "0"+str(g+1)
            else:
                g = str(g+1)
        else:
            g = str(g+1)
        list3.append(g)

placeholder_lists()

for m,n,u in zip(list1,list2, list3):
    df01 = df.query('dw_creation_date == "01-AUG-17" ')
    e = str(u)+"-AUG-17"

    currentdf = df.query('dw_creation_date == "%s"' % e)

    if 1 == "01":
        currentdf = df01
    first = "df"+m
    second = "df"+n
    listie = range(50)
    first = second.join(currentdf.set_index('unique_identifier'), on='unique_identifier', lsuffix = listie[n])

... And the error I get:

first = second.join(currentdf.set_index('unique_identifier'), lsuffix = listie[n])

TypeError: join() takes no keyword arguments

Any ideas?

1

There are 1 best solutions below

0
On BEST ANSWER
cols = ['id','date','feature']
df = pd.DataFrame({'date': {0: 'date1', 1: 'date2', 2: 'date2'}, 
                   'id': {0: 1, 1: 2, 2: 1}, 
                  'feature': {0: 'feature1', 1: 'feature2', 2: 'feature3'}}, columns=cols)

print (df)
   id   date   feature
0   1  date1  feature1
1   2  date2  feature2
2   1  date2  feature3

You can groupby by id and apply new df. Then reshape by unstack and sort columns by sort_index in second level of Multiindex.

Last flattening Multiindex in columns and reset_index.

df = df.groupby('id')['date','feature'] \
       .apply(lambda x: pd.DataFrame(x.values, columns=['feature','date'])) \
       .unstack() \
       .sort_index(1, level=1)

print (df)
   feature      date feature      date
         0         0       1         1
id                                    
1    date1  feature1   date2  feature3
2    date2  feature2    None      None


df.columns = ['{0[0]}_{0[1]}'.format(x)  for x in df.columns]
df = df.reset_index()
print (df)
   id feature_0    date_0 feature_1    date_1
0   1     date1  feature1     date2  feature3
1   2     date2  feature2      None      None