Reading multiple Polars dataframes from a single CSV file

328 views

I have a CSV file that has multiple tables of data, each one with a header row and some number of data rows, and then a line break before the next one. I'd like to make a list of Polars dataframes from this.

I couldn't find how to do this without using Python's text-file parsing. It seems like scan_csv covers a similar case, but that requires many different CSVs and I'm not sure where to look to make it work on one.

1

There is 1 best solution below

2
On

It's pretty manual. I've done something similar so here it is.

# Lazily scan the whole file as a single utf8 column, one raw line per row.
# Using chr(0) (NUL) as the separator means commas are NOT treated as
# delimiters yet — splitting happens later, per chunk, which lets each
# child dataframe have a different number of columns.
csvlf = pl.scan_csv(
    'multdf.csv',
    has_header=False,        # no col names
    infer_schema_length=0,   # everything to utf types
    separator=chr(0),        # NUL never appears, so each line stays intact
).with_columns(i=pl.int_range(0, pl.count()))  # attach a row index

# find breakpoints to loop through
# Row indices of the blank separator lines — each chunk lives strictly
# between two of them. Blank lines may be truly empty (null) or contain
# only comma placeholders.
blank_rows = (
    csvlf
    .filter(
        pl.col('column_1').is_null()
        | (pl.col('column_1').str.replace_all(',', '').str.len_chars() == 0)
    )
    .select('i')
    .collect()
    .to_series()
    .to_list()
)
total_rows = csvlf.select(pl.count()).collect().item()
# Sentinels: -1 before the first chunk (chunks start AFTER a breakpoint)
# and the total row count after the last chunk.
breakpoints = [-1, *blank_rows, total_rows]

# make a helper function to rename and cast chunks
# make a helper function to rename and cast chunks
def make_df(df):
    """Turn one raw all-utf8 chunk into a typed, named dataframe.

    Drops columns whose every value is empty, tries to cast each
    remaining column to Int64 then Float64 (leaving it as utf8 when both
    casts fail), then promotes the chunk's first row to column names.

    Parameters: df - a polars DataFrame whose columns are all utf8.
    Returns: the cleaned DataFrame with the header row removed.
    """
    for col in df.columns:
        # A column with no non-empty value carried no data for this chunk.
        if df.filter(pl.col(col) != '').shape[0] == 0:
            df = df.drop(col)
            continue
        # Try numeric casts; a failed cast raises, so fall through.
        # NOTE: narrowed from bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed.
        try:
            df = df.with_columns(pl.col(col).cast(pl.Int64()))
        except Exception:
            try:
                df = df.with_columns(pl.col(col).cast(pl.Float64()))
            except Exception:
                pass  # stays utf8
    # The first remaining row holds this chunk's header values.
    dfcols = next(df.iter_rows())
    df = df[1:]
    # new column name will have its original position
    # appended to it to protect against duplicate names
    df.columns = [f"{x}_{i}" for i, x in enumerate(dfcols)]
    return df

# loop through the chunks and make dfs list
# Build one dataframe per chunk. Pair consecutive breakpoints with zip
# instead of indexing range(len(...)-1), and use a list comprehension
# instead of a manual append loop (pure construction).
dfs = [
    csvlf
    # rows strictly between the two surrounding blank-line breakpoints
    .filter(pl.col('i').is_between(start + 1, stop - 1))
    # now split each raw line on commas and fan out into real columns
    .select(pl.col('column_1').str.split(',').list.to_struct())
    .unnest('column_1')
    .collect()
    .pipe(make_df)
    for start, stop in zip(breakpoints, breakpoints[1:])
]