pyarrow not able to handle nulls for required fields when writing to parquet files

122 Views Asked by At

I am trying to create a pyarrow table and then write it to a Parquet file.

def test_pyarow():
    """Write a small two-column string table to Parquet, read it back, print it.

    Both fields ('id' and 'name') are declared with ``nullable=False``, yet the
    'name' column holds a ``None`` in its second position. The table is written
    to ``test_arrow.parquet`` (relative path, SNAPPY compression) and then read
    back with pandas from a hard-coded absolute path.

    NOTE(review): the write path is relative while the read path is absolute,
    so this presumably relies on the working directory being that folder.
    """
    import pyarrow as pa
    import pyarrow.parquet
    import pandas as pd

    # Schema: two string columns, both marked as required (non-nullable).
    schema = pa.schema([
        pa.field('id', pa.string(), nullable=False),
        pa.field('name', pa.string(), nullable=False),
    ])

    id_column = pa.array(['10', '11', '12', '13'])
    # Second entry is None even though the 'name' field is non-nullable.
    name_column = pa.array(['AAA', None, 'BBB', 'CCC'])
    arrow_table = pa.Table.from_arrays([id_column, name_column], schema=schema)

    pyarrow.parquet.write_table(
        arrow_table,
        'test_arrow.parquet',
        compression='SNAPPY',
        use_compliant_nested_type=True,
    )

    # Read the file back through pandas using the pyarrow engine.
    frame = pd.read_parquet(
        "/Users/fki/Documents/git/Demo/bq_api/test_arrow.parquet",
        engine='pyarrow',
    )
    print("\n\n\n")
    print(frame)

When `nullable` is True for both fields, the output is as expected:

   id  name
0  10   AAA
1  11  None
2  12   BBB
3  13   CCC

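When `nullable` is False, the null in `name` disappears and the remaining values shift up and wrap around, so the rows no longer line up: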

   id name
0  10  AAA
1  11  BBB
2  12  CCC
3  13  AAA
0

There are 0 best solutions below