Error about field having incompatible type when reading parquet with dask.dataframe.read_parquet

I am using the jupyter/scipy-notebook Docker image, with the following dask version:

>>> dask.__version__
'2023.9.3'

When reading a particular parquet file with dask.dataframe.read_parquet:

mme = dask.dataframe.read_parquet(mme_path, storage_options=S3_CREDENTIALS)

I get the following error:

---------------------------------------------------------------------------
ArrowInvalid                              Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/dask/backends.py:136, in CreationDispatch.register_inplace.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    135 try:
--> 136     return func(*args, **kwargs)
    137 except Exception as e:

File /opt/conda/lib/python3.11/site-packages/dask/dataframe/io/parquet/core.py:543, in read_parquet(path, columns, filters, categories, index, storage_options, engine, use_nullable_dtypes, dtype_backend, calculate_divisions, ignore_metadata_file, metadata_task_size, split_row_groups, blocksize, aggregate_files, parquet_file_extension, filesystem, **kwargs)
    541     blocksize = None
--> 543 read_metadata_result = engine.read_metadata(
    544     fs,
    545     paths,
    546     categories=categories,
    547     index=index,
    548     use_nullable_dtypes=use_nullable_dtypes,
    549     dtype_backend=dtype_backend,
    550     gather_statistics=calculate_divisions,
    551     filters=filters,
    552     split_row_groups=split_row_groups,
    553     blocksize=blocksize,
    554     aggregate_files=aggregate_files,
    555     ignore_metadata_file=ignore_metadata_file,
    556     metadata_task_size=metadata_task_size,
    557     parquet_file_extension=parquet_file_extension,
    558     dataset=dataset_options,
    559     read=read_options,
    560     **other_options,
    561 )
    563 # In the future, we may want to give the engine the
    564 # option to return a dedicated element for `common_kwargs`.
    565 # However, to avoid breaking the API, we just embed this
    566 # data in the first element of `parts` for now.
    567 # The logic below is inteded to handle backward and forward
    568 # compatibility with a user-defined engine.

File /opt/conda/lib/python3.11/site-packages/dask/dataframe/io/parquet/arrow.py:532, in ArrowDatasetEngine.read_metadata(cls, fs, paths, categories, index, use_nullable_dtypes, dtype_backend, gather_statistics, filters, split_row_groups, blocksize, aggregate_files, ignore_metadata_file, metadata_task_size, parquet_file_extension, **kwargs)
    531 # Stage 1: Collect general dataset information
--> 532 dataset_info = cls._collect_dataset_info(
    533     paths,
    534     fs,
    535     categories,
    536     index,
    537     gather_statistics,
    538     filters,
    539     split_row_groups,
    540     blocksize,
    541     aggregate_files,
    542     ignore_metadata_file,
    543     metadata_task_size,
    544     parquet_file_extension,
    545     kwargs,
    546 )
    548 # Stage 2: Generate output `meta`

File /opt/conda/lib/python3.11/site-packages/dask/dataframe/io/parquet/arrow.py:1047, in ArrowDatasetEngine._collect_dataset_info(cls, paths, fs, categories, index, gather_statistics, filters, split_row_groups, blocksize, aggregate_files, ignore_metadata_file, metadata_task_size, parquet_file_extension, kwargs)
   1046 if ds is None:
-> 1047     ds = pa_ds.dataset(
   1048         paths,
   1049         filesystem=_wrapped_fs(fs),
   1050         **_processed_dataset_kwargs,
   1051     )
   1053 # Get file_frag sample and extract physical_schema

File /opt/conda/lib/python3.11/site-packages/pyarrow/dataset.py:776, in dataset(source, schema, format, filesystem, partitioning, partition_base_dir, exclude_invalid_files, ignore_prefixes)
    775 if all(_is_path_like(elem) for elem in source):
--> 776     return _filesystem_dataset(source, **kwargs)
    777 elif all(isinstance(elem, Dataset) for elem in source):

File /opt/conda/lib/python3.11/site-packages/pyarrow/dataset.py:466, in _filesystem_dataset(source, schema, filesystem, partitioning, format, partition_base_dir, exclude_invalid_files, selector_ignore_prefixes)
    464 factory = FileSystemDatasetFactory(fs, paths_or_selector, format, options)
--> 466 return factory.finish(schema)

File /opt/conda/lib/python3.11/site-packages/pyarrow/_dataset.pyx:2941, in pyarrow._dataset.DatasetFactory.finish()

File /opt/conda/lib/python3.11/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File /opt/conda/lib/python3.11/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status()

ArrowInvalid: Unable to merge: Field dt has incompatible types: string vs int32

The above exception was the direct cause of the following exception:

ArrowInvalid                              Traceback (most recent call last)
Cell In[51], line 1
----> 1 mme = dask.dataframe.read_parquet(
      2     mme_path, storage_options=S3_CREDENTIALS)

File /opt/conda/lib/python3.11/site-packages/dask/backends.py:138, in CreationDispatch.register_inplace.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    136     return func(*args, **kwargs)
    137 except Exception as e:
--> 138     raise type(e)(
    139         f"An error occurred while calling the {funcname(func)} "
    140         f"method registered to the {self.backend} backend.\n"
    141         f"Original Message: {e}"
    142     ) from e

ArrowInvalid: An error occurred while calling the read_parquet method registered to the pandas backend.
Original Message: Unable to merge: Field dt has incompatible types: string vs int32
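
If I read the final ArrowInvalid correctly, the failure happens during pyarrow's dataset discovery: pyarrow unifies the schemas of all files (and any inferred partition columns) into one, and dt is apparently declared as string in some places and int32 in others. A minimal sketch of that unification step failing, using pure pyarrow and two hypothetical single-column schemas:

import pyarrow as pa

# Two schemas that disagree on the type of "dt", mimicking the
# mismatch reported in the traceback.
string_schema = pa.schema([pa.field("dt", pa.string())])
int_schema = pa.schema([pa.field("dt", pa.int32())])

# Raises pyarrow.lib.ArrowInvalid:
#   Unable to merge: Field dt has incompatible types: string vs int32
pa.unify_schemas([string_schema, int_schema])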

Reading the same file using pandas works:

>>> mme = pd.read_parquet(mme_path, storage_options=S3_CREDENTIALS)
>>> mme.dt.dtype
dtype('O')

>>> pd.__version__
'2.1.1'
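
To narrow down where the mismatch comes from, here is a sketch of how one could inspect each file's footer schema directly. It assumes mme_path points at a directory of *.parquet files on S3 and that S3_CREDENTIALS holds keyword arguments s3fs accepts; if mme_path is a single file, the conflict may instead be between the file schema and a partition column inferred from the path:

import pyarrow.parquet as pq
import s3fs

fs = s3fs.S3FileSystem(**S3_CREDENTIALS)

# Print the declared type of "dt" in every file footer, to see which
# files disagree. If "dt" never appears in a footer, it is probably a
# hive partition column whose values pyarrow infers from the
# directory names.
for path in fs.glob(f"{mme_path.rstrip('/')}/*.parquet"):
    with fs.open(path, "rb") as f:
        schema = pq.read_schema(f)
    dt_type = schema.field("dt").type if "dt" in schema.names else "<not in file schema>"
    print(path, dt_type)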

Unfortunately I cannot share the parquet file, but there seems to be something peculiar about it. I need to use dask because this file is only one of many that have to be loaded.
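
A workaround I am considering (untested): hand pyarrow an explicit schema so dataset discovery does not need to unify the conflicting file schemas. Judging from the traceback, dask forwards the options in its dataset keyword through to pyarrow.dataset.dataset(), which accepts a schema argument; the schema below is hypothetical and would need the real column list filled in:

import pyarrow as pa
import dask.dataframe as dd

# Hypothetical schema: force "dt" to string and list the remaining
# columns of the dataset explicitly.
explicit_schema = pa.schema([
    pa.field("dt", pa.string()),
    # ... the other columns of the dataset go here
])

mme = dd.read_parquet(
    mme_path,
    storage_options=S3_CREDENTIALS,
    dataset={"schema": explicit_schema},
)

Is supplying a schema this way the right approach, or is there a better way to handle files whose schemas disagree?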
