I am using the jupyter/scipy-notebook image, with this dask version:
>>> dask.__version__
'2023.9.3'
When reading this Parquet file with dask.dataframe.read_parquet, I get an error. The call
mme = dask.dataframe.read_parquet(mme_path, storage_options=S3_CREDENTIALS)
raises the following exception:
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
File /opt/conda/lib/python3.11/site-packages/dask/backends.py:136, in CreationDispatch.register_inplace.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
135 try:
--> 136 return func(*args, **kwargs)
137 except Exception as e:
File /opt/conda/lib/python3.11/site-packages/dask/dataframe/io/parquet/core.py:543, in read_parquet(path, columns, filters, categories, index, storage_options, engine, use_nullable_dtypes, dtype_backend, calculate_divisions, ignore_metadata_file, metadata_task_size, split_row_groups, blocksize, aggregate_files, parquet_file_extension, filesystem, **kwargs)
541 blocksize = None
--> 543 read_metadata_result = engine.read_metadata(
544 fs,
545 paths,
546 categories=categories,
547 index=index,
548 use_nullable_dtypes=use_nullable_dtypes,
549 dtype_backend=dtype_backend,
550 gather_statistics=calculate_divisions,
551 filters=filters,
552 split_row_groups=split_row_groups,
553 blocksize=blocksize,
554 aggregate_files=aggregate_files,
555 ignore_metadata_file=ignore_metadata_file,
556 metadata_task_size=metadata_task_size,
557 parquet_file_extension=parquet_file_extension,
558 dataset=dataset_options,
559 read=read_options,
560 **other_options,
561 )
563 # In the future, we may want to give the engine the
564 # option to return a dedicated element for `common_kwargs`.
565 # However, to avoid breaking the API, we just embed this
566 # data in the first element of `parts` for now.
567 # The logic below is inteded to handle backward and forward
568 # compatibility with a user-defined engine.
File /opt/conda/lib/python3.11/site-packages/dask/dataframe/io/parquet/arrow.py:532, in ArrowDatasetEngine.read_metadata(cls, fs, paths, categories, index, use_nullable_dtypes, dtype_backend, gather_statistics, filters, split_row_groups, blocksize, aggregate_files, ignore_metadata_file, metadata_task_size, parquet_file_extension, **kwargs)
531 # Stage 1: Collect general dataset information
--> 532 dataset_info = cls._collect_dataset_info(
533 paths,
534 fs,
535 categories,
536 index,
537 gather_statistics,
538 filters,
539 split_row_groups,
540 blocksize,
541 aggregate_files,
542 ignore_metadata_file,
543 metadata_task_size,
544 parquet_file_extension,
545 kwargs,
546 )
548 # Stage 2: Generate output `meta`
File /opt/conda/lib/python3.11/site-packages/dask/dataframe/io/parquet/arrow.py:1047, in ArrowDatasetEngine._collect_dataset_info(cls, paths, fs, categories, index, gather_statistics, filters, split_row_groups, blocksize, aggregate_files, ignore_metadata_file, metadata_task_size, parquet_file_extension, kwargs)
1046 if ds is None:
-> 1047 ds = pa_ds.dataset(
1048 paths,
1049 filesystem=_wrapped_fs(fs),
1050 **_processed_dataset_kwargs,
1051 )
1053 # Get file_frag sample and extract physical_schema
File /opt/conda/lib/python3.11/site-packages/pyarrow/dataset.py:776, in dataset(source, schema, format, filesystem, partitioning, partition_base_dir, exclude_invalid_files, ignore_prefixes)
775 if all(_is_path_like(elem) for elem in source):
--> 776 return _filesystem_dataset(source, **kwargs)
777 elif all(isinstance(elem, Dataset) for elem in source):
File /opt/conda/lib/python3.11/site-packages/pyarrow/dataset.py:466, in _filesystem_dataset(source, schema, filesystem, partitioning, format, partition_base_dir, exclude_invalid_files, selector_ignore_prefixes)
464 factory = FileSystemDatasetFactory(fs, paths_or_selector, format, options)
--> 466 return factory.finish(schema)
File /opt/conda/lib/python3.11/site-packages/pyarrow/_dataset.pyx:2941, in pyarrow._dataset.DatasetFactory.finish()
File /opt/conda/lib/python3.11/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
File /opt/conda/lib/python3.11/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status()
ArrowInvalid: Unable to merge: Field dt has incompatible types: string vs int32
The above exception was the direct cause of the following exception:
ArrowInvalid Traceback (most recent call last)
Cell In[51], line 1
----> 1 mme = dask.dataframe.read_parquet(
2 mme_path, storage_options=S3_CREDENTIALS)
File /opt/conda/lib/python3.11/site-packages/dask/backends.py:138, in CreationDispatch.register_inplace.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
136 return func(*args, **kwargs)
137 except Exception as e:
--> 138 raise type(e)(
139 f"An error occurred while calling the {funcname(func)} "
140 f"method registered to the {self.backend} backend.\n"
141 f"Original Message: {e}"
142 ) from e
ArrowInvalid: An error occurred while calling the read_parquet method registered to the pandas backend.
Original Message: Unable to merge: Field dt has incompatible types: string vs int32
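If I read the error correctly, pyarrow is discovering more than one fragment under mme_path (or a partition directory named dt) and cannot unify their schemas, because dt comes out as string in some places and int32 in others. Below is a rough sketch of how I would try to confirm that; I cannot post the real output, and I am assuming the keys in S3_CREDENTIALS are valid s3fs.S3FileSystem arguments and that the fragments end in .parquet:

import pyarrow.parquet as pq
import s3fs

fs = s3fs.S3FileSystem(**S3_CREDENTIALS)

# Print the Arrow type that every fragment under the dataset path
# declares for the "dt" column.
for path in fs.glob(f"{mme_path.rstrip('/')}/**/*.parquet"):
    with fs.open(path, "rb") as f:
        schema = pq.ParquetFile(f).schema_arrow
        if "dt" in schema.names:
            print(path, schema.field("dt").type)
        else:
            print(path, "no dt in the file schema (partition column?)")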
Reading the same file using pandas works:
>>> mme = pd.read_parquet(mme_path, storage_options=S3_CREDENTIALS)
>>> mme.dt.dtype
dtype('O')
>>> pd.__version__
'2.1.1'
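I cannot share the output of a value-level check, but something like this would show what kind of objects pandas puts in that column:

>>> mme["dt"].map(type).value_counts()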
Unfortunately I cannot share the Parquet file itself, but it looks like there is something peculiar about it. I need to use dask because this file is just one of many that have to be loaded.
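If there is no cleaner fix, the workaround I am considering is to hand pyarrow an explicit schema instead of letting it merge the per-fragment schemas, via the dataset= options that read_parquet forwards to pyarrow.dataset.dataset(). This is only a sketch and untested on the real data; the schema below is a placeholder because I cannot share the actual column list, and in practice it would need to name every column in the files:

import pyarrow as pa
import dask.dataframe as dd

# Placeholder schema: force "dt" to be read as a string everywhere.
# The real schema would have to list all of the other columns as well.
forced_schema = pa.schema([("dt", pa.string())])

mme = dd.read_parquet(
    mme_path,
    storage_options=S3_CREDENTIALS,
    dataset={"schema": forced_schema},  # passed through to pyarrow.dataset.dataset()
)

Failing that, I could fall back to reading each file with pandas inside dask.delayed and stitching the pieces together with dask.dataframe.from_delayed, but I would rather understand why dask/pyarrow and plain pandas disagree here.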