I have been working with data that includes both electrons and muons. For this analysis I need to find opposite-sign, same-flavour dilepton pairs, i.e. e+e- or mu+mu-. However, I run into problems after I find these pairs, concatenate them into a single dilepton array, and then try to save the resulting array to a Parquet file. Note that I am using Awkward 1.10.3.
A simple example shows the problem that I am running into:
import awkward as ak
import vector

vector.register_awkward()

# One toy event with two leptons of each flavour; note that the per-flavour
# ID fields differ: electrons carry 'MVAid', muons carry 'tightId'.
elec = ak.Array([[{'pt': 1, 'phi': 2.2, 'eta': 1.5, 'mass': 1, 'MVAid': True},
                  {'pt': 1, 'phi': 2.2, 'eta': 1.5, 'mass': 1, 'MVAid': True}]])
mu = ak.Array([[{'pt': 2, 'phi': 2.2, 'eta': 2.5, 'mass': 2, 'tightId': True},
                {'pt': 2, 'phi': 2.2, 'eta': 2.5, 'mass': 2, 'tightId': True}]])

electrons_4V = ak.Array(elec, with_name="Momentum4D")
ee_pairs = ak.combinations(electrons_4V, 2, fields=["LeadLepton", "SubleadLepton"])

muons_4V = ak.Array(mu, with_name="Momentum4D")
mm_pairs = ak.combinations(muons_4V, 2, fields=["LeadLepton", "SubleadLepton"])

dileptons = ak.concatenate([ee_pairs, mm_pairs], axis=1)
ak.to_parquet(dileptons, 'test.parquet')
This then leads to the error:
---------------------------------------------------------------------------
ArrowNotImplementedError Traceback (most recent call last)
Cell In[1], line 20
17 dileptons = ak.concatenate([ee_pairs, mm_pairs], axis = 1)
18 dileptons['Dilepton'] = dileptons.LeadLepton + dileptons.SubleadLepton
---> 20 ak.to_parquet(dileptons, 'test.parquet')
File ~/miniconda3/envs/idm/lib/python3.10/site-packages/awkward/operations/ak_to_parquet.py:295, in to_parquet(array, destination, list_to32, string_to32, bytestring_to32, emptyarray_to, categorical_as_dictionary, extensionarray, count_nulls, compression, compression_level, row_group_size, data_page_size, parquet_flavor, parquet_version, parquet_page_version, parquet_metadata_statistics, parquet_dictionary_encoding, parquet_byte_stream_split, parquet_coerce_timestamps, parquet_old_int96_timestamps, parquet_compliant_nested, parquet_extra_options, storage_options)
293 fs, destination = fsspec.core.url_to_fs(destination, **(storage_options or {}))
294 metalist = []
--> 295 with pyarrow_parquet.ParquetWriter(
296 destination,
297 table.schema,
298 filesystem=fs,
299 flavor=parquet_flavor,
300 version=parquet_version,
301 use_dictionary=parquet_dictionary_encoding,
302 compression=compression,
303 write_statistics=parquet_metadata_statistics,
304 use_deprecated_int96_timestamps=parquet_old_int96_timestamps,
305 compression_level=compression_level,
306 use_byte_stream_split=parquet_byte_stream_split,
307 data_page_version=parquet_page_version,
308 use_compliant_nested_type=parquet_compliant_nested,
309 data_page_size=data_page_size,
310 coerce_timestamps=parquet_coerce_timestamps,
311 metadata_collector=metalist,
312 **parquet_extra_options,
313 ) as writer:
314 writer.write_table(table, row_group_size=row_group_size)
315 meta = metalist[0]
File ~/miniconda3/envs/idm/lib/python3.10/site-packages/pyarrow/parquet/core.py:1001, in ParquetWriter.__init__(self, where, schema, filesystem, flavor, version, use_dictionary, compression, write_statistics, use_deprecated_int96_timestamps, compression_level, use_byte_stream_split, column_encoding, writer_engine_version, data_page_version, use_compliant_nested_type, encryption_properties, write_batch_size, dictionary_pagesize_limit, store_schema, **options)
999 self._metadata_collector = options.pop('metadata_collector', None)
1000 engine_version = 'V2'
-> 1001 self.writer = _parquet.ParquetWriter(
1002 sink, schema,
1003 version=version,
1004 compression=compression,
1005 use_dictionary=use_dictionary,
1006 write_statistics=write_statistics,
1007 use_deprecated_int96_timestamps=use_deprecated_int96_timestamps,
1008 compression_level=compression_level,
1009 use_byte_stream_split=use_byte_stream_split,
1010 column_encoding=column_encoding,
1011 writer_engine_version=engine_version,
1012 data_page_version=data_page_version,
1013 use_compliant_nested_type=use_compliant_nested_type,
1014 encryption_properties=encryption_properties,
1015 write_batch_size=write_batch_size,
1016 dictionary_pagesize_limit=dictionary_pagesize_limit,
1017 store_schema=store_schema,
1018 **options)
1019 self.is_open = True
File ~/miniconda3/envs/idm/lib/python3.10/site-packages/pyarrow/_parquet.pyx:1754, in pyarrow._parquet.ParquetWriter.__cinit__()
File ~/miniconda3/envs/idm/lib/python3.10/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
File ~/miniconda3/envs/idm/lib/python3.10/site-packages/pyarrow/error.pxi:121, in pyarrow.lib.check_status()
ArrowNotImplementedError: Unhandled type for Arrow to Parquet schema conversion: dense_union<0: extension<awkward<AwkwardArrowType>> not null=0, 1: extension<awkward<AwkwardArrowType>> not null=1>
This could also be an Arrow issue rather than an Awkward one. It also seems that the problem comes down to the elec and mu arrays having different fields (MVAid for electrons, tightId for muons), which makes dileptons a union array. I'm unsure whether there is a way to get this to save, or whether I should just keep the two flavours in separate arrays.
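As a sanity check (just a sketch using the toy arrays above, and it throws away the ID flags I actually want to keep): if I restrict both flavours to their shared kinematic fields before pairing, the concatenation is a plain record array rather than a union, and the write goes through:

# Restricting both flavours to their common fields avoids the union type;
# 'common' is just the shared kinematics from the toy example above.
common = ["pt", "phi", "eta", "mass"]
ee_common = ak.combinations(elec[common], 2, fields=["LeadLepton", "SubleadLepton"])
mm_common = ak.combinations(mu[common], 2, fields=["LeadLepton", "SubleadLepton"])
same_fields = ak.concatenate([ee_common, mm_common], axis=1)
ak.to_parquet(same_fields, 'test_common.parquet')  # no error here

So the union really does seem to be the trigger, but I'd rather not drop MVAid/tightId just to be able to write the file.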
Also, if I don't include any of the vector stuff I still get the same error:
import awkward as ak

elec = ak.Array([[{'pt': 1, 'phi': 2.2, 'eta': 1.5, 'mass': 1, 'MVAid': True},
                  {'pt': 1, 'phi': 2.2, 'eta': 1.5, 'mass': 1, 'MVAid': True}]])
mu = ak.Array([[{'pt': 2, 'phi': 2.2, 'eta': 2.5, 'mass': 2, 'tightId': True},
                {'pt': 2, 'phi': 2.2, 'eta': 2.5, 'mass': 2, 'tightId': True}]])

ee_pairs = ak.combinations(elec, 2, fields=["LeadLepton", "SubleadLepton"])
mm_pairs = ak.combinations(mu, 2, fields=["LeadLepton", "SubleadLepton"])

dileptons = ak.concatenate([ee_pairs, mm_pairs], axis=1)
ak.to_parquet(dileptons, 'test.parquet')
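For what it's worth, a quick check in my session shows the union directly in the type of the concatenated array:

print(dileptons.type)  # prints a 'union[...]' of the two record types,
                       # one carrying MVAid and one carrying tightId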
Thanks!
T