When using load_dataset() to load a Mozilla Common Voice (v11) dataset, the resulting dataset (ds) returns each audio['array'] as a NumPy array. I don't know how to reproduce this in a dataset I build myself.

How do you set just one feature as ndarrays?

In examining Common Voice:

> tt = load_dataset(
    data_args.dataset_name,
    data_args.dataset_config_name,
    split=f'{data_args.train_split_name}[:15%]',  # Load only the first %
    cache_dir=model_args.cache_dir,
    token=model_args.token,
)
> type(tt.select([0])['audio'][0]['path'])
<class 'str'>
> type(tt.select([0])['audio'][0]['array'])
<class 'numpy.ndarray'>
> type(tt.select([0])['path'][0]) # They repeat paths as a top level feature
<class 'str'>

But in my own code I can't store NumPy arrays, EXCEPT I found ds = ds.with_format('np'), which does give NumPy arrays in the re-loaded dataset, but then ALL top-level features end up as NumPy data types (see the full code to test/reproduce below):

> type(test_ds['path'][0])
<class 'numpy.str_'>

I only need the "audio -> array" data to be 1d numpy arrays.

Here's test code to create a dataset and reload it to examine types:

#!/usr/bin/env python
# Trying to save and reload a numpy array to/from a huggingface dataset
# The type of the loaded array must be a numpy array()
from datasets import Dataset, Features, Array2D, Sequence, Value
import numpy as np

# Two synthetic "audio clips": 16000 random samples each.
audio_arrays = [np.random.rand(16000), np.random.rand(16000)] 

# Intended schema for the dataset.
# NOTE(review): `features` is never passed to Dataset.from_dict() below, so
# this schema does not appear to be applied when the dataset is built.
features = Features({
  # Each audio contains a np array of audio data, and a path to the src audio file
  'audio': Sequence({
    #'array': Sequence(feature=Array2D(shape=(None,), dtype="float32")),
    'array': Sequence(feature=Value('float32')),
    'path': Value('string'),
  }),
  'path': Value('string'), # Path is redundant in common voice set also
})

# Column-oriented data: one list per top-level feature.
ddata = {
    'path': [],        # This will be a list of strings
    'audio': [],       # This will be a list of dictionaries
}

# The top-level paths deliberately differ from the nested audio paths
# ('/foo0/' vs '/foo1/') so the two can be told apart after reloading.
ddata['path'] = ['/foo0/', '/bar0/'] # # ensures we see storage difference
ddata['audio'] = [
        {'array': audio_arrays[0], 'path': '/foo1/' },
        {'array': audio_arrays[1], 'path': '/bar1/', },
]
ds = Dataset.from_dict(ddata)
# Request numpy-formatted output; per the question text this affects every
# top-level feature, not only the audio arrays.
ds = ds.with_format('np')
ds.save_to_disk('/tmp/ds.ds') 

# Reload from disk and report the Python types that come back.
loaded_dataset = Dataset.load_from_disk('/tmp/ds.ds')
ld = loaded_dataset
au = ld['audio'][0]  # not used below; left for inspection in the debugger
ar = ld['audio'][0]['array']
print("Type of audio array:", type(ar))
print("Type of path:", type(ld['path'][0]))
print("Type of au path:", type(ld['audio'][0]['path']))
# Drop into an interactive debugger to inspect the loaded objects by hand.
import ipdb; ipdb.set_trace(context=16); pass
1

There is 1 best solution below

0
On

Got it. Output is:

{'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None)}
{'path': '/foo', 'array': array([0.31222534, 0.04180908, 0.84359741, ..., 0.01086426, 0.37417603,
0.14474487]), 'sampling_rate': 16000}
Saving the dataset (1/1 shards): 100%|███████████████████████████| 2/2 [00:00<00:00, 984.58 examples/s]
Type of path: <class 'str'>
Type of audio array: <class 'numpy.ndarray'>
Type of audio.path: <class 'str'>

Note: audio arrays stored as raw bytes will fail with an invalid-format error when accessed, so we use soundfile to convert them into complete, properly formatted WAV file byte representations.

Code to test/reproduce:

#!/usr/bin/env python
# Store audio snippets, without actual associated files, as dataset Audio() types.
# When loaded, the ['audio']['array'] items will be numpy ndarrays
from datasets import Dataset, Features, Array2D, Sequence, Value, Audio
import numpy as np
import sys
import soundfile as sf
import io

# Reference:
# Input: The Audio feature accepts as input:
#     A str: Absolute path to the audio file (i.e. random access is allowed).
#     A dict with the keys:
#         path: String with relative path of the audio file to the archive file.
#         bytes: Bytes content of the audio file.

# Convert the NumPy arrays to audio bytes in WAV format
def numpy_to_bytes(audio_array, sampling_rate=16000):
    """Encode audio samples as the bytes of a WAV file held in memory.

    Args:
        audio_array: Audio samples to encode; passed straight to
            ``soundfile.write``.
        sampling_rate: Sample rate written into the WAV data (default 16000).

    Returns:
        The full contents of the in-memory WAV file as ``bytes``.
    """
    wav_buffer = io.BytesIO()
    try:
        # soundfile writes into the in-memory buffer as if it were a file.
        sf.write(wav_buffer, audio_array, samplerate=sampling_rate, format='WAV')
        wav_bytes = wav_buffer.getvalue()
    finally:
        wav_buffer.close()
    return wav_bytes

# Two synthetic clips of different lengths, stored as float32 samples.
audio_arrays = [np.random.rand(10000).astype('float32'), np.random.rand(8300).astype('float32')]
# The Audio feature is fed encoded file bytes here (not raw sample arrays),
# so each clip is first converted to WAV bytes.
audio_bytes = [numpy_to_bytes(audio_array) for audio_array in audio_arrays]

features = Features({
  'path': Value('string'), # Path is redundant in common voice set also
  'audio': Audio(sampling_rate=16000),
})

# Column-oriented data: one list per top-level feature. Each audio entry is a
# dict with the encoded file bytes and a (here fictitious) path.
ddata = {
        'path': ['/foo', '/bar'],
        'audio': [
            {'bytes': audio_bytes[0], 'path': '/foo', },
            {'bytes': audio_bytes[1], 'path': '/bar', },
        ],
}
# Passing `features` makes the 'audio' column an Audio() feature.
# Alternative tried: Dataset.from_dict(ddata).cast_column("audio", Audio())
# ds.with_format('np') is left out on purpose: it is the approach that turned
# every top-level feature into a numpy type.
ds = Dataset.from_dict(ddata, features=features)
print(ds.features)
print(ds[0]['audio'])
ds.save_to_disk('/tmp/ds.ds')

# Reload from disk and report the Python types that come back for one row.
loaded_dataset = Dataset.load_from_disk('/tmp/ds.ds')
ld = loaded_dataset[0]
ar = ld['audio']['array']
print("Type of path:", type(ld['path']))
print("Type of audio array:", type(ar))
print("Type of audio.path:", type(ld['audio']['path']))
# Drop into an interactive debugger to inspect the loaded objects by hand.
import ipdb; ipdb.set_trace(context=16); pass