RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 1, 2, 160000]


I have a PyTorch training script for transcribing audio files. I have two audio files; one of them has shape [1, 1, 16000] and the other [1, 1, 16000], and both are 10 seconds long. I couldn't get past the error below. If you know anything about audio processing, any help would be appreciated.

The files live in a directory, and before this point I split them into 10-second chunks (10 s + 10 s + 10 s + ... + 10 s); I can share the full code if it helps. When I run the script on "zorlu.m4a" it works, but when I run it on "bell.m4a" it raises the error below.
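For context, the chunking step before this looks roughly like the sketch below. This is a simplified reconstruction, not my exact code (`parca_uzunlugu` is the chunk length in samples, the same name used in the script further down):

```python
import os
import torchaudio

# Simplified reconstruction of the 10-second chunking step (hypothetical).
waveform, sr = torchaudio.load("bell.m4a")   # shape: [channels, num_samples]
parca_uzunlugu = 10 * sr                     # chunk length: 10 seconds in samples
num_samples = waveform.shape[1]

for i in range(1, int(num_samples / parca_uzunlugu) + 1):
    chunk = waveform[:, (i - 1) * parca_uzunlugu : i * parca_uzunlugu]
    torchaudio.save(os.path.join("wav", f"{i}.wav"), chunk, sr)  # 1.wav, 2.wav, ...
```

Note that all channels are kept here, so a stereo source produces stereo .wav chunks.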

"""------------------------------blocks of code--------------------"""
import os
import shutil

import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Define the path where the .wav files are located
wav_directory = os.path.join(os.getcwd(), "wav")  # add your wav dir here

# Define the output file name
output_file = os.path.join(wav_directory, "metadata.txt")

# Define the range of .wav files (one entry per 10-second chunk).
# ses and parca_uzunlugu come from the chunking step earlier in the script;
# with a fixed number of files (say 100), use range(1, 101) instead.
wav_files_range = range(1, int(len(ses) / parca_uzunlugu) + 1)

# Initialize the list to store file paths and transcripts
file_and_transcripts = []

# Initialize the wav2vec model and processor
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

# Iterate through the .wav files
for i in wav_files_range:
    wav_file = os.path.join(wav_directory, f"{i}.wav")

    # Check if the .wav file exists
    if os.path.exists(wav_file):
        # Recognize the speech in the .wav file
        try:
            waveform, sample_rate = torchaudio.load(wav_file)
            print("waveform",type(waveform),waveform.shape)
            waveform = waveform.squeeze()  # Removes size-1 dims only; a stereo channel dim survives
            print("waveform",type(waveform),waveform.shape)
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=16000)
            print("waveform",type(waveform),waveform.shape)
            waveform = resampler(waveform)
            print("waveform",type(waveform),waveform.shape)
            input_values = processor(
                waveform, return_tensors="pt", sampling_rate=16000).input_values
            print("waveform",type(waveform),waveform.shape)
            logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcript = processor.decode(predicted_ids[0])
        except FileNotFoundError:
            print(f"File not found: {wav_file}")
            continue

        # Append the desired path format and transcript to the list
        file_and_transcripts.append(
            f"{wav_directory}/{i}.wav|{transcript.capitalize()}.|{transcript.capitalize()}.")
    else:
        print(f"File not found: {wav_file}")

# Write the file paths and transcripts to the output file
with open(output_file, "w") as f:
    for line in file_and_transcripts:
        f.write(f"{line}\n")

print(f"File '{output_file}' created successfully.")
# Move metadata.txt out of the wav/ folder, up to its parent directory
shutil.move(wav_directory + '/metadata.txt', wav_directory[:-4] + '/metadata.txt')


"""------------------------------blocks of code--------------------"""


Error output:
waveform <class 'torch.Tensor'> torch.Size([2, 441000])
waveform <class 'torch.Tensor'> torch.Size([2, 441000])
waveform <class 'torch.Tensor'> torch.Size([2, 441000])
waveform <class 'torch.Tensor'> torch.Size([2, 160000])
waveform <class 'torch.Tensor'> torch.Size([2, 160000])
-------- print I added at torch/nn/modules/conv.py line 307 ------- <class 'torch.Tensor'> torch.Size([1, 1, 2, 160000])
Traceback (most recent call last):
  File "/home/yunus/Desktop/piper/datasets/preprocess.py", line 87, in <module>
    logits = model(input_values).logits
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py", line 1962, in forward
    outputs = self.wav2vec2(
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py", line 1547, in forward
    extract_features = self.feature_extractor(input_values)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py", line 459, in forward
    hidden_states = conv_layer(hidden_states)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py", line 362, in forward
    hidden_states = self.conv(hidden_states)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 311, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/home/yunus/anaconda3/envs/ljpreprocess/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 307, in _conv_forward
    return F.conv1d(input, weight, bias, self.stride,
RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 1, 2, 160000]
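From the debug prints it looks like bell.m4a loads with two channels ([2, 441000]), and squeeze() only removes size-1 dimensions, so the channel dimension survives into the processor and the model ends up with a 4D tensor. Would downmixing to mono before the processor be the right fix? A minimal sketch of what I mean (the mean(dim=0) downmix is my own assumption, not something from the wav2vec2 docs):

```python
waveform, sample_rate = torchaudio.load(wav_file)  # [channels, num_samples]
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0)    # downmix stereo -> mono, now 1D
else:
    waveform = waveform.squeeze(0)     # mono: just drop the channel dim

resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
waveform = resampler(waveform)         # still 1D: [num_samples]

input_values = processor(
    waveform, return_tensors="pt", sampling_rate=16000).input_values
# input_values should now be [1, num_samples], which conv1d accepts
```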