i'm working on a speech recognition usign lstm model, i'm using the mini speech recognition command dataset, the tutorial given on tensorflow simple audio, is for classification and they used CNN, so i tried to change it to a speech recognition using LSTM,
i tried to change the shape of my data or to change my logic, but every time i tried so i get a different error, down is my code
import os
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
DATASET_PATH = 'data/mini_speech_commands'
data_dir = pathlib.Path(DATASET_PATH)
if not data_dir.exists():
tf.keras.utils.get_file(
'mini_speech_commands.zip',
origin="http://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip",
extract=True,
cache_dir='.', cache_subdir='data')
commands = np.array(tf.io.gfile.listdir(str(data_dir)))
commands = commands[(commands != 'README.md') & (commands != '.DS_Store')]
print('Commands:', commands)
def get_mfcc(audio):
stft = tf.signal.stft(audio, frame_length=400, frame_step=160, fft_length=512, pad_end=True)
spectrogram = tf.abs(stft) ** 2
num_mel_bins = 13
mel_filterbanks = tf.signal.linear_to_mel_weight_matrix(
num_mel_bins, 257, 16000, lower_edge_hertz=0.0, upper_edge_hertz=8000.0)
mel_spectrogram = tf.matmul(spectrogram, mel_filterbanks)
log_mel_spectrogram = tf.math.log(mel_spectrogram + 1e-6) # Small constant added to avoid log(0)
log_mel_spectrogram = tf.image.resize_with_crop_or_pad(log_mel_spectrogram, max_sequence_length, num_mel_bins)
return log_mel_spectrogram
batch_size = 64
max_sequence_length = 100
train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
directory=data_dir,
batch_size=batch_size,
validation_split=0.2,
seed=0,
output_sequence_length=max_sequence_length,
subset='both')
# Map the get_mfcc function for dataset preprocessing
def preprocess(audio, label):
mfccs = get_mfcc(audio)
return mfccs, label
train_ds = train_ds.map(preprocess, tf.data.AUTOTUNE)
val_ds = val_ds.map(preprocess, tf.data.AUTOTUNE)
num_labels = len(commands)
model = models.Sequential([
layers.Input(shape=(max_sequence_length, num_mel_bins)),
layers.Masking(mask_value=0.0), # Mask padding
layers.LSTM(128, return_sequences=True),
layers.LSTM(128),
layers.Dense(num_labels, activation='softmax')
])
model.summary()
model.compile(
optimizer=tf.keras.optimizers.Adam(),
loss=tf.keras.losses.SparseCategoricalCrossentropy(),
metrics=['accuracy']
)
# Training
EPOCHS = 10
history = model.fit(
train_ds,
validation_data=val_ds,
epochs=EPOCHS,
callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2)
)
when running the code i get
ValueError: in user code:
File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1338, in train_function *
return step_function(self, iterator)
File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1322, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1303, in run_step **
outputs = model.train_step(data)
File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1080, in train_step
y_pred = self(x, training=True)
File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/input_spec.py", line 235, in assert_input_compatibility
raise ValueError(
ValueError: Exception encountered when calling layer 'sequential_6' (type Sequential).
Input 0 of layer "lstm_12" is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (None, 100, 13, 13)
Call arguments received by layer 'sequential_6' (type Sequential):
• inputs=tf.Tensor(shape=(None, 100, 13, 13), dtype=float32)
• training=True
• mask=None
i don't know why i get this error, is there someone who can help me.