I am making a voice assistant using Python 3.12 on macOS. I am currently at the voice-recognition step, but it takes about 6 seconds just to recognize my speech and say 'How can I help you' to the user. I know it would be better to use pydub for silence detection, but I am on the newest Python version, and about 90% of libraries currently don't support it; because of that, I implemented silence detection from scratch using NumPy and SciPy. It should also be noted that most of the libraries that can help with making a voice assistant don't support macOS (I use macOS Monterey).
Should I optimize my code by removing the AudioRecorder class and switching to an older but more popular version (continuing to work in Python 3.10, for example), should I rewrite the code in a more efficient way, or is my hardware simply unable to handle this program properly? Here are my specifications:
OS: macOS 12.6.6 21G646 x86_64
Host: MacBookPro 12,1
Shell: zsh 5.8.1
CPU: Intel i5-5287U (4) @ 2.90GHz
GPU: Intel Iris Graphics 6100
Memory: 16384MiB
Also, here is my code:
import vosk
import json
from pyaudio import *
import numpy as np
import time
import gtts
from pydub import AudioSegment
from pydub.playback import play
class AudioRecorder():
    """Record microphone audio until the speaker pauses, then transcribe it.

    Recording uses PyAudio (16-bit mono input) and stops once every chunk
    has been classified as silence for ``max_silence_duration``
    milliseconds.  Transcription uses a Vosk model that is loaded on first
    use and then kept on the instance, so repeated calls to
    ``recognize_speech`` do not reload it from disk.

    Args:
        max_silence_duration: Length of continuous silence, in
            milliseconds, that ends a recording.
        sample_rate: Input sample rate in Hz.
        chunk_size: Number of frames read from the stream per iteration.
        silence_thresh: Peak absolute amplitude below which a chunk counts
            as silence.
        model_path: Directory of the Vosk model to load.
    """

    def __init__(self, max_silence_duration=1500, sample_rate=16000, chunk_size=1024,
                 silence_thresh=1000,
                 model_path="/Users/dimapogarskiy/vosk-model-en-us-0.22-lgraph"):
        self.max_silence_duration = max_silence_duration
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        self.silence_thresh = silence_thresh
        self.model_path = model_path
        # bytearray grows in place; repeated ``bytes +=`` would copy the
        # whole recording on every chunk.
        self._frames = bytearray()
        self._start_time = None
        self.stream = None
        self.p = None
        # Vosk model, created lazily by recognize_speech().
        self._model = None
        # Sample rate of the most recent recording; the recognizer must be
        # created with the same rate as the audio it is given.
        self._recorded_rate = sample_rate

    def add_data_to_frames(self, new_data):
        """Append one chunk of raw audio bytes to the current recording."""
        self._frames += new_data

    def is_silence(self, audio_chunk, silence_thresh=-300, energy_thresh=1):
        """Return True when *audio_chunk* should be treated as silence.

        A chunk is silent when its mean squared amplitude is below
        ``energy_thresh`` or its peak absolute amplitude is below
        ``silence_thresh``.  Absolute amplitudes are never negative, so a
        negative ``silence_thresh`` (such as the default) disables the peak
        test and leaves only the energy test.  An empty chunk is silent.
        """
        # Widen before abs()/squaring: abs() of the int16 value -32768
        # wraps back to -32768, which made a fully clipped chunk look quiet.
        samples = np.asarray(audio_chunk, dtype=np.float64)
        if samples.size == 0:
            return True
        energy = np.mean(samples ** 2)
        peak = np.max(np.abs(samples))
        return bool(energy < energy_thresh or peak < silence_thresh)

    def record_audio(self, max_silence_duration=None, sample_rate=None, chunk_size=None,
                     silence_thresh=None):
        """Record from the default input device until a long enough pause.

        Any argument left as ``None`` falls back to the value given to the
        constructor.

        Returns:
            A tuple ``(frames, p, stream)``: the recorded audio as bytes,
            followed by the PyAudio instance and stream that were used.
            Both have already been closed/terminated when this returns.
        """
        if max_silence_duration is None:
            max_silence_duration = self.max_silence_duration
        if sample_rate is None:
            sample_rate = self.sample_rate
        if chunk_size is None:
            chunk_size = self.chunk_size
        if silence_thresh is None:
            silence_thresh = self.silence_thresh

        silence_limit = max_silence_duration / 1000.0
        self._frames = bytearray()
        self._recorded_rate = sample_rate
        self.p = None
        self.stream = None
        try:
            # Opened inside the try so that a failure in open() still
            # terminates the PyAudio instance in the finally block.
            self.p = PyAudio()
            self.stream = self.p.open(format=paInt16,
                                      channels=1,
                                      rate=sample_rate,
                                      input=True,
                                      frames_per_buffer=chunk_size)
            # Monotonic clock: elapsed-time checks must not be affected by
            # wall-clock adjustments.
            self._start_time = time.monotonic()
            print("Recording...")
            while True:
                data = self.stream.read(chunk_size)
                audio_chunk = np.frombuffer(data, dtype=np.int16)
                self.add_data_to_frames(new_data=data)
                if not self.is_silence(audio_chunk, silence_thresh=silence_thresh):
                    # Sound heard: restart the silence timer.
                    self._start_time = time.monotonic()
                elif time.monotonic() - self._start_time >= silence_limit:
                    break
        except KeyboardInterrupt:
            print("Recording interrupted.")
        finally:
            print("Recording finished.")
            if self.stream is not None:
                if self.stream.is_active():
                    self.stream.stop_stream()
                # Close even when the stream is no longer active.
                self.stream.close()
                print("Audiorecognizer.stream closed.")
            if self.p is not None:
                self.p.terminate()
                print("PyAudio terminated.")
        return bytes(self._frames), self.p, self.stream

    def recognize_speech(self, audio_data):
        """Transcribe raw 16-bit mono PCM bytes and return the text.

        Returns an empty string when the result carries no ``text`` field.
        """
        if self._model is None:
            # Load once and reuse for every later call.
            self._model = vosk.Model(self.model_path)
        recognizer = vosk.KaldiRecognizer(self._model, self._recorded_rate)
        recognizer.AcceptWaveform(audio_data)
        result = json.loads(recognizer.Result())
        return result.get('text', '')
def speak(text: str):
    """Synthesize *text* as English speech with gTTS and play it.

    The audio is kept in memory instead of being written to a fixed
    ``hello.wav`` in the working directory: gTTS produces MP3 data, so the
    ``.wav`` name was misleading, and the file was never removed.

    Returns:
        Whatever ``pydub.playback.play`` returns.
    """
    import io

    tts = gtts.gTTS(text, lang='en')
    audio_buffer = io.BytesIO()
    tts.write_to_fp(audio_buffer)
    # Rewind so the decoder reads from the start of the synthesized data.
    audio_buffer.seek(0)
    song = AudioSegment.from_file(audio_buffer, format="mp3")
    return play(song)
def is_greeting(text):
    """Return True when *text* contains 'hello' or 'hi' as a whole word.

    Matching whole words avoids the false positives of a substring test,
    where 'hi' would also match inside words such as 'this' or 'which'.
    """
    return any(word in ('hello', 'hi') for word in text.lower().split())


def main():
    """Prompt the user, record one utterance, print it and answer a greeting."""
    recognizer = AudioRecorder()
    try:
        speak('How can I help you?')
        # record_audio() stops and closes the stream and terminates PyAudio
        # in its own finally block, so no further cleanup is done here;
        # repeating it would act on handles that are already released.
        audio_recorder_data, _p, _stream = recognizer.record_audio()
        recognized_text = recognizer.recognize_speech(
            audio_data=audio_recorder_data)
        print("Recognized text:", recognized_text)
        if is_greeting(recognized_text):
            speak('Hello sir')
    except KeyboardInterrupt:
        print("Program interrupted. Cleaning up...")


if __name__ == "__main__":
    main()
I tried using pydub and librosa for silence detection, but only implementing it from scratch turned out to be a good idea. The problem is that this method loads the CPU much more than other libraries do. Please help me.