Why does my Python voice assistant work so slowly?

44 Views Asked by At

I am building a voice assistant with Python 3.12 on macOS. I am now at the speech-recognition step, but it takes about 6 seconds just to recognize my speech and say 'How can I help you' to the user. I know it would be better to use pydub for silence detection, but I am on the newest Python version and about 90% of the relevant libraries do not support it yet, so I implemented silence detection from scratch using NumPy and SciPy. It should also be noted that most libraries that help with building a voice assistant do not support macOS (I use macOS Monterey). Should I optimize my code by removing the AudioRecorder class and switching back to an older but more widely supported version such as Python 3.10, should I rewrite the code in a more efficient way, or is my hardware simply unable to handle this program? Here are my specifications:

OS: macOS 12.6.6 21G646 x86_64 
Host: MacBookPro 12,1 
Shell: zsh 5.8.1 
CPU: Intel i5-5287U (4) @ 2.90GHz 
GPU: Intel Iris Graphics 6100 
Memory: 16384MiB 

Also, here is my code:

import vosk
import json
from pyaudio import *
import numpy as np
import time
import gtts
from pydub import AudioSegment
from pydub.playback import play


class AudioRecorder:
    """Record microphone audio until silence is detected, then transcribe it.

    Uses PyAudio for capture, NumPy for energy-based silence detection, and
    Vosk for offline speech recognition.
    """

    # Path to the Vosk model directory on disk.
    _MODEL_PATH = "/Users/dimapogarskiy/vosk-model-en-us-0.22-lgraph"
    # Class-level model cache. vosk.Model() takes several seconds to load,
    # so it must be loaded once per process — reloading it on every
    # recognize_speech() call was the main source of the reported latency.
    _model = None

    def __init__(self, max_silence_duration=1500, sample_rate=16000,
                 chunk_size=1024, silence_thresh=1000):
        # max_silence_duration: ms of continuous silence that ends a recording
        # sample_rate: capture rate in Hz (must match the rate given to Vosk)
        # chunk_size: frames read from the input stream per iteration
        # silence_thresh: peak amplitude below which a chunk counts as silent
        self.max_silence_duration = max_silence_duration
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        self.silence_thresh = silence_thresh
        self._frames = b''
        self._start_time = None
        self.stream = None
        self.p = None

    def add_data_to_frames(self, new_data):
        """Append a raw byte chunk to the accumulated recording."""
        self._frames += new_data

    def is_silence(self, audio_chunk, silence_thresh=1000, energy_thresh=1):
        """Return True if the int16 ``audio_chunk`` is silent.

        A chunk is silent when its mean energy falls below ``energy_thresh``
        or its peak amplitude falls below ``silence_thresh``.

        Fix: the previous default ``silence_thresh=-300`` made the amplitude
        test always False, since ``np.abs`` is never negative.
        """
        if len(audio_chunk) == 0:
            # Guard against an empty read: avoids a 0/0 -> NaN energy value.
            return True
        # Cast to float32 before squaring so int16 samples do not overflow.
        energy = np.sum(audio_chunk.astype(np.float32)
                        ** 2) / float(len(audio_chunk))
        return energy < energy_thresh or np.max(np.abs(audio_chunk)) < silence_thresh

    def record_audio(self, max_silence_duration=None, sample_rate=None,
                     chunk_size=None, silence_thresh=None):
        """Record from the default input device until silence lasts long enough.

        Any parameter left as None falls back to the value configured in
        ``__init__`` (the previous version silently ignored the instance
        configuration and re-declared its own defaults).

        Returns ``(frames, pyaudio_instance, stream)``.  Note that the stream
        is already closed and PyAudio terminated by the time this returns;
        the last two values are kept only for backward compatibility with
        existing callers that unpack three values.
        """
        if max_silence_duration is None:
            max_silence_duration = self.max_silence_duration
        if sample_rate is None:
            sample_rate = self.sample_rate
        if chunk_size is None:
            chunk_size = self.chunk_size
        if silence_thresh is None:
            silence_thresh = self.silence_thresh

        self.p = PyAudio()
        self.stream = self.p.open(format=paInt16,
                                  channels=1,
                                  rate=sample_rate,
                                  input=True,
                                  frames_per_buffer=chunk_size)

        self._frames = b''
        self._start_time = time.time()

        print("Recording...")

        try:
            while True:
                data = self.stream.read(chunk_size)
                audio_chunk = np.frombuffer(data, dtype=np.int16)

                self.add_data_to_frames(new_data=data)

                if not self.is_silence(audio_chunk, silence_thresh=silence_thresh):
                    # Sound detected: restart the silence timer.
                    self._start_time = time.time()
                elif time.time() - self._start_time >= max_silence_duration / 1000.0:
                    break

        except KeyboardInterrupt:
            print("Recording interrupted.")

        finally:
            print("Recording finished.")
            if self.stream and self.stream.is_active():
                self.stream.stop_stream()
                self.stream.close()
                print("Audiorecognizer.stream closed.")

            if self.p:
                self.p.terminate()
                print("PyAudio terminated.")

        return self._frames, self.p, self.stream

    def recognize_speech(self, audio_data):
        """Transcribe raw mono int16 audio bytes with Vosk.

        The model is loaded lazily once and cached on the class (see
        ``_model``); the recognizer is cheap and created per call.  The
        sample rate now follows ``self.sample_rate`` instead of a second
        hard-coded 16000 so recording and recognition cannot drift apart.
        """
        if AudioRecorder._model is None:
            AudioRecorder._model = vosk.Model(self._MODEL_PATH)

        recognizer = vosk.KaldiRecognizer(AudioRecorder._model, self.sample_rate)
        recognizer.AcceptWaveform(audio_data)

        result = json.loads(recognizer.Result())
        return result.get('text', '')


def speak(text: str):
    """Synthesize ``text`` with Google TTS and play it through the speakers.

    Fix: gTTS always writes MP3 data, but the original saved it under the
    misleading name 'hello.wav' and let pydub pick the decoder from the
    (wrong) extension.  The audio is now written to a temporary .mp3 file,
    decoded explicitly as MP3, and the file is removed afterwards instead
    of littering the working directory.

    Returns whatever ``pydub.playback.play`` returns (None), preserving the
    original return value.
    """
    import os
    import tempfile

    tts = gtts.gTTS(text, lang='en')
    fd, path = tempfile.mkstemp(suffix='.mp3')
    os.close(fd)  # gTTS writes by filename; close the low-level handle.
    try:
        tts.save(path)
        song = AudioSegment.from_file(path, format="mp3")
        return play(song)
    finally:
        os.remove(path)


if __name__ == "__main__":
    # Entry point: greet the user, record one utterance, transcribe it,
    # and respond to a greeting.
    recognizer = AudioRecorder()

    try:
        speak('How can I help you?')
        audio_recorder_data, p, stream = recognizer.record_audio()
        recognized_text = recognizer.recognize_speech(
            audio_data=audio_recorder_data)
        print("Recognized text:", recognized_text)

        # NOTE(review): substring matching also fires on words containing
        # "hi" (e.g. "this"); a word-level check may be intended — confirm.
        if 'hello' in recognized_text or 'hi' in recognized_text:
            speak('Hello sir')

    except KeyboardInterrupt:
        print("Program interrupted. Cleaning up...")

    finally:
        # Fix: the original broke the condition across two lines after `and`
        # with no continuation, and left a stray `.` after the final print()
        # — both were syntax errors that prevented the script from running.
        # record_audio() already closes the stream and terminates PyAudio on
        # the normal path; these guards only matter after an interrupt.
        if recognizer.stream is not None and recognizer.stream.is_active():
            recognizer.stream.stop_stream()
            recognizer.stream.close()
            print("Audio stream closed.")

        if recognizer.p is not None:
            recognizer.p.terminate()
            print("PyAudio terminated.")

I tried using pydub and librosa for silence detection, but only building it from scratch turned out to work. The problem is that this method loads the CPU much more than those libraries would. Please help me.

0

There are 0 best solutions below