SpeechRecognition listen_in_background gives no callbacks until pyttsx3 is finished

75 Views Asked by At

Newbie on this forum so be kind! I might just be blind to the issue at this point but I can't for the life of me figure out why I can't interrupt my speaking AI when it's rambling. I've tried a bunch of different approaches and now I'm finally at threading but even though playback is in another thread, execution will not allow callbacks from listen_in_background. How would I go about being able to tell "Jarvis" to stop talking mid sentence?

The foundation is instructions from this video: https://www[.]youtube[.]com/watch?v=6zAk0KHmiGw

This is what I've got currently:

from os import system
import speech_recognition as sr
from playsound import playsound
from gpt4all import GPT4All
import whisper
import time
import os
import pyttsx3
import importlib
import threading

wake_word = "jarvis"
model = GPT4All("nous-hermes-llama2-13b.Q4_0.gguf", allow_download=False)
r = sr.Recognizer()
tiny_model = whisper.load_model("tiny")
base_model = whisper.load_model("base")
listening_for_wake_word = True
stop_talking = False
source = sr.Microphone()

def speak_thread(text): 
    importlib.reload(pyttsx3)
    engine = pyttsx3.init()
    engine.say(text)
    engine.runAndWait()

def speak(text):
    global stop_talking
    
    talk_thread = threading.Thread(daemon=True, target=speak_thread, name="talking", args=(text,)).start()
    
    while any("talking" in item.name for item in threading.enumerate()):
        if stop_talking:
            talk_thread.stop()
        time.sleep(1)
    
def listen_for_wake_word(audio):
    global listening_for_wake_word
    with open("wake_detect.wav", "wb") as f:
        f.write(audio.get_wav_data())
    result = tiny_model.transcribe("wake_detect.wav")
    text_input = result["text"]
    if wake_word in text_input.lower().strip():
        print("Wake word detected. Please speak your prompt to GPT4All.")
        speak("Listening")
        listening_for_wake_word = False

def prompt_gpt(audio): 
    global listening_for_wake_word
    try: 
        with open("prompt.wav", "wb") as f:
            f.write(audio.get_wav_data())
        result = base_model.transcribe("prompt.wav")
        prompt_text = result["text"]
        if len(prompt_text .strip()) == 0:
            print("I didn't catch that. Please repeat.")
            listening_for_wake_word = True
        else: 
            print("User: " + prompt_text)
            output = base_model.generate(prompt_text, max_tokens=500)
            print("GPT4All: ", output)
            speak(output)
            print("\nSay", wake_word, "to wake me up. \n")
            listening_for_wake_word = True
    except Exception as e: 
        print("Prompt error: ", e)
        
def callback (recognizer, audio): 
    global listening_for_wake_word
    global stop_talking
    
    print("Heard something")
    
    try:
        with open("temp.wav", "wb") as f:
            f.write(audio.get_wav_data())
        result = tiny_model.transcribe("wake_detect.wav")
        
        text_input = result["text"]
        
        if "stop" in text_input.lower().strip():
            print("Stopping playback...")
            stop_talking = True
            return
    except Exception as e: 
        print("Prompt error in stop part: ", e)
    
    if listening_for_wake_word: 
        listen_for_wake_word(audio)
    else: 
        prompt_gpt(audio)
    
def start_listening(): 
    with source as s: 
        r.adjust_for_ambient_noise(s, duration=2)
    print("\nSay", wake_word, "to wake me up. \n")
    r.listen_in_background(source, callback) # Why does this not send further callbacks until after audio playback?
    while True: # just to keep alive
        time.sleep(1)
        print(threading.enumerate())
        threading.get_ident()
        
if __name__ == "__main__":
    start_listening()

Initially "runAndWait()" seemed to be the issue but now I feel like maybe "listen_in_background" can't get a word in edgeways while any execution is ongoing. Is that was is happening and if so, is there a workaround? I want to be able to interrupt Jarvis mid sentence with a voice command to make it feel more interactive.

Hope any of you can help!

0

There are 0 best solutions below