not able to convert produce audio file in gradio

300 Views Asked by At

I am making Voice Assistant for ordering in restaurants, the VA takes voice input passes to a trained model, and gives output according to intent. For interface, I am using Gradio which takes voice input and shows input in text and its output in audio and text. The VA code is done in python. But in gradio, it takes input code in audio but gives an error in output for all the 3 cases shown.

Here is the python code:

import speech_recognition as sr
from gtts import gTTS
import os
import time
import datetime
import numpy as np
import pickle
from transformers import BertTokenizer
import torch.nn as nn
import torch
from flask import Flask, request, jsonify, render_template
import gradio as gr
from pymongo import MongoClient

# Add these variables to your code for MongoDB configuration
MONGO_URI = 'mongodb://localhost:27017'  # Update with your MongoDB URI
DB_NAME = 'orders'

class BERT_Arch(nn.Module):
   def __init__(self, bert):      
       super(BERT_Arch, self).__init__()
       self.bert = bert 
      
       # dropout layer
       self.dropout = nn.Dropout(0.2)
      
       # relu activation function
       self.relu =  nn.ReLU()
       # dense layer
       self.fc1 = nn.Linear(768,512)
       self.fc2 = nn.Linear(512,256)
       self.fc3 = nn.Linear(256,5)
       #softmax activation function
       self.softmax = nn.LogSoftmax(dim=1)
       #define the forward pass
   def forward(self, sent_id, mask):
      #pass the inputs to the model  
      cls_hs = self.bert(sent_id, attention_mask=mask)[0][:,0]
      
      x = self.fc1(cls_hs)
      x = self.relu(x)
      x = self.dropout(x)
      
      x = self.fc2(x)
      x = self.relu(x)
      x = self.dropout(x)
      # output layer
      x = self.fc3(x)
   
      # apply softmax activation
      x = self.softmax(x)
      return x

class ChatBot():
    def __init__(self, name, model_path):
        print("----- Starting up", name, "-----")
        self.name = name
        self.model = self.load_model(model_path)
        self.conversation_history = []
        self.user_orders = []

        # Initialize MongoDB client and database
        self.mongo_client = MongoClient(MONGO_URI)
        self.mongo_db = self.mongo_client[DB_NAME]
        self.orders_collection = self.mongo_db['orders']

    def load_model(self, model_path):
        try:
            with open(model_path, 'rb') as model_file:
                model = pickle.load(model_file)
            return model
        except Exception as e:
            print("Error loading the model:", str(e))
            return None

    def speech_to_text(self, audio):
        recognizer = sr.Recognizer()
        try:
            text = recognizer.recognize_google(audio)
            print("Me  --> ", text)
        except:
            print("Me  -->  ERROR")
            text = "ERROR"
        return text

    def detect_intent(self, text):
        if self.model is not None:
            # Load the BERT tokenizer and tokenize the input text
            tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)

            # Ensure that the input tensor names match the names expected by your model
            sent_id = inputs["input_ids"]
            mask = inputs["attention_mask"]

            # Load the BERT model
            bert_model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')

            # Create an instance of your BERT_Arch model
            model = BERT_Arch(bert_model)

            # Make a prediction
            preds = model(sent_id, mask)

            # Assuming preds is a tensor with shape [batch_size, num_classes],
            # you can get the predicted class as follows:
            intent = torch.argmax(preds, dim=1).item()
            return intent
        else:
            return "Sorry, the model couldn't be loaded."

    def wake_up(self, text):
        return True if self.name in text.lower() else False

    def action_time(self):
        return datetime.datetime.now().time().strftime('%H:%M')
    
    def handle_user_input(self, user_input):
        # Log user input to conversation history
        self.log_to_history('User', user_input)

        if self.wake_up(user_input):
            res = "Hello, I am your food-assisstant. What would you like to order?"

        elif "time" in user_input:
            res = self.action_time()

        elif any(i in user_input for i in ["thank", "thanks"]):
            res = np.random.choice(["You're welcome!", "Anytime!", "No problem!", "Cool!", "I'm here if you need me!", "You're welcome!"])

        elif any(i in user_input for i in ["yourself"]):
            res = np.random.choice(["I am a food-ordering voice assistant!"])

        elif any(i in user_input for i in ["exit", "close"]):
            res = np.random.choice(["Tata", "Have a good day", "Bye", "Goodbye", "Hope to meet soon", "Peace out!"])
            self.log_to_history('ChatBot', res)
            self.text_to_speech(res)
            ex = False

        # Recognize user intent using the DistilBERT-based model
        else:
            # Check if the user is requesting the menu
            if any(i in user_input for i in ["menu", "options", "list"]):
                # Provide the menu options
                menu = "Here are our menu options:\n1. Item 1\n2. Item 2\n3. Item 3\n4. Item 4\n5. Item 5"
                res = menu
            # Check if the user is requesting to review their orders
            elif any(i in user_input for i in ["orders", "review", "my order"]):
                order_summary = self.review_orders()
                res = order_summary
            
            else:
                intent = self.detect_intent(user_input)
                if intent == 1:
                    res = "Sure I will add your order, that will be $30, anything else?"
                    # Log user order
                    self.user_orders.append(user_input)
                elif intent == 2:
                    res = "I have added newer items in the list as well that will be $55."
                    self.user_orders.append(user_input)
                elif intent == 3:
                    res = "That's a great choice, you want it in the veg or non-veg section?"
                    self.user_orders.append(user_input)
                elif intent == 4:
                    res = "Your order will be ready in 20 mins"
                    self.user_orders.append(user_input)
                elif intent == 5:
                    res = "Okay, I will make the quantity according to the specified number of people"
                    self.user_orders.append(user_input)
        
        # Log chatbot response to conversation history
        if 'res' in locals():
            self.log_to_history('ChatBot', res)
            self.text_to_speech(res)

    def review_orders(self):
        # Provide a summary of the user's orders
        order_summary = self.summarize_orders()
        self.text_to_speech(order_summary)

        self.log_order_to_mongodb(order_summary)

        self.text_to_speech(order_summary)

    def summarize_orders(self):
        # Generate a summary of the user's orders
        if not self.user_orders:
            return "You haven't placed any orders yet."

        summary = "Here's a summary of your orders:\n"
        for i, order in enumerate(self.user_orders, start=1):
            summary += f"{i}. {order}\n"

        return summary
    
    def transcribe(self, audio):
        result_text = self.speech_to_text(audio)
        out_result = self.handle_user_input(result_text)

        # Save the audio file if a response is generated
        if out_result:
            audioobj = gTTS(text=out_result, lang="en", slow=False)
            audioobj.save("res.mp3")
            # Wait for a brief moment to ensure the file is saved
            time.sleep(1)
        else:
            # If no response is generated, save an empty audio file
            open("res.mp3", "w").close()
        
        return [result_text, out_result, "res.mp3"]

    
    def log_order_to_mongodb(self, order):
        # Log the order to MongoDB with the current date and time
        order_doc = {
            'order': order,
            'timestamp': datetime.datetime.now()
        }
        self.orders_collection.insert_one(order_doc)
    
    def get_audio_from_microphone(self):
        # Replace this function with your code to capture audio from the microphone
        pass
    
    output_1 = gr.Textbox(label="speech_to_text")
    output_2 = gr.Textbox(label="handle_user_input")
    output_3 = gr.Audio("res.mp3")

    gr.Interface(
        title = 'Voice Assistant for Fast Food Restaurant', 
        fn=transcribe, 
        inputs=[
          gr.inputs.Audio(source="microphone", type="filepath")
     ],

     outputs=[
         output_1,  output_2, output_3
     ],
     live=True).launch()

if __name__ == "__main__":
    model_path = 'C:\\Users\\hp\\Desktop\\BOTSOMETHINGFROMNET\\model1.pickle'

    ai = ChatBot(name="dev", model_path=model_path)
    ex = True

    while ex:
        audio = ai.get_audio_from_microphone()
        ai.transcribe(audio)
        user_input = ai.text
        ai.handle_user_input(user_input)

    print("----- Closing down -----")
    app.run(debug=True)

Before giving audio input

enter image description here

it takes audio normally from microphone but gives error later

enter image description here

in VsCode the error is giving this:

 result_text = self.speech_to_text(audio)
AttributeError: 'str' object has no attribute 'speech_to_text'

I thought the system is not able to generate audio file correctly and maybe gtts error, I made a new function to generate audio file but it still gave the same error as above. Please help with this issue.

0

There are 0 best solutions below