I am making Voice Assistant for ordering in restaurants, the VA takes voice input passes to a trained model, and gives output according to intent. For interface, I am using Gradio which takes voice input and shows input in text and its output in audio and text. The VA code is done in python. But in gradio, it takes input code in audio but gives an error in output for all the 3 cases shown.
Here is the python code:
import speech_recognition as sr
from gtts import gTTS
import os
import time
import datetime
import numpy as np
import pickle
from transformers import BertTokenizer
import torch.nn as nn
import torch
from flask import Flask, request, jsonify, render_template
import gradio as gr
from pymongo import MongoClient
# Add these variables to your code for MongoDB configuration
MONGO_URI = 'mongodb://localhost:27017' # Update with your MongoDB URI
DB_NAME = 'orders'
class BERT_Arch(nn.Module):
def __init__(self, bert):
super(BERT_Arch, self).__init__()
self.bert = bert
# dropout layer
self.dropout = nn.Dropout(0.2)
# relu activation function
self.relu = nn.ReLU()
# dense layer
self.fc1 = nn.Linear(768,512)
self.fc2 = nn.Linear(512,256)
self.fc3 = nn.Linear(256,5)
#softmax activation function
self.softmax = nn.LogSoftmax(dim=1)
#define the forward pass
def forward(self, sent_id, mask):
#pass the inputs to the model
cls_hs = self.bert(sent_id, attention_mask=mask)[0][:,0]
x = self.fc1(cls_hs)
x = self.relu(x)
x = self.dropout(x)
x = self.fc2(x)
x = self.relu(x)
x = self.dropout(x)
# output layer
x = self.fc3(x)
# apply softmax activation
x = self.softmax(x)
return x
class ChatBot():
def __init__(self, name, model_path):
print("----- Starting up", name, "-----")
self.name = name
self.model = self.load_model(model_path)
self.conversation_history = []
self.user_orders = []
# Initialize MongoDB client and database
self.mongo_client = MongoClient(MONGO_URI)
self.mongo_db = self.mongo_client[DB_NAME]
self.orders_collection = self.mongo_db['orders']
def load_model(self, model_path):
try:
with open(model_path, 'rb') as model_file:
model = pickle.load(model_file)
return model
except Exception as e:
print("Error loading the model:", str(e))
return None
def speech_to_text(self, audio):
recognizer = sr.Recognizer()
try:
text = recognizer.recognize_google(audio)
print("Me --> ", text)
except:
print("Me --> ERROR")
text = "ERROR"
return text
def detect_intent(self, text):
if self.model is not None:
# Load the BERT tokenizer and tokenize the input text
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
# Ensure that the input tensor names match the names expected by your model
sent_id = inputs["input_ids"]
mask = inputs["attention_mask"]
# Load the BERT model
bert_model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')
# Create an instance of your BERT_Arch model
model = BERT_Arch(bert_model)
# Make a prediction
preds = model(sent_id, mask)
# Assuming preds is a tensor with shape [batch_size, num_classes],
# you can get the predicted class as follows:
intent = torch.argmax(preds, dim=1).item()
return intent
else:
return "Sorry, the model couldn't be loaded."
def wake_up(self, text):
return True if self.name in text.lower() else False
def action_time(self):
return datetime.datetime.now().time().strftime('%H:%M')
def handle_user_input(self, user_input):
# Log user input to conversation history
self.log_to_history('User', user_input)
if self.wake_up(user_input):
res = "Hello, I am your food-assisstant. What would you like to order?"
elif "time" in user_input:
res = self.action_time()
elif any(i in user_input for i in ["thank", "thanks"]):
res = np.random.choice(["You're welcome!", "Anytime!", "No problem!", "Cool!", "I'm here if you need me!", "You're welcome!"])
elif any(i in user_input for i in ["yourself"]):
res = np.random.choice(["I am a food-ordering voice assistant!"])
elif any(i in user_input for i in ["exit", "close"]):
res = np.random.choice(["Tata", "Have a good day", "Bye", "Goodbye", "Hope to meet soon", "Peace out!"])
self.log_to_history('ChatBot', res)
self.text_to_speech(res)
ex = False
# Recognize user intent using the DistilBERT-based model
else:
# Check if the user is requesting the menu
if any(i in user_input for i in ["menu", "options", "list"]):
# Provide the menu options
menu = "Here are our menu options:\n1. Item 1\n2. Item 2\n3. Item 3\n4. Item 4\n5. Item 5"
res = menu
# Check if the user is requesting to review their orders
elif any(i in user_input for i in ["orders", "review", "my order"]):
order_summary = self.review_orders()
res = order_summary
else:
intent = self.detect_intent(user_input)
if intent == 1:
res = "Sure I will add your order, that will be $30, anything else?"
# Log user order
self.user_orders.append(user_input)
elif intent == 2:
res = "I have added newer items in the list as well that will be $55."
self.user_orders.append(user_input)
elif intent == 3:
res = "That's a great choice, you want it in the veg or non-veg section?"
self.user_orders.append(user_input)
elif intent == 4:
res = "Your order will be ready in 20 mins"
self.user_orders.append(user_input)
elif intent == 5:
res = "Okay, I will make the quantity according to the specified number of people"
self.user_orders.append(user_input)
# Log chatbot response to conversation history
if 'res' in locals():
self.log_to_history('ChatBot', res)
self.text_to_speech(res)
def review_orders(self):
# Provide a summary of the user's orders
order_summary = self.summarize_orders()
self.text_to_speech(order_summary)
self.log_order_to_mongodb(order_summary)
self.text_to_speech(order_summary)
def summarize_orders(self):
# Generate a summary of the user's orders
if not self.user_orders:
return "You haven't placed any orders yet."
summary = "Here's a summary of your orders:\n"
for i, order in enumerate(self.user_orders, start=1):
summary += f"{i}. {order}\n"
return summary
def transcribe(self, audio):
result_text = self.speech_to_text(audio)
out_result = self.handle_user_input(result_text)
# Save the audio file if a response is generated
if out_result:
audioobj = gTTS(text=out_result, lang="en", slow=False)
audioobj.save("res.mp3")
# Wait for a brief moment to ensure the file is saved
time.sleep(1)
else:
# If no response is generated, save an empty audio file
open("res.mp3", "w").close()
return [result_text, out_result, "res.mp3"]
def log_order_to_mongodb(self, order):
# Log the order to MongoDB with the current date and time
order_doc = {
'order': order,
'timestamp': datetime.datetime.now()
}
self.orders_collection.insert_one(order_doc)
def get_audio_from_microphone(self):
# Replace this function with your code to capture audio from the microphone
pass
output_1 = gr.Textbox(label="speech_to_text")
output_2 = gr.Textbox(label="handle_user_input")
output_3 = gr.Audio("res.mp3")
gr.Interface(
title = 'Voice Assistant for Fast Food Restaurant',
fn=transcribe,
inputs=[
gr.inputs.Audio(source="microphone", type="filepath")
],
outputs=[
output_1, output_2, output_3
],
live=True).launch()
if __name__ == "__main__":
model_path = 'C:\\Users\\hp\\Desktop\\BOTSOMETHINGFROMNET\\model1.pickle'
ai = ChatBot(name="dev", model_path=model_path)
ex = True
while ex:
audio = ai.get_audio_from_microphone()
ai.transcribe(audio)
user_input = ai.text
ai.handle_user_input(user_input)
print("----- Closing down -----")
app.run(debug=True)
Before giving audio input
it takes audio normally from microphone but gives error later
in VsCode the error is giving this:
result_text = self.speech_to_text(audio)
AttributeError: 'str' object has no attribute 'speech_to_text'
I thought the system is not able to generate audio file correctly and maybe gtts error, I made a new function to generate audio file but it still gave the same error as above. Please help with this issue.