Voice-assitant / app.py
ar08's picture
Update app.py
03c0141 verified
raw
history blame
4.42 kB
import gradio as gr
import asyncio
import edge_tts
import os
from huggingface_hub import InferenceClient
import whisper
import torch
import tempfile
# The Hugging Face API token is read from the environment; refuse to start
# without it.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable is not set")

# Client for the hosted chat model.
client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407", token=hf_token)

# Whisper speech-to-text model, placed on the GPU when CUDA is available.
if torch.cuda.is_available():
    whisper_model = whisper.load_model("tiny.en", device="cuda")
else:
    whisper_model = whisper.load_model("tiny.en", device="cpu")

# Module-level conversation history: a list of {"role", "content"} dicts that
# chat_with_ai() appends to on every exchange.
chat_history = []
async def text_to_speech_stream(text):
    """Synthesize *text* with edge_tts and return the path of the audio file.

    The audio chunks produced by ``Communicate.stream()`` are collected and
    written to a ``.mp3``-suffixed temporary file. The file is created with
    ``delete=False``, so it still exists after this function returns; this
    function never removes it.

    Args:
        text: The text to speak.

    Returns:
        Filesystem path of the temporary file holding the audio data.
    """
    communicate = edge_tts.Communicate(text, "en-US-AvaMultilingualNeural")

    # Collect the chunks and join once rather than growing a bytes object
    # with ``+=``, which copies everything received so far on every chunk.
    audio_chunks = [
        chunk["data"]
        async for chunk in communicate.stream()
        if chunk["type"] == "audio"
    ]

    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
        temp_file.write(b"".join(audio_chunks))
    return temp_file.name
def whisper_speech_to_text(audio):
    """Transcribe *audio* with the module-level Whisper model.

    Args:
        audio: Value handed to ``whisper_model.transcribe``. May be ``None``
            when there is nothing to transcribe.

    Returns:
        The transcript text, or ``None`` when *audio* is ``None`` or the
        transcription raises (the error is printed, not propagated).
    """
    # Nothing to transcribe: return early instead of letting the model call
    # fail and be reported as a Whisper error.
    if audio is None:
        return None

    try:
        result = whisper_model.transcribe(audio)
        return result['text']
    except Exception as e:
        # Best-effort boundary: callers treat None as "could not transcribe".
        print(f"Whisper Error: {e}")
        return None
    finally:
        # Release cached GPU memory after every attempt, successful or not.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
async def chat_with_ai(message):
    """Send *message* to the chat model and synthesize the reply as speech.

    The user message and the model's reply are appended to the module-level
    ``chat_history`` so that later calls include the earlier turns.

    Args:
        message: The user's text.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the model's
        reply and ``audio_path`` is the file returned by
        ``text_to_speech_stream``. If the model request fails, ``text`` is
        the error message and ``audio_path`` is ``None``. If only speech
        synthesis fails, the reply text is still returned with
        ``audio_path`` set to ``None``.
    """
    system_turn = {
        "role": "system",
        "content": "You are a helpful voice assistant. Provide concise and clear responses to user queries.",
    }
    user_turn = {"role": "user", "content": message}
    chat_history.append(user_turn)

    try:
        response = client.chat_completion(
            messages=[system_turn] + chat_history,
            max_tokens=800,
            temperature=0.7,
        )
        response_text = response.choices[0].message['content']
    except Exception as e:
        print(f"Error: {e}")
        # Drop the unanswered user turn; otherwise every later prompt would
        # contain two consecutive user messages, which chat templates that
        # expect alternating roles may reject.
        if chat_history and chat_history[-1] is user_turn:
            chat_history.pop()
        return str(e), None

    chat_history.append({"role": "assistant", "content": response_text})

    try:
        audio_path = await text_to_speech_stream(response_text)
    except Exception as e:
        # The reply is already obtained and recorded; a speech failure should
        # not replace it with an error string.
        print(f"Error: {e}")
        return response_text, None
    return response_text, audio_path
def transcribe_and_chat(audio):
    """Transcribe *audio* and return the assistant's reply as text and speech.

    Args:
        audio: Audio value forwarded to ``whisper_speech_to_text``.

    Returns:
        The ``(response_text, audio_path)`` tuple from ``chat_with_ai``, or
        ``("Sorry, I couldn't understand the audio.", None)`` when no usable
        transcript was produced.
    """
    text = whisper_speech_to_text(audio)
    # Treat an empty or whitespace-only transcript like a failed one rather
    # than sending an empty user message to the chat model.
    if text is None or not text.strip():
        return "Sorry, I couldn't understand the audio.", None

    # chat_with_ai is a coroutine; drive it to completion from this
    # synchronous function.
    return asyncio.run(chat_with_ai(text))
def create_demo():
    """Build the Gradio Blocks UI for the voice assistant.

    The page has an audio input on the left and, on the right, a textbox for
    the reply text plus an autoplaying audio player for the spoken reply.
    Any change of the audio input runs ``process_audio``.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks() as demo:
        gr.Markdown("# AI Voice Assistant")
        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(type="filepath", label="Press 'Record' to Speak")
            with gr.Column(scale=1):
                chat_output = gr.Textbox(label="AI Response")
                audio_output = gr.Audio(label="AI Voice Response", autoplay=True)

        def process_audio(audio):
            """Handle a change of the audio input.

            Returns values for (chat_output, audio_output, audio_input).
            """
            # This handler clears audio_input by returning None for it, and
            # audio_input is also the trigger. When invoked with no audio,
            # leave every output untouched so the last reply is not replaced
            # by the "couldn't understand" message.
            if audio is None:
                return gr.update(), gr.update(), gr.update()
            response, audio_path = transcribe_and_chat(audio)
            return response, audio_path, None  # Return None to clear the audio input

        # Client-side helpers run on page load: auto-submit after recording
        # stops and start playback of the second <audio> element.
        # NOTE(review): it is not verified here that the page dispatches
        # 'stop', 'gradioAudioLoaded' or 'gradioUpdated', nor that an <audio>
        # element exists at load time — confirm against the Gradio version
        # in use.
        demo.load(None, js="""
            function() {
                document.querySelector("audio").addEventListener("stop", function() {
                    setTimeout(function() {
                        document.querySelector('button[title="Submit"]').click();
                    }, 500);
                });
                function playAssistantAudio() {
                    var audioElements = document.querySelectorAll('audio');
                    if (audioElements.length > 1) {
                        var assistantAudio = audioElements[1];
                        if (assistantAudio) {
                            assistantAudio.play();
                        }
                    }
                }
                document.addEventListener('gradioAudioLoaded', function(event) {
                    playAssistantAudio();
                });
                document.addEventListener('gradioUpdated', function(event) {
                    setTimeout(playAssistantAudio, 100);
                });
            }
        """)

        audio_input.change(
            process_audio,
            inputs=[audio_input],
            outputs=[chat_output, audio_output, audio_input],
        )
    return demo
# Script entry point: build the UI and serve it on port 7860, bound to
# 0.0.0.0.
if __name__ == "__main__":
    demo = create_demo()
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )