import gradio as gr
import asyncio
import edge_tts
import os
from huggingface_hub import InferenceClient
import whisper
import torch
import tempfile
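# The imports above correspond to the PyPI packages gradio, edge-tts,
# openai-whisper, torch, and huggingface_hub; the rest are standard library.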
# Get the Hugging Face token from the environment; fail fast if it is missing
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable is not set")

# Initialize the Hugging Face Inference Client
client = InferenceClient(
    "mistralai/Mistral-Nemo-Instruct-2407",
    token=hf_token,
)
# Load the Whisper model
whisper_model = whisper.load_model("tiny.en", device='cpu')
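# tiny.en is the smallest English-only Whisper checkpoint; pinning it to the
# CPU keeps transcription usable on hardware without a GPU.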
# Initialize an empty chat history
chat_history = []
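# Note: a module-level list like this is shared across all user sessions.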
async def text_to_speech_stream(text):
    """Convert text to speech using edge_tts and return the audio file path."""
    communicate = edge_tts.Communicate(text, "en-US-AvaMultilingualNeural")
    audio_data = b""
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]

    # Save the audio data to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
        temp_file.write(audio_data)
    return temp_file.name
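# (The temp file above uses delete=False so Gradio can serve it by path;
# cleanup is left to the OS temp-directory policy.)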
def whisper_speech_to_text(audio):
    """Convert speech to text using the Whisper model."""
    try:
        result = whisper_model.transcribe(audio)
        return result["text"]
    except Exception as e:
        print(f"Whisper Error: {e}")
        return None
    finally:
        # Clear the CUDA cache; guarded so it is a no-op on CPU-only machines
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
async def chat_with_ai(message, history):
    # `history` is unused; the module-level chat_history holds the conversation
    global chat_history

    # Add the user message to the chat history
    chat_history.append({"role": "user", "content": message})

    try:
        # Send the chat completion request
        response = client.chat_completion(
            messages=[{"role": "system", "content": "You are a helpful voice assistant. Provide concise and clear responses to user queries."}]
            + chat_history,
            max_tokens=800,
            temperature=0.7,
        )
        response_text = response.choices[0].message.content

        # Add the assistant's response to the chat history
        chat_history.append({"role": "assistant", "content": response_text})

        # Generate speech for the response
        audio_path = await text_to_speech_stream(response_text)
        return response_text, audio_path
    except Exception as e:
        print(f"Error: {e}")
        return str(e), None
def transcribe_and_chat(audio):
    # Transcribe the audio to text
    text = whisper_speech_to_text(audio)
    if text is None:
        return "Sorry, I couldn't understand the audio.", None

    # Chat with the AI using the transcribed text
    response, audio_path = asyncio.run(chat_with_ai(text, []))
    return response, audio_path
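# asyncio.run above is safe because Gradio invokes sync handlers from worker
# threads that have no running event loop.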
# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# AI Voice Assistant")

    with gr.Row():
        with gr.Column():
            # The input must stay interactive, or users cannot record or upload
            audio_input = gr.Audio(type="filepath", label="Speak here")
            text_input = gr.Textbox(label="Or type your message here")
        with gr.Column():
            chat_output = gr.Textbox(label="AI Response")
            audio_output = gr.Audio(label="AI Voice Response", interactive=False)

    audio_button = gr.Button("Send Audio")
    text_button = gr.Button("Send Text")
    # Custom JavaScript to handle spacebar presses and auto-play audio responses.
    # Components created inside the Blocks context register themselves, so the
    # HTML is added directly (gr.Blocks has no append method). Note that some
    # Gradio versions do not execute <script> tags inside HTML components; the
    # js argument of gr.Blocks is the documented alternative.
    gr.HTML("""
    <script>
    document.addEventListener('keydown', function(event) {
        if (event.code === 'Space') {
            document.querySelector('input[type="file"]').click();
        }
    });

    document.addEventListener('gradioAudioLoaded', function(event) {
        var audioElement = document.querySelector('audio');
        if (audioElement) {
            audioElement.play();
        }
    });
    </script>
    """)
    audio_button.click(transcribe_and_chat, inputs=audio_input, outputs=[chat_output, audio_output])
    # chat_with_ai is a coroutine, so wrap it for the synchronous click handler
    text_button.click(lambda x: asyncio.run(chat_with_ai(x, [])), inputs=text_input, outputs=[chat_output, audio_output])
# Launch the Gradio app
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)