Update app.py
app.py CHANGED
@@ -1,29 +1,26 @@
 import gradio as gr
 import asyncio
 import edge_tts
-import speech_recognition as sr
-from pydub import AudioSegment
-from pydub.playback import play
 import os
 from huggingface_hub import InferenceClient
 import whisper
 import torch
-from io import BytesIO
 import tempfile
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
 
 # Get the Hugging Face token from environment variable
-hf_token = os.
+hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
     raise ValueError("HF_TOKEN environment variable is not set")
 
 # Initialize the Hugging Face Inference Client
-client = InferenceClient(
-    "mistralai/Mistral-Nemo-Instruct-2407",
-    token=hf_token
-)
+client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407", token=hf_token)
 
 # Load the Whisper model
-whisper_model = whisper.load_model("tiny.en", device='cpu')
+whisper_model = whisper.load_model("tiny.en", device='cuda' if torch.cuda.is_available() else 'cpu')
 
 # Initialize an empty chat history
 chat_history = []
@@ -37,7 +34,6 @@ async def text_to_speech_stream(text):
         if chunk["type"] == "audio":
             audio_data += chunk["data"]
 
-    # Save the audio data to a temporary file
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
         temp_file.write(audio_data)
         return temp_file.name
@@ -46,23 +42,20 @@ def whisper_speech_to_text(audio):
     """Convert speech to text using Whisper model."""
     try:
         result = whisper_model.transcribe(audio)
-
-        return text
+        return result['text']
     except Exception as e:
         print(f"Whisper Error: {e}")
         return None
     finally:
-
-
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
 
-async def chat_with_ai(message, history):
+async def chat_with_ai(message):
     global chat_history
 
-    # Add user message to chat history
     chat_history.append({"role": "user", "content": message})
 
     try:
-        # Send chat completion request
         response = client.chat_completion(
             messages=[{"role": "system", "content": "You are a helpful voice assistant. Provide concise and clear responses to user queries."}] + chat_history,
             max_tokens=800,
@@ -70,11 +63,8 @@ async def chat_with_ai(message, history):
         )
 
         response_text = response.choices[0].message['content']
-
-        # Add assistant's response to chat history
         chat_history.append({"role": "assistant", "content": response_text})
 
-        # Generate speech for the response
        audio_path = await text_to_speech_stream(response_text)
 
         return response_text, audio_path
@@ -83,52 +73,47 @@ async def chat_with_ai(message, history):
         return str(e), None
 
 def transcribe_and_chat(audio):
-    # Transcribe audio to text
     text = whisper_speech_to_text(audio)
     if text is None:
         return "Sorry, I couldn't understand the audio.", None
 
-
-    response, audio_path = asyncio.run(chat_with_ai(text, []))
+    response, audio_path = asyncio.run(chat_with_ai(text))
     return response, audio_path
 
 # Define the Gradio interface
-[old Gradio interface definition and its inline JavaScript are truncated in this view]
-    audio_button.click(transcribe_and_chat, inputs=audio_input, outputs=[chat_output, audio_output])
-    text_button.click(lambda x: asyncio.run(chat_with_ai(x, [])), inputs=text_input, outputs=[chat_output, audio_output])
+def create_demo():
+    with gr.Blocks() as demo:
+        gr.Markdown("# AI Voice Assistant")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                audio_input = gr.Audio(source="microphone", type="filepath", label="Press 'Record' to Speak")
+
+            with gr.Column(scale=1):
+                chat_output = gr.Textbox(label="AI Response")
+                audio_output = gr.Audio(label="AI Voice Response")
+
+        demo.load(None, js="""
+        function() {
+            document.querySelector("audio").addEventListener("stop", function() {
+                setTimeout(function() {
+                    document.querySelector('button[title="Submit"]').click();
+                }, 500);
+            });
+            document.addEventListener('gradioAudioLoaded', function(event) {
+                var audioElement = document.querySelector('audio');
+                if (audioElement) {
+                    audioElement.play();
+                }
+            });
+        }
+        """)
+
+        audio_input.change(transcribe_and_chat, inputs=audio_input, outputs=[chat_output, audio_output])
+
+    return demo
 
 # Launch the Gradio app
 if __name__ == "__main__":
-    demo
+    demo = create_demo()
+    demo.launch(server_name="0.0.0.0", server_port=7860)