Update app.py
app.py CHANGED
@@ -8,8 +8,6 @@ import tempfile
import logging
import io
from pydub import AudioSegment
-import wave
-import pyaudio

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -30,23 +28,21 @@ headers = {"Authorization": f"Bearer {hf_token}"}
chat_history = []

async def text_to_speech_stream(text, voice_volume=1.0):
-    """Convert text to speech using edge_tts and
+    """Convert text to speech using edge_tts and return the audio file path."""
    communicate = edge_tts.Communicate(text, "en-US-BrianMultilingualNeural")
    audio_data = b""

    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]
-
-
-
-    # Adjust volume for the final audio
+
+    # Adjust volume
    audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
    adjusted_audio = audio + (20 * voice_volume - 20)  # Adjust volume (0.0 to 2.0)
-
-
-
-
+
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
+        adjusted_audio.export(temp_file.name, format="mp3")
+        return temp_file.name

def whisper_speech_to_text(audio_path):
    """Convert speech to text using Hugging Face Whisper API."""
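Note on the volume math above: pydub's "+" operator applies a gain in decibels, so 20 * voice_volume - 20 maps a volume of 1.0 to 0 dB (unchanged), 0.5 to -10 dB, and 2.0 to +20 dB. For reference, a minimal standalone sketch of the pattern the new function follows; the name tts_to_file and its defaults are illustrative, not part of the app:

import asyncio
import io
import tempfile

import edge_tts
from pydub import AudioSegment  # MP3 handling requires ffmpeg to be installed


async def tts_to_file(text, voice_volume=1.0, voice="en-US-BrianMultilingualNeural"):
    # Collect the MP3 bytes streamed by edge_tts.
    communicate = edge_tts.Communicate(text, voice)
    audio_data = b""
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]

    # pydub's "+" applies a gain in dB: 1.0 -> 0 dB, 0.5 -> -10 dB, 2.0 -> +20 dB.
    audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
    adjusted = audio + (20 * voice_volume - 20)

    # Write to a temp file that outlives this call; the caller is responsible for cleanup.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        adjusted.export(tmp.name, format="mp3")
    return tmp.name


# Example: path = asyncio.run(tts_to_file("Hello there", voice_volume=1.2))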
@@ -89,21 +85,23 @@ async def chat_with_ai(message):
        response_text = response.choices[0].message['content']
        chat_history.append({"role": "assistant", "content": response_text})

-
+        audio_path = await text_to_speech_stream(response_text)
+
+        return response_text, audio_path
    except Exception as e:
        logging.error(f"Error in chat_with_ai: {e}")
-        return str(e)
+        return str(e), None

def transcribe_and_chat(audio):
    if audio is None:
-        return "Sorry, no audio was provided. Please try recording again."
+        return "Sorry, no audio was provided. Please try recording again.", None

    text = whisper_speech_to_text(audio)
    if not text:
-        return "Sorry, I couldn't understand the audio or there was an error in transcription. Please try again."
+        return "Sorry, I couldn't understand the audio or there was an error in transcription. Please try again.", None

-    response = asyncio.run(chat_with_ai(text))
-    return response
+    response, audio_path = asyncio.run(chat_with_ai(text))
+    return response, audio_path

def create_demo():
    with gr.Blocks(css="""
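Note: with this change chat_with_ai and transcribe_and_chat return a (text, audio_path) pair on both the success and error paths, so callers can always unpack two values. A small illustrative fragment of that convention (recorded_file is a hypothetical variable, not from the app):

# Both paths now return two values; audio_path is None when no speech was synthesized.
response, audio_path = transcribe_and_chat(recorded_file)  # recorded_file: filepath from gr.Audio
if audio_path is None:
    logging.warning(f"No speech generated; message was: {response}")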
@@ -139,7 +137,7 @@ def create_demo():

        with gr.Column(scale=1):
            chat_output = gr.Textbox(label="💬 AI Response", elem_id="chat-output", lines=5, interactive=False)
-            audio_output = gr.Audio(label="🔊 AI Voice Response", autoplay=True, elem_id="audio-output"
+            audio_output = gr.Audio(label="🔊 AI Voice Response", autoplay=True, elem_id="audio-output")

        # Add some spacing and a divider
        gr.Markdown("<hr style='border: 1px solid #8ecae6;'/>")
@@ -149,27 +147,13 @@ def create_demo():
            logging.info(f"Received audio: {audio}")
            if audio is None:
                return "No audio detected. Please try recording again.", None
-            response = transcribe_and_chat(audio)
-
-
-
-
-            def generate_audio():
-                for chunk in asyncio.run(text_to_speech_stream(response, volume)):
-                    yield chunk
-
-            return gr.Audio(generate_audio(), streaming=True)
-
-        audio_input.change(
-            process_audio,
-            inputs=[audio_input, voice_volume],
-            outputs=[chat_output, audio_output]
-        ).then(
-            stream_audio,
-            inputs=[chat_output, voice_volume],
-            outputs=audio_output
-        )
+            response, audio_path = transcribe_and_chat(audio)
+            # Adjust volume for the response audio
+            adjusted_audio_path = asyncio.run(text_to_speech_stream(response, volume))
+            logging.info(f"Response: {response}, Audio path: {adjusted_audio_path}")
+            return response, adjusted_audio_path

+        audio_input.change(process_audio, inputs=[audio_input, voice_volume], outputs=[chat_output, audio_output])
        clear_button.click(lambda: (None, None), None, [chat_output, audio_output])

        # JavaScript to handle autoplay, automatic submission, and auto-listen
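Note: the event wiring is reduced to a single .change() callback whose two return values feed the Textbox and the Audio component; the earlier .then(stream_audio, ...) streaming chain is removed. A self-contained sketch of that wiring, assuming Gradio 4.x; the stub handler below is illustrative and does not reproduce the app's transcription or chat logic:

import gradio as gr


def process_audio(audio_path, voice_volume):
    # Stub handler (illustrative only): the real app transcribes the recording,
    # queries the chat model, and synthesizes a spoken reply before returning.
    return f"Received {audio_path} at volume {voice_volume}", audio_path


with gr.Blocks() as demo:
    audio_input = gr.Audio(type="filepath", label="Speak")  # microphone/upload input
    voice_volume = gr.Slider(0.0, 2.0, value=1.0, label="Voice volume")
    chat_output = gr.Textbox(label="AI Response")
    audio_output = gr.Audio(label="AI Voice Response", autoplay=True)

    # One callback, two outputs: the returned (text, filepath) pair is routed
    # to the Textbox and the Audio component respectively.
    audio_input.change(process_audio, inputs=[audio_input, voice_volume],
                       outputs=[chat_output, audio_output])

demo.launch()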