Update app.py
app.py CHANGED
@@ -8,8 +8,6 @@ import tempfile
import logging
import io
from pydub import AudioSegment
-import wave
-import pyaudio

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -30,23 +28,21 @@ headers = {"Authorization": f"Bearer {hf_token}"}
chat_history = []

async def text_to_speech_stream(text, voice_volume=1.0):
-    """Convert text to speech using edge_tts and
+    """Convert text to speech using edge_tts and return the audio file path."""
    communicate = edge_tts.Communicate(text, "en-US-BrianMultilingualNeural")
    audio_data = b""

    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]
-
-
-
-    # Adjust volume for the final audio
+
+    # Adjust volume
    audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
    adjusted_audio = audio + (20 * voice_volume - 20)  # Adjust volume (0.0 to 2.0)
-
-
-
-
+
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
+        adjusted_audio.export(temp_file.name, format="mp3")
+        return temp_file.name

def whisper_speech_to_text(audio_path):
    """Convert speech to text using Hugging Face Whisper API."""
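Note on the volume math above: pydub's "+" operator applies a gain in decibels, so 20 * voice_volume - 20 maps a volume of 1.0 to 0 dB (unchanged), 0.5 to -10 dB, and 2.0 to +20 dB. For reference, a minimal standalone sketch of the pattern the new function follows; the name tts_to_file and its defaults are illustrative, not part of the app:

import asyncio
import io
import tempfile

import edge_tts
from pydub import AudioSegment  # MP3 handling requires ffmpeg to be installed


async def tts_to_file(text, voice_volume=1.0, voice="en-US-BrianMultilingualNeural"):
    # Collect the MP3 bytes streamed by edge_tts.
    communicate = edge_tts.Communicate(text, voice)
    audio_data = b""
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]

    # pydub's "+" applies a gain in dB: 1.0 -> 0 dB, 0.5 -> -10 dB, 2.0 -> +20 dB.
    audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
    adjusted = audio + (20 * voice_volume - 20)

    # Write to a temp file that outlives this call; the caller is responsible for cleanup.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        adjusted.export(tmp.name, format="mp3")
    return tmp.name


# Example: path = asyncio.run(tts_to_file("Hello there", voice_volume=1.2))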
@@ -89,21 +85,23 @@ async def chat_with_ai(message):
        response_text = response.choices[0].message['content']
        chat_history.append({"role": "assistant", "content": response_text})

-
+        audio_path = await text_to_speech_stream(response_text)
+
+        return response_text, audio_path
    except Exception as e:
        logging.error(f"Error in chat_with_ai: {e}")
-        return str(e)
+        return str(e), None

def transcribe_and_chat(audio):
    if audio is None:
-        return "Sorry, no audio was provided. Please try recording again."
+        return "Sorry, no audio was provided. Please try recording again.", None

    text = whisper_speech_to_text(audio)
    if not text:
-        return "Sorry, I couldn't understand the audio or there was an error in transcription. Please try again."
+        return "Sorry, I couldn't understand the audio or there was an error in transcription. Please try again.", None

-    response = asyncio.run(chat_with_ai(text))
-    return response
+    response, audio_path = asyncio.run(chat_with_ai(text))
+    return response, audio_path

def create_demo():
    with gr.Blocks(css="""
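Note: with this change chat_with_ai and transcribe_and_chat return a (text, audio_path) pair on both the success and error paths, so callers can always unpack two values. A small illustrative fragment of that convention (recorded_file is a hypothetical variable, not from the app):

# Both paths now return two values; audio_path is None when no speech was synthesized.
response, audio_path = transcribe_and_chat(recorded_file)  # recorded_file: filepath from gr.Audio
if audio_path is None:
    logging.warning(f"No speech generated; message was: {response}")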
@@ -139,7 +137,7 @@ def create_demo():

        with gr.Column(scale=1):
            chat_output = gr.Textbox(label="💬 AI Response", elem_id="chat-output", lines=5, interactive=False)
-            audio_output = gr.Audio(label="🔊 AI Voice Response", autoplay=True, elem_id="audio-output"
+            audio_output = gr.Audio(label="🔊 AI Voice Response", autoplay=True, elem_id="audio-output")

        # Add some spacing and a divider
        gr.Markdown("<hr style='border: 1px solid #8ecae6;'/>")
@@ -149,27 +147,13 @@ def create_demo():
            logging.info(f"Received audio: {audio}")
            if audio is None:
                return "No audio detected. Please try recording again.", None
-            response = transcribe_and_chat(audio)
-
-
-
-
-            def generate_audio():
-                for chunk in asyncio.run(text_to_speech_stream(response, volume)):
-                    yield chunk
-
-            return gr.Audio(generate_audio(), streaming=True)
-
-        audio_input.change(
-            process_audio,
-            inputs=[audio_input, voice_volume],
-            outputs=[chat_output, audio_output]
-        ).then(
-            stream_audio,
-            inputs=[chat_output, voice_volume],
-            outputs=audio_output
-        )
+            response, audio_path = transcribe_and_chat(audio)
+            # Adjust volume for the response audio
+            adjusted_audio_path = asyncio.run(text_to_speech_stream(response, volume))
+            logging.info(f"Response: {response}, Audio path: {adjusted_audio_path}")
+            return response, adjusted_audio_path

+        audio_input.change(process_audio, inputs=[audio_input, voice_volume], outputs=[chat_output, audio_output])
        clear_button.click(lambda: (None, None), None, [chat_output, audio_output])

        # JavaScript to handle autoplay, automatic submission, and auto-listen
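Note: the event wiring is reduced to a single .change() callback whose two return values feed the Textbox and the Audio component; the earlier .then(stream_audio, ...) streaming chain is removed. A self-contained sketch of that wiring, assuming Gradio 4.x; the stub handler below is illustrative and does not reproduce the app's transcription or chat logic:

import gradio as gr


def process_audio(audio_path, voice_volume):
    # Stub handler (illustrative only): the real app transcribes the recording,
    # queries the chat model, and synthesizes a spoken reply before returning.
    return f"Received {audio_path} at volume {voice_volume}", audio_path


with gr.Blocks() as demo:
    audio_input = gr.Audio(type="filepath", label="Speak")  # microphone/upload input
    voice_volume = gr.Slider(0.0, 2.0, value=1.0, label="Voice volume")
    chat_output = gr.Textbox(label="AI Response")
    audio_output = gr.Audio(label="AI Voice Response", autoplay=True)

    # One callback, two outputs: the returned (text, filepath) pair is routed
    # to the Textbox and the Audio component respectively.
    audio_input.change(process_audio, inputs=[audio_input, voice_volume],
                       outputs=[chat_output, audio_output])

demo.launch()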