Update app.py
app.py CHANGED
@@ -1,29 +1,26 @@
 import gradio as gr
 import asyncio
 import edge_tts
-import speech_recognition as sr
-from pydub import AudioSegment
-from pydub.playback import play
 import os
 from huggingface_hub import InferenceClient
 import whisper
 import torch
-from io import BytesIO
 import tempfile
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
 
 # Get the Hugging Face token from environment variable
-hf_token = os.
+hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
     raise ValueError("HF_TOKEN environment variable is not set")
 
 # Initialize the Hugging Face Inference Client
-client = InferenceClient(
-    "mistralai/Mistral-Nemo-Instruct-2407",
-    token=hf_token
-)
+client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407", token=hf_token)
 
 # Load the Whisper model
-whisper_model = whisper.load_model("tiny.en", device='cpu')
+whisper_model = whisper.load_model("tiny.en", device='cuda' if torch.cuda.is_available() else 'cpu')
 
 # Initialize an empty chat history
 chat_history = []
@@ -37,7 +34,6 @@ async def text_to_speech_stream(text):
         if chunk["type"] == "audio":
             audio_data += chunk["data"]
 
-    # Save the audio data to a temporary file
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
         temp_file.write(audio_data)
         return temp_file.name
@@ -46,23 +42,20 @@ def whisper_speech_to_text(audio):
     """Convert speech to text using Whisper model."""
     try:
         result = whisper_model.transcribe(audio)
-
-        return text
+        return result['text']
     except Exception as e:
         print(f"Whisper Error: {e}")
         return None
     finally:
-
-
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
 
-async def chat_with_ai(message, history):
+async def chat_with_ai(message):
     global chat_history
 
-    # Add user message to chat history
     chat_history.append({"role": "user", "content": message})
 
     try:
-        # Send chat completion request
         response = client.chat_completion(
             messages=[{"role": "system", "content": "You are a helpful voice assistant. Provide concise and clear responses to user queries."}] + chat_history,
             max_tokens=800,
@@ -70,11 +63,8 @@ async def chat_with_ai(message, history):
         )
 
         response_text = response.choices[0].message['content']
-
-        # Add assistant's response to chat history
         chat_history.append({"role": "assistant", "content": response_text})
 
-        # Generate speech for the response
        audio_path = await text_to_speech_stream(response_text)
 
         return response_text, audio_path
@@ -83,52 +73,47 @@ async def chat_with_ai(message, history):
         return str(e), None
 
 def transcribe_and_chat(audio):
-    # Transcribe audio to text
     text = whisper_speech_to_text(audio)
     if text is None:
         return "Sorry, I couldn't understand the audio.", None
 
-
-    response, audio_path = asyncio.run(chat_with_ai(text, []))
+    response, audio_path = asyncio.run(chat_with_ai(text))
     return response, audio_path
 
 # Define the Gradio interface
-[old Gradio interface definition and its inline JavaScript are truncated in this view]
-    audio_button.click(transcribe_and_chat, inputs=audio_input, outputs=[chat_output, audio_output])
-    text_button.click(lambda x: asyncio.run(chat_with_ai(x, [])), inputs=text_input, outputs=[chat_output, audio_output])
+def create_demo():
+    with gr.Blocks() as demo:
+        gr.Markdown("# AI Voice Assistant")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                audio_input = gr.Audio(source="microphone", type="filepath", label="Press 'Record' to Speak")
+
+            with gr.Column(scale=1):
+                chat_output = gr.Textbox(label="AI Response")
+                audio_output = gr.Audio(label="AI Voice Response")
+
+        demo.load(None, js="""
+        function() {
+            document.querySelector("audio").addEventListener("stop", function() {
+                setTimeout(function() {
+                    document.querySelector('button[title="Submit"]').click();
+                }, 500);
+            });
+            document.addEventListener('gradioAudioLoaded', function(event) {
+                var audioElement = document.querySelector('audio');
+                if (audioElement) {
+                    audioElement.play();
+                }
+            });
+        }
+        """)
+
+        audio_input.change(transcribe_and_chat, inputs=audio_input, outputs=[chat_output, audio_output])
+
+    return demo
 
 # Launch the Gradio app
 if __name__ == "__main__":
-    demo
+    demo = create_demo()
+    demo.launch(server_name="0.0.0.0", server_port=7860)