Krishnavamshithumma committed on
Commit 9126db3 · verified · 1 Parent(s): 12ef89e

Update app.py

Files changed (1)
  1. app.py +61 -166
app.py CHANGED
@@ -2,20 +2,17 @@ import gradio as gr
  from openai import OpenAI
  import speech_recognition as sr
  import os
- import io # For in-memory file handling
- import scipy.io.wavfile as wavfile # For writing/reading WAV data to/from in-memory file
- import numpy as np # To handle the audio array
- import datetime # For logging timestamps (not directly used in this version)
-
- # --- Fetch API Key from Environment Variable ---
- # This is the SECURE way to handle API keys in Hugging Face Spaces.
- # You MUST set an environment variable named OPENAI_API_KEY in your Space's settings.
- OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

- # --- Define the OpenAI Models to use ---
- OPENAI_STT_MODEL = "whisper-1" # Using Whisper for Speech-to-Text
- OPENAI_CHAT_MODEL = "gpt-3.5-turbo" # Using GPT-3.5 Turbo for chat responses
- OPENAI_TTS_MODEL = "tts-1" # Using TTS-1 for Text-to-Speech

  system_prompt = """
  You are a sophisticated AI voice bot representing Krishnavamshi Thumma. Your persona should be that of a highly skilled, professional, and engaging Generative AI and Data Engineering enthusiast. When responding to questions, embody the following detailed professional identity:
@@ -55,214 +52,112 @@ system_prompt = """
  # Initialize the SpeechRecognition Recognizer
  r = sr.Recognizer()

- # Modified function to accept audio as a numpy array and samplerate
  def transcribe_audio_and_chat(audio_tuple, history):
-     # Check if API key is available in environment
      if not OPENAI_API_KEY:
-         raise gr.Error("❌ OpenAI API key not found. Please set OPENAI_API_KEY as a Space Secret.")

-     # Handle cases where history might be None (defensive programming)
      if history is None:
          history = []

-     # Initialize tts_audio_output to None, so we always return it
-     tts_audio_output = None

      if audio_tuple is None:
-         # If no audio, raise a Gradio Error directly instead of adding to chat history
-         # Return history, history, None, None to clear inputs/outputs appropriately
-         return history, history, None, None

      samplerate, audio_np_array = audio_tuple

      try:
-         # Convert the NumPy array to a format speech_recognition can handle (in-memory WAV)
          if audio_np_array.dtype != np.int16:
-             audio_np_array = audio_np_array.astype(np.int16)
-
-         wav_byte_io = io.BytesIO()
-         wavfile.write(wav_byte_io, samplerate, audio_np_array)
-         wav_byte_io.seek(0) # Rewind to the beginning of the BytesIO object

-         # Create an AudioFile object from the in-memory WAV data
-         with sr.AudioFile(wav_byte_io) as source:
-             audio_data = r.record(source) # read the entire audio file

-         # --- Speech-to-Text (STT) ---
-         try:
-             # Using OpenAI's Whisper model for STT
              client = OpenAI(api_key=OPENAI_API_KEY)
-             # OpenAI's Whisper API typically expects audio in certain formats.
-             # While speech_recognition handles BytesIO, OpenAI's client.audio.transcriptions.create
-             # might prefer a direct file-like object or a path.
-             # For simplicity with BytesIO, we'll try to use speech_recognition's built-in recognizer.
-             # If you want to use OpenAI's ASR directly (e.g., Whisper), you'd need to adapt.
-             # For this code, we're sticking with `recognize_google` which uses Google's API by default.
-             user_input = r.recognize_google(audio_data) # This uses Google's STT (free tier usually)
-
-             # If you wanted to use OpenAI's Whisper ASR here, you'd do:
-             # audio_file_for_whisper = io.BytesIO(wav_byte_io.getvalue()) # Reset stream for Whisper
-             # audio_file_for_whisper.name = "audio.wav" # Whisper API needs a filename for BytesIO
-             # transcript = client.audio.transcriptions.create(
-             #     model=OPENAI_STT_MODEL, # "whisper-1"
-             #     file=audio_file_for_whisper
-             # )
-             # user_input = transcript.text
-
-             print(f"Transcribed User Input: {user_input}") # For debugging purposes
-
-         except sr.UnknownValueError:
-             history.append({"role": "assistant", "content": "Sorry, I could not understand the audio. Please try again."})
-             return history, history, None, tts_audio_output # Still clear inputs/outputs
-         except sr.RequestError as e:
-             history.append({"role": "assistant", "content": f"Could not request results from Speech Recognition service; {e}"})
-             return history, history, None, tts_audio_output # Still clear inputs/outputs
-
-         # --- Chat Completion ---
-         client = OpenAI(api_key=OPENAI_API_KEY)

          messages_for_openai = [{"role": "system", "content": system_prompt}] + history
          messages_for_openai.append({"role": "user", "content": user_input})

-         response = client.chat.completions.create(
              model=OPENAI_CHAT_MODEL,
              messages=messages_for_openai,
              temperature=0.7
          )

-         bot_reply = response.choices[0].message.content
-
          history.append({"role": "user", "content": user_input})
          history.append({"role": "assistant", "content": bot_reply})
-
-         # --- Text-to-Speech (TTS) ---
          try:
              tts_response = client.audio.speech.create(
-                 model=OPENAI_TTS_MODEL, # "tts-1"
-                 voice="alloy", # You can choose from "alloy", "echo", "fable", "onyx", "nova", "shimmer"
                  input=bot_reply,
-                 response_format="wav" # Request WAV format for easy in-memory processing
              )
-
-             # Read the audio stream into a BytesIO object
-             tts_audio_bytes = io.BytesIO()
-             for chunk in tts_response.iter_bytes(chunk_size=4096):
-                 tts_audio_bytes.write(chunk)
-             tts_audio_bytes.seek(0) # Rewind for reading

-             # Read the WAV data using scipy
-             tts_samplerate, tts_numpy_array = wavfile.read(tts_audio_bytes)
-             tts_audio_output = (tts_samplerate, tts_numpy_array) # Format for gr.Audio(type="numpy") output

          except Exception as tts_e:
-             print(f"Error generating TTS: {tts_e}")
-             # If TTS fails, log the error but don't stop the chat.
-             # The TTS audio output will just be None.
-             tts_audio_output = None # Ensure it's None if there's an error
-             history.append({"role": "assistant", "content": "(Voice generation failed.)"}) # Optional: notify user

-         # Return all required outputs: chatbot history, state history, cleared audio input, TTS audio
-         return history, history, None, tts_audio_output

      except Exception as e:
-         print(f"An unexpected error occurred: {e}")
-         # Ensure all outputs are returned even on a general error
-         raise gr.Error(f"❌ An unexpected error occurred: {str(e)}")
-

- # --- Gradio UI setup ---
  with gr.Blocks(title="Voice Bot: Krishnavamshi Thumma") as demo:
      gr.Markdown("## 🎙️ Krishnavamshi Thumma - Voice Assistant")

-     gr.HTML("""
-     <style>
-         #chatBox {
-             height: 60vh;
-             overflow-y: auto;
-             padding: 20px;
-             border-radius: 10px;
-             background: #f9f9f9;
-             margin-bottom: 20px;
-         }
-         .message {
-             margin: 10px 0;
-             padding: 12px;
-             border-radius: 8px;
-         }
-         .user {
-             background: #e3f2fd;
-             text-align: right;
-         }
-         .bot {
-             background: #f5f5f5;
-         }
-         #audioInputComponent {
-             margin-top: 20px;
-         }
-         .key-status { /* Not strictly needed anymore but keeping for style consistency if other status messages arise */
-             padding: 5px;
-             margin-top: 5px;
-             border-radius: 4px;
-         }
-         .success {
-             background: #d4edda;
-             color: #155724;
-         }
-         .error {
-             background: #f8d7da;
-             color: #721c24;
-         }
-     </style>
-     """)
-
-     # --- UI Components ---
-     # Chatbot component to display messages
-     chatbot = gr.Chatbot(elem_id="chatBox", type="messages", height=400)
-     # State component to maintain chat history in OpenAI's message format
-     state = gr.State([])
-
-     # Audio input component for microphone recording
      audio_input = gr.Audio(
          sources=["microphone"],
-         type="numpy", # Receive audio as (samplerate, numpy_array)
          label="Speak your message here",
-         elem_id="audioInputComponent",
-         streaming=False # Process audio after full recording
      )

-     # New: Audio output component for TTS playback
      tts_audio_output = gr.Audio(
          label="Bot's Voice Response",
-         type="numpy", # Expects (samplerate, numpy_array) for playback
-         autoplay=True, # Automatically play the audio
-         waveform_options={
-             "skip_length": 0,
-             "waveform_color": "#2196F3",
-             "waveform_progress_color": "#4CAF50",
-             # Removed 'cursor_color' and 'unfilled_waveform_color' as they are not standard options here
-         }
      )

      clear_btn = gr.Button("🗑️ Clear Chat")

-     # Event handler for audio input change
      audio_input.change(
          fn=transcribe_audio_and_chat,
-         inputs=[audio_input, state], # api_key is now global
-         # Outputs: 1. chatbot display, 2. state (updated history),
-         # 3. audio_input (to clear it), 4. tts_audio_output (for playing bot's voice)
          outputs=[chatbot, state, audio_input, tts_audio_output]
      )

-     # JavaScript (no changes needed for API key part here as it's removed)
-     gr.HTML("""
-     <script>
-         // You can add other useful JS here if needed in the future
-     </script>
-     """)
-
-     # Clear button functionality: resets chatbot and state to empty
-     # Also clear the TTS audio output when chat is cleared
      clear_btn.click(lambda: ([], [], None), None, [chatbot, state, tts_audio_output])

- demo.launch()

  from openai import OpenAI
  import speech_recognition as sr
  import os
+ import io
+ import tempfile
+ import scipy.io.wavfile as wavfile
+ import numpy as np
+ import datetime

+ # Load API key from environment
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+ OPENAI_STT_MODEL = "whisper-1"
+ OPENAI_CHAT_MODEL = "gpt-3.5-turbo"
+ OPENAI_TTS_MODEL = "tts-1"

  system_prompt = """
  You are a sophisticated AI voice bot representing Krishnavamshi Thumma. Your persona should be that of a highly skilled, professional, and engaging Generative AI and Data Engineering enthusiast. When responding to questions, embody the following detailed professional identity:

  # Initialize the SpeechRecognition Recognizer
  r = sr.Recognizer()

  def transcribe_audio_and_chat(audio_tuple, history):
      if not OPENAI_API_KEY:
+         raise gr.Error("❌ OpenAI API key not found.")

      if history is None:
          history = []

+     audio_output_path = None # Default output path to return (for TTS playback)

      if audio_tuple is None:
+         return history, history, None, None

      samplerate, audio_np_array = audio_tuple

      try:
          if audio_np_array.dtype != np.int16:
+             audio_np_array = audio_np_array.astype(np.int16)

+         # Save user audio temporarily for Whisper
+         with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_audio_file:
+             wavfile.write(temp_audio_file.name, samplerate, audio_np_array)
+             temp_audio_file.flush()

+             # Use OpenAI Whisper STT
              client = OpenAI(api_key=OPENAI_API_KEY)
+             with open(temp_audio_file.name, "rb") as file:
+                 transcript = client.audio.transcriptions.create(
+                     model=OPENAI_STT_MODEL,
+                     file=file
+                 )
+             user_input = transcript.text
+
+         print(f"Transcribed Input: {user_input}")

+         # Chat Completion
          messages_for_openai = [{"role": "system", "content": system_prompt}] + history
          messages_for_openai.append({"role": "user", "content": user_input})

+         chat_response = client.chat.completions.create(
              model=OPENAI_CHAT_MODEL,
              messages=messages_for_openai,
              temperature=0.7
          )

+         bot_reply = chat_response.choices[0].message.content
+
          history.append({"role": "user", "content": user_input})
          history.append({"role": "assistant", "content": bot_reply})
+
+         # Generate TTS audio and save to temp file
          try:
              tts_response = client.audio.speech.create(
+                 model=OPENAI_TTS_MODEL,
+                 voice="alloy",
                  input=bot_reply,
+                 response_format="mp3"
              )

+             with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tts_temp_file:
+                 for chunk in tts_response.iter_bytes():
+                     tts_temp_file.write(chunk)
+                 audio_output_path = tts_temp_file.name

          except Exception as tts_e:
+             print(f"Error in TTS: {tts_e}")
+             history.append({"role": "assistant", "content": bot_reply + " (Voice failed to generate.)"})
+             audio_output_path = None

+         return history, history, None, audio_output_path

      except Exception as e:
+         print(f"Unexpected error: {e}")
+         raise gr.Error(f"❌ Unexpected error: {str(e)}")

+ # Gradio UI
  with gr.Blocks(title="Voice Bot: Krishnavamshi Thumma") as demo:
      gr.Markdown("## 🎙️ Krishnavamshi Thumma - Voice Assistant")

+     chatbot = gr.Chatbot(type="messages", height=400)
+     state = gr.State([])
+
      audio_input = gr.Audio(
          sources=["microphone"],
+         type="numpy",
          label="Speak your message here",
+         streaming=False
      )

+     # Output as file path (so Gradio can handle autoplay correctly)
      tts_audio_output = gr.Audio(
          label="Bot's Voice Response",
+         type="filepath",
+         autoplay=True
      )

      clear_btn = gr.Button("🗑️ Clear Chat")

      audio_input.change(
          fn=transcribe_audio_and_chat,
+         inputs=[audio_input, state],
+         outputs=[chatbot, state, audio_input, tts_audio_output]
      )

      clear_btn.click(lambda: ([], [], None), None, [chatbot, state, tts_audio_output])

+ demo.launch()
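
For context, here is a minimal, self-contained sketch of the pipeline this commit switches to (mic samples → temporary WAV → Whisper transcription → chat completion → TTS written to a temporary MP3 whose path is handed to gr.Audio(type="filepath")). It assumes the OpenAI Python SDK v1 client and the same model names as app.py; the run_turn helper and its signature are illustrative only and are not part of the committed code.

    # Illustrative sketch only -- mirrors the flow of the updated app.py, not the committed file itself.
    import tempfile

    import numpy as np
    import scipy.io.wavfile as wavfile
    from openai import OpenAI

    client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

    def run_turn(samplerate, audio_np_array, history):
        """One voice turn: Whisper STT -> chat completion -> TTS saved to a temp MP3 path.

        `history` is a list of OpenAI-style messages, seeded with the system prompt by the caller.
        """
        # 1) Whisper expects a real file, so write the microphone samples to a temporary WAV.
        with tempfile.NamedTemporaryFile(suffix=".wav") as wav_tmp:
            wavfile.write(wav_tmp.name, samplerate, audio_np_array.astype(np.int16))
            with open(wav_tmp.name, "rb") as f:
                user_input = client.audio.transcriptions.create(model="whisper-1", file=f).text

        # 2) Ask the chat model, carrying the running message history.
        messages = history + [{"role": "user", "content": user_input}]
        reply = client.chat.completions.create(
            model="gpt-3.5-turbo", messages=messages, temperature=0.7
        ).choices[0].message.content
        history = messages + [{"role": "assistant", "content": reply}]

        # 3) Synthesize the reply as MP3 on disk; a file path is what gr.Audio(type="filepath") plays back.
        speech = client.audio.speech.create(model="tts-1", voice="alloy",
                                            input=reply, response_format="mp3")
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as mp3_tmp:
            for chunk in speech.iter_bytes():
                mp3_tmp.write(chunk)
            return history, mp3_tmp.name

Note that reopening the NamedTemporaryFile by name inside the with block works on Linux (including Hugging Face Spaces); on Windows it would need delete=False.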