Update app.py
app.py (changed)
@@ -75,7 +75,7 @@ def load_models():
 
     print("🚀 Loading Maya AI models...")
 
-    # Load ASR model (Whisper)
+    # Load ASR model (Whisper)
     print("🎤 Loading Whisper for ASR...")
     try:
         asr_pipe = pipeline(
@@ -83,7 +83,6 @@ def load_models():
             model="openai/whisper-base",
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
             device=0 if torch.cuda.is_available() else -1
-            # Removed return_timestamps and other problematic parameters
         )
         print("✅ Whisper ASR loaded successfully!")
     except Exception as e:
@@ -125,7 +124,6 @@ def load_models():
         print(f"⚠️ Dia TTS failed to load: {e}")
         tts_model = None
 
-    # Continue without TTS (text-only mode)
     print("⚠️ No TTS available, running in text-only mode")
     tts_type = "none"
     return True
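The fallback path above follows a simple pattern: attempt to load an optional model, and drop to a text-only mode flag when it fails. A minimal sketch of that pattern; the init_tts helper is illustrative and not part of the app:

    # Sketch of the graceful-degradation pattern above: try an optional TTS loader,
    # and fall back to a text-only mode flag on any failure.
    tts_model = None
    tts_type = "none"

    def init_tts(loader):
        """loader is any zero-argument callable that returns a TTS model."""
        global tts_model, tts_type
        try:
            tts_model = loader()
            tts_type = "dia"
        except Exception as exc:
            print(f"TTS unavailable, running text-only: {exc}")
            tts_model = None
            tts_type = "none"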
@@ -134,49 +132,39 @@ def detect_emotion_from_text(text):
     """Enhanced emotion detection from text"""
     text_lower = text.lower()
 
-    # Enhanced emotion keywords with weights
     emotions = {
         'happy': ['happy', 'great', 'awesome', 'wonderful', 'excited', 'laugh', 'amazing',
-                  'fantastic', 'excellent', 'brilliant', 'perfect', 'love', 'joy', 'cheerful',
-                  'delighted', 'thrilled', 'ecstatic'],
+                  'fantastic', 'excellent', 'brilliant', 'perfect', 'love', 'joy', 'cheerful'],
         'sad': ['sad', 'upset', 'disappointed', 'cry', 'terrible', 'awful', 'depressed',
-                'miserable', 'heartbroken', 'devastated', 'gloomy', 'melancholy',
-                'blue', 'sorrowful'],
+                'miserable', 'heartbroken', 'devastated', 'gloomy', 'melancholy'],
         'angry': ['angry', 'mad', 'furious', 'annoyed', 'frustrated', 'hate', 'rage',
-                  'irritated', 'outraged', 'livid', 'enraged'
+                  'irritated', 'outraged', 'livid', 'enraged'],
         'surprised': ['wow', 'incredible', 'surprised', 'unbelievable', 'shocking',
-                      'astonishing', 'remarkable', 'extraordinary', 'mind-blowing',
-
-        'fearful': ['scared', 'afraid', 'terrified', 'worried', 'anxious', 'nervous',
-                    'frightened', 'panic', 'concerned', 'fearful'],
-        'disgusted': ['disgusting', 'gross', 'revolting', 'sick', 'nauseating', 'repulsive',
-                      'awful', 'horrible']
+                      'astonishing', 'remarkable', 'extraordinary', 'mind-blowing'],
+        'neutral': []
     }
 
-    # Count emotion indicators
     emotion_scores = {}
     for emotion, keywords in emotions.items():
        score = sum(1 for keyword in keywords if keyword in text_lower)
        if score > 0:
            emotion_scores[emotion] = score
 
-    # Return the emotion with highest score, or neutral if none found
     if emotion_scores:
         return max(emotion_scores, key=emotion_scores.get)
     return 'neutral'
 
 def speech_to_text_with_emotion(audio_input):
-    """
+    """Enhanced STT with proper audio processing"""
     try:
         if audio_input is None:
             return "", "neutral"
 
         print("🎤 Processing audio input...")
 
-        # Process audio input with enhanced handling
         if isinstance(audio_input, tuple):
             sample_rate, audio_data = audio_input
-            print(f"Audio input: sample_rate={sample_rate}, shape={audio_data.shape}
+            print(f"Audio input: sample_rate={sample_rate}, shape={audio_data.shape}")
 
             # Handle different audio formats
             if audio_data.dtype == np.int16:
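The emotion detector being trimmed here is a plain keyword-scoring function. A self-contained sketch of the same idea, using shortened illustrative keyword lists rather than the app's exact ones:

    # Minimal sketch of the keyword-scoring approach used above (illustrative lists).
    EMOTION_KEYWORDS = {
        "happy": ["happy", "great", "awesome", "love", "joy"],
        "sad": ["sad", "upset", "terrible", "depressed"],
        "angry": ["angry", "mad", "furious", "annoyed"],
        "surprised": ["wow", "incredible", "unbelievable"],
    }

    def score_emotion(text: str) -> str:
        """Count keyword hits per emotion and return the best match, or 'neutral'."""
        text_lower = text.lower()
        scores = {
            emotion: sum(1 for kw in keywords if kw in text_lower)
            for emotion, keywords in EMOTION_KEYWORDS.items()
        }
        best = max(scores, key=scores.get)
        return best if scores[best] > 0 else "neutral"

    print(score_emotion("Wow, that is incredible news!"))  # -> "surprised"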
@@ -193,29 +181,25 @@ def speech_to_text_with_emotion(audio_input):
             audio_data = audio_input
             sample_rate = 16000
 
-        # Validate audio
-        if len(audio_data) < 1600:
+        # Validate audio
+        if len(audio_data) < 1600:
             return "Audio too short, please speak for at least 1 second", "neutral"
 
-        # Check for silence (audio with very low amplitude)
         max_amplitude = np.max(np.abs(audio_data))
-        if max_amplitude < 0.01:
+        if max_amplitude < 0.01:
             return "Audio too quiet, please speak louder", "neutral"
 
         # Normalize audio
         if max_amplitude > 0:
             audio_data = audio_data / max_amplitude * 0.95
 
-        # Resample to 16kHz if needed
+        # Resample to 16kHz if needed
         if sample_rate != 16000:
             print(f"Resampling from {sample_rate}Hz to 16000Hz...")
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
 
-        print(f"Final audio: length={len(audio_data)}, max_amplitude={np.max(np.abs(audio_data)):.3f}")
-
-        # FIXED: Call ASR pipeline without sampling_rate parameter
         print("🔄 Running Whisper ASR...")
-        result = asr_pipe(audio_data)
+        result = asr_pipe(audio_data)
 
         transcription = result['text'].strip()
         print(f"Transcription: '{transcription}'")
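The change above hands a bare NumPy array to the Whisper pipeline, which assumes the audio is already float32 and resampled to 16 kHz. A minimal sketch of that preprocessing path, using the openai/whisper-base checkpoint named in the diff; the transcribe helper is illustrative:

    # Sketch of the preprocessing assumed above: int16 -> float32, normalize,
    # resample to 16 kHz, then pass the raw array straight to the ASR pipeline.
    import numpy as np
    import librosa
    from transformers import pipeline

    asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")

    def transcribe(sample_rate: int, audio: np.ndarray) -> str:
        if audio.dtype == np.int16:
            audio = audio.astype(np.float32) / 32768.0   # int16 PCM -> [-1, 1] floats
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak * 0.95                  # leave headroom, avoid clipping
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
        # A bare float32 array at 16 kHz can be passed directly to the pipeline.
        return asr(audio)["text"].strip()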
@@ -223,7 +207,6 @@ def speech_to_text_with_emotion(audio_input):
         if not transcription or len(transcription) < 2:
             return "No clear speech detected, please try speaking more clearly", "neutral"
 
-        # Detect emotion from transcription
         emotion = detect_emotion_from_text(transcription)
         print(f"Detected emotion: {emotion}")
 
@@ -234,39 +217,30 @@ def speech_to_text_with_emotion(audio_input):
         return "Sorry, I couldn't understand that. Please try again.", "neutral"
 
 def generate_contextual_response(user_input, emotion, conversation_manager):
-    """Enhanced response generation
+    """Enhanced response generation"""
     try:
         context = conversation_manager.get_context()
 
-        # Enhanced emotional response styles
         emotional_prompts = {
-            "happy": "Respond with genuine enthusiasm and joy. Use positive language
-            "sad": "Respond with
-            "angry": "Respond calmly and try to help. Be patient
-            "surprised": "Share in their surprise and show curiosity. Be engaging
-            "
-            "disgusted": "Respond with understanding while being helpful. Acknowledge their feelings and try to redirect positively.",
-            "neutral": "Respond naturally and conversationally. Be helpful, friendly, and engaging."
+            "happy": "Respond with genuine enthusiasm and joy. Use positive language and show excitement.",
+            "sad": "Respond with empathy and comfort. Be gentle and understanding.",
+            "angry": "Respond calmly and try to help. Be patient and de-escalate.",
+            "surprised": "Share in their surprise and show curiosity. Be engaging.",
+            "neutral": "Respond naturally and conversationally. Be helpful and friendly."
         }
 
-        system_prompt = f"""You are Maya, a
+        system_prompt = f"""You are Maya, a friendly AI assistant with emotional intelligence.
 
 {emotional_prompts.get(emotion, emotional_prompts['neutral'])}
 
-Previous
-{
-
-Current user emotion detected: {emotion}
+Previous context: {context}
+User emotion: {emotion}
 
 Guidelines:
-- Keep responses concise
-- Match the user's emotional tone appropriately
+- Keep responses concise (1-2 sentences)
 - Be natural and conversational
-- Show
-- Provide helpful
-- Use natural speech patterns
-- If they seem distressed, offer support
-- If they're happy, celebrate with them
+- Show empathy and understanding
+- Provide helpful responses
 """
 
         messages = [
@@ -274,11 +248,8 @@ Guidelines:
             {"role": "user", "content": user_input}
         ]
 
-        # Generate response with Qwen
         text = qwen_tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
+            messages, tokenize=False, add_generation_prompt=True
         )
 
         model_inputs = qwen_tokenizer([text], return_tensors="pt")
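The collapsed call above is part of the usual chat-template flow for an instruct model. A compact sketch of that flow, assuming the Qwen2.5-1.5B-Instruct checkpoint mentioned in the page's feature list; the generation settings are illustrative, not the app's:

    # Sketch of the chat-template flow condensed above (model name per the feature
    # list on this page; generation settings are illustrative).
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "Qwen/Qwen2.5-1.5B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")

    messages = [
        {"role": "system", "content": "You are Maya, a friendly AI assistant."},
        {"role": "user", "content": "How are you today?"},
    ]
    # Render the chat into the model's prompt format, then tokenize.
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=64)

    # Strip the prompt tokens so only the newly generated reply is decoded.
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    print(tokenizer.decode(new_tokens, skip_special_tokens=True).strip())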
@@ -301,9 +272,8 @@ Guidelines:
         ]
 
         response = qwen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        # Clean up response
         response = response.strip()
+
         if response.startswith("Maya:"):
             response = response[5:].strip()
 
@@ -311,10 +281,10 @@ Guidelines:
 
     except Exception as e:
         print(f"Error in response generation: {e}")
-        return "I'm sorry, I'm having trouble processing that right now.
+        return "I'm sorry, I'm having trouble processing that right now."
 
 def text_to_speech_emotional(text, emotion="neutral"):
-    """
+    """FIXED TTS with proper audio format for Gradio"""
     try:
         if tts_model is None:
             print(f"🔊 Maya says ({emotion}): {text}")
@@ -325,21 +295,18 @@ def text_to_speech_emotional(text, emotion="neutral"):
             torch.cuda.empty_cache()
 
         if tts_type == "dia":
-            # Dia TTS with enhanced emotional markers
             emotional_markers = {
                 "happy": "(excited) ",
                 "sad": "(sad) ",
-                "angry": "(calm) ",
+                "angry": "(calm) ",
                 "surprised": "(surprised) ",
-                "fearful": "(reassuring) ",
-                "disgusted": "(understanding) ",
                 "neutral": ""
             }
 
-            # Enhanced text
+            # Enhanced text for Dia
             enhanced_text = f"[S1] {emotional_markers.get(emotion, '')}{text}"
 
-            # Add
+            # Add pauses for natural speech
             if len(text) > 50:
                 enhanced_text = enhanced_text.replace(". ", ". (pause) ")
                 enhanced_text = enhanced_text.replace("! ", "! (pause) ")
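The markers kept here are plain text prefixes spliced into the prompt before synthesis. A small sketch of that text-preparation step, using only the marker strings and speaker tag visible in the diff:

    # Sketch of the text preparation above: speaker tag + emotion marker, with
    # "(pause)" hints inserted into longer text for more natural pacing.
    EMOTION_MARKERS = {
        "happy": "(excited) ",
        "sad": "(sad) ",
        "angry": "(calm) ",
        "surprised": "(surprised) ",
        "neutral": "",
    }

    def prepare_tts_text(text: str, emotion: str = "neutral") -> str:
        marker = EMOTION_MARKERS.get(emotion, "")
        enhanced = f"[S1] {marker}{text}"
        if len(text) > 50:
            enhanced = enhanced.replace(". ", ". (pause) ").replace("! ", "! (pause) ")
        return enhanced

    print(prepare_tts_text("Hello there! I have great news for you today, my friend.", "happy"))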
@@ -354,25 +321,34 @@ def text_to_speech_emotional(text, emotion="neutral"):
                 verbose=False
             )
 
-            #
+            # FIXED: Proper audio processing for Gradio
             if isinstance(audio_output, torch.Tensor):
                 audio_output = audio_output.cpu().numpy()
 
-            #
+            # Ensure audio is in the right format
+            if len(audio_output.shape) > 1:
+                audio_output = audio_output.squeeze()
+
+            # Normalize audio properly
             if len(audio_output) > 0:
                 max_val = np.max(np.abs(audio_output))
-                if max_val >
+                if max_val > 0:
                     audio_output = audio_output / max_val * 0.95
 
+            # CRITICAL FIX: Ensure audio is float32 and in correct range
+            audio_output = audio_output.astype(np.float32)
+
+            print(f"✅ Generated audio: shape={audio_output.shape}, dtype={audio_output.dtype}, range=[{audio_output.min():.3f}, {audio_output.max():.3f}]")
+
+            # Return in format Gradio expects: (sample_rate, audio_array)
             return (44100, audio_output)
 
         else:
-            # Text-only mode
             print(f"🔊 Maya says ({emotion}): {text}")
             return None
 
     except Exception as e:
-        print(f"Error in TTS: {e}")
+        print(f"❌ Error in TTS: {e}")
         print(f"🔊 Maya says ({emotion}): {text}")
         return None
 
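The fixes above target the return format Gradio expects from a numpy-typed audio component: a (sample_rate, one-dimensional float32 array) tuple. A minimal sketch with a generated tone standing in for TTS output; the 44100 Hz rate matches the diff:

    # Sketch of the return format the change above targets: gr.Audio(type="numpy")
    # accepts a (sample_rate, 1-D float32 array) tuple. The sine tone is a stand-in.
    import numpy as np
    import gradio as gr

    def make_tone():
        sr = 44100
        t = np.linspace(0, 1.0, sr, endpoint=False)
        audio = 0.5 * np.sin(2 * np.pi * 440 * t)      # 1 second of A440
        audio = audio.astype(np.float32)               # float32, roughly in [-1, 1]
        return (sr, audio)

    demo = gr.Interface(fn=make_tone, inputs=None, outputs=gr.Audio(type="numpy"))
    # demo.launch()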
@@ -382,53 +358,53 @@ conv_manager = ConversationManager()
 def start_call():
     """Initialize call and return greeting"""
     conv_manager.clear()
-    greeting_text = "Hello! I'm Maya, your AI assistant.
+    greeting_text = "Hello! I'm Maya, your AI assistant. How can I help you today?"
     greeting_audio = text_to_speech_emotional(greeting_text, "happy")
 
     tts_status = f"Using {tts_type.upper()} TTS" if tts_type != "none" else "Text-only mode"
-    return greeting_audio, greeting_text, f"📞 Call started! Maya is ready
+    return greeting_audio, greeting_text, f"📞 Call started! Maya is ready. {tts_status}"
 
 def process_conversation(audio_input):
-    """
+    """Main conversation processing pipeline"""
     if audio_input is None:
         return None, "Please record some audio first.", "", "❌ No audio input received."
 
     try:
         print("🔄 Processing conversation...")
 
-        #
+        # STT + Emotion Detection
         user_text, emotion = speech_to_text_with_emotion(audio_input)
 
-        # Check for
+        # Check for STT errors
         error_phrases = ["audio too short", "audio too quiet", "no clear speech", "sorry", "couldn't understand"]
         if any(phrase in user_text.lower() for phrase in error_phrases):
             return None, user_text, "", f"❌ STT Issue: {user_text}"
 
         if not user_text or user_text.strip() == "":
-            return None, "I didn't catch that
+            return None, "I didn't catch that. Please speak louder and closer to the microphone.", "", "❌ No speech detected."
 
-        #
+        # Generate response
         ai_response = generate_contextual_response(user_text, emotion, conv_manager)
 
-        #
+        # Convert to speech
         response_audio = text_to_speech_emotional(ai_response, emotion)
 
-        #
+        # Update history
         conv_manager.add_exchange(user_text, ai_response, emotion)
 
-        status = f"✅
+        status = f"✅ Success! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5 | TTS: {tts_type.upper()}"
 
         return response_audio, ai_response, user_text, status
 
     except Exception as e:
-        error_msg = f"❌ Error
+        error_msg = f"❌ Error: {str(e)}"
         print(error_msg)
         return None, "I'm sorry, I encountered an error. Please try again.", "", error_msg
 
 def get_conversation_history():
-    """Return
+    """Return conversation history"""
     if not conv_manager.history:
-        return "No conversation history yet. Start a call to begin
+        return "No conversation history yet. Start a call to begin!"
 
     history_text = "📝 **Conversation History:**\n\n"
     for i, exchange in enumerate(conv_manager.history, 1):
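The handlers defined above are wired to the UI through Blocks click events, where a function's return values are mapped in order onto the listed output components. A stripped-down sketch of that wiring with illustrative component names:

    # Sketch of the Blocks wiring pattern used by the interface code: a click
    # handler's return values fill the listed output components, in order.
    import gradio as gr

    def greet():
        return "Hello! I'm Maya.", "Call started."

    with gr.Blocks() as demo:
        start = gr.Button("Start Call")
        reply = gr.Textbox(label="Maya's Response", interactive=False)
        status = gr.Textbox(label="Status", interactive=False)
        start.click(fn=greet, outputs=[reply, status])

    # demo.launch()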
@@ -440,118 +416,84 @@ def get_conversation_history():
     return history_text
 
 def end_call():
-    """End call
-    farewell_text = "Thank you for
+    """End call"""
+    farewell_text = "Thank you for talking with me! Have a wonderful day!"
     farewell_audio = text_to_speech_emotional(farewell_text, "happy")
     conv_manager.clear()
 
-    return farewell_audio, farewell_text, "📞❌ Call ended. Thank you
+    return farewell_audio, farewell_text, "📞❌ Call ended. Thank you!"
 
 def create_interface():
-    """Create
+    """Create Gradio interface with FIXED audio components"""
     with gr.Blocks(
-        title="Maya AI -
-        theme=gr.themes.Soft()
-        css="""
-        .main-header {
-            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-            border-radius: 15px;
-            padding: 25px;
-            text-align: center;
-            margin-bottom: 25px;
-            box-shadow: 0 8px 32px rgba(0,0,0,0.1);
-        }
-        .call-button {
-            background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
-            border: none !important;
-            box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
-        }
-        .process-button {
-            background: linear-gradient(45deg, #45B7D1, #96CEB4) !important;
-            border: none !important;
-            box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
-        }
-        .end-button {
-            background: linear-gradient(45deg, #FFA07A, #FF6347) !important;
-            border: none !important;
-            box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
-        }
-        """
+        title="Maya AI - Speech-to-Speech Assistant",
+        theme=gr.themes.Soft()
     ) as demo:
 
         gr.HTML("""
-        <div
-            <h1 style="color: white; margin: 0; font-size: 2.8em;
+        <div style="text-align: center; padding: 25px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 25px;">
+            <h1 style="color: white; margin: 0; font-size: 2.8em;">🎙️ Maya AI</h1>
             <p style="color: white; margin: 15px 0; font-size: 1.3em;">Advanced Speech-to-Speech Conversational AI</p>
-            <p style="color: #E8E8E8; margin: 0;
+            <p style="color: #E8E8E8; margin: 0;">Natural • Emotional • Contextual • Intelligent</p>
         </div>
         """)
 
         with gr.Row():
             with gr.Column(scale=1):
-
-                gr.
-
-                end_btn = gr.Button("📞❌ End Call", elem_classes="end-button", size="lg")
+                gr.HTML("<h3>📞 Call Controls</h3>")
+                start_btn = gr.Button("📞 Start Call", variant="primary", size="lg")
+                end_btn = gr.Button("📞❌ End Call", variant="secondary", size="lg")
 
-
-                gr.HTML("<h3 style='color: #333; margin: 20px 0 15px 0;'>🎤 Voice Input</h3>")
+                gr.HTML("<h3>🎤 Voice Input</h3>")
                 audio_input = gr.Audio(
-                    label="Record Your Message (Speak clearly for
+                    label="Record Your Message (Speak clearly for 2+ seconds)",
                     sources=["microphone"],
-                    type="numpy"
-                    format="wav"
+                    type="numpy"
                 )
 
-                process_btn = gr.Button("🎯 Process Message",
+                process_btn = gr.Button("🎯 Process Message", variant="primary", size="lg")
 
-                # Status Display
                 status_display = gr.Textbox(
                     label="📊 System Status",
                     interactive=False,
                     lines=3,
-                    value="🎉
+                    value="🎉 Ready! Click 'Start Call' to begin."
                 )
 
             with gr.Column(scale=2):
-
-
+                gr.HTML("<h3>🔊 Maya's Response</h3>")
+                # FIXED: Audio component with proper settings
                 response_audio = gr.Audio(
                     label="Maya's Voice Response",
                     type="numpy",
                     interactive=False,
-                    autoplay=True
+                    autoplay=True,  # Enable autoplay
+                    show_download_button=True,
+                    show_share_button=False
                 )
 
-        # Text Displays
         with gr.Row():
             with gr.Column():
                 user_text_display = gr.Textbox(
                     label="🎤 What You Said",
                     interactive=False,
-                    lines=4
-                    placeholder="Your speech will appear here after processing..."
+                    lines=4
                 )
 
            with gr.Column():
                ai_text_display = gr.Textbox(
                    label="🤖 Maya's Response",
                    interactive=False,
-                    lines=4
-                    placeholder="Maya's response will appear here..."
+                    lines=4
                )
 
-        # Conversation History Section
         with gr.Row():
             with gr.Column():
-                gr.HTML("<h3
-                history_btn = gr.Button("📝 Show
-                history_display = gr.Markdown(
-                    value="No conversation history yet. Start a call to begin chatting with Maya!",
-                    label="Conversation Log"
-                )
+                gr.HTML("<h3>📝 Conversation History</h3>")
+                history_btn = gr.Button("📝 Show History", variant="secondary")
+                history_display = gr.Markdown("No conversation history yet.")
 
-        # Event
+        # Event handlers
         start_btn.click(
             fn=start_call,
             outputs=[response_audio, ai_text_display, status_display]
@@ -573,42 +515,26 @@ def create_interface():
             outputs=[history_display]
         )
 
-        #
+        # Instructions
         gr.HTML("""
-        <div style="margin-top: 30px; padding: 25px; background:
-            <h3
-            <
-            <
-
-
-
-
-
-
-                <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges with context)</li>
-                <li><strong>End:</strong> Click "📞❌ End Call" when finished</li>
-            </ol>
-            </div>
-            <div>
-            <h4 style="color: #28a745;">🌟 Features:</h4>
-            <ul style="color: #495057;">
-                <li>🎤 <strong>Speech Recognition:</strong> Powered by OpenAI Whisper</li>
-                <li>🧠 <strong>Smart Responses:</strong> Using Qwen2.5-1.5B LLM</li>
-                <li>🎭 <strong>Emotion Detection:</strong> Advanced emotion recognition</li>
-                <li>🔊 <strong>Natural TTS:</strong> High-quality Dia TTS synthesis</li>
-                <li>💭 <strong>Context Memory:</strong> Remembers conversation flow</li>
-                <li>❤️ <strong>Emotional Intelligence:</strong> Responds to your emotions</li>
-            </ul>
-            </div>
-        </div>
+        <div style="margin-top: 30px; padding: 25px; background: #f8f9fa; border-radius: 15px;">
+            <h3>💡 How to Use Maya AI:</h3>
+            <ol>
+                <li><strong>Start Call:</strong> Click "📞 Start Call" - Maya will greet you</li>
+                <li><strong>Record:</strong> Speak clearly for at least 2 seconds</li>
+                <li><strong>Process:</strong> Click "🎯 Process Message"</li>
+                <li><strong>Listen:</strong> Maya will respond with natural speech</li>
+                <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges)</li>
+                <li><strong>End:</strong> Click "📞❌ End Call" when done</li>
+            </ol>
 
-            <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px;
-            <p
-            <ul
+            <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px;">
+                <p><strong>💡 Pro Tips:</strong></p>
+                <ul>
                     <li>Speak clearly and close to your microphone</li>
                     <li>Record for at least 2-3 seconds</li>
-                    <li>
-                    <li>Maya
+                    <li>Use a quiet environment for best results</li>
+                    <li>Maya detects emotions and responds accordingly!</li>
                 </ul>
             </div>
         </div>
@@ -619,7 +545,6 @@ def create_interface():
 if __name__ == "__main__":
     print("🚀 Initializing Maya AI System...")
 
-    # Check system info
     check_system_info()
 
     if load_models():
@@ -632,8 +557,7 @@ if __name__ == "__main__":
             server_name="0.0.0.0",
             server_port=7860,
             share=True,
-            show_error=True
-            debug=False
+            show_error=True
         )
     else:
-        print("❌ Failed to load models.
+        print("❌ Failed to load models.")
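For reference, the launch options kept by the final hunk, shown on a throwaway Blocks app; all four keyword arguments are standard Gradio launch parameters:

    # The launch options kept by the change above, applied to a placeholder app.
    import gradio as gr

    with gr.Blocks() as demo:
        gr.Markdown("Maya AI placeholder")

    demo.launch(
        server_name="0.0.0.0",   # listen on all interfaces
        server_port=7860,
        share=True,              # request a public share link
        show_error=True,         # surface exceptions in the UI
    )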