Spaces:

Devakumar868
/

my-voice-assistant

Paused

App Files Files Community

Devakumar868 committed on Jun 22

Commit

ef9cdda

verified ·

1 Parent(s): aa73355

Update app.py

Browse files

Files changed (1) hide show

app.py +314 -150

app.py CHANGED Viewed

@@ -9,14 +9,25 @@ import json
 import time
 from datetime import datetime
 import os
-# Import TTS with fallback
 try:
     from TTS.api import TTS
-    TTS_AVAILABLE = True
 except ImportError:
-    print("⚠️ TTS not available, using text-only mode")
-    TTS_AVAILABLE = False
 warnings.filterwarnings("ignore")
@@ -25,7 +36,7 @@ asr_pipe = None
 qwen_model = None
 qwen_tokenizer = None
 tts_model = None
-conversation_history = []
 class ConversationManager:
     def __init__(self, max_exchanges=5):
@@ -54,20 +65,34 @@ class ConversationManager:
         self.history = []
         self.current_emotion = "neutral"
 def load_models():
-    """Load all models with proper error handling"""
-    global asr_pipe, qwen_model, qwen_tokenizer, tts_model
-    print("🚀 Loading models...")
-    # Load ASR model
     print("🎤 Loading Whisper for ASR...")
     try:
         asr_pipe = pipeline(
             "automatic-speech-recognition",
             model="openai/whisper-base",
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device=0 if torch.cuda.is_available() else -1
         )
         print("✅ Whisper ASR loaded successfully!")
     except Exception as e:
@@ -86,79 +111,132 @@ def load_models():
             model_name,
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
             device_map="auto" if torch.cuda.is_available() else None,
-            trust_remote_code=True
         )
         print("✅ Qwen loaded successfully!")
     except Exception as e:
         print(f"❌ Error loading Qwen: {e}")
         return False
-    # Load TTS model
     print("🎙️ Loading TTS model...")
-    if TTS_AVAILABLE:
         try:
-            # Use Coqui TTS with a good female voice
             tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
             if torch.cuda.is_available():
                 tts_model = tts_model.to("cuda")
-            print("✅ TTS loaded successfully!")
         except Exception as e:
-            print(f"⚠️ TTS failed to load: {e}")
             tts_model = None
-    else:
-        print("⚠️ TTS not available, using text-only mode")
-        tts_model = None
     return True
 def detect_emotion_from_text(text):
-    """Simple emotion detection from text"""
     text_lower = text.lower()
-    # Emotion keywords
-    if any(word in text_lower for word in ['happy', 'great', 'awesome', 'wonderful', 'excited', 'laugh', 'amazing', 'fantastic']):
-        return 'happy'
-    elif any(word in text_lower for word in ['sad', 'upset', 'disappointed', 'cry', 'terrible', 'awful', 'depressed']):
-        return 'sad'
-    elif any(word in text_lower for word in ['angry', 'mad', 'furious', 'annoyed', 'frustrated', 'hate']):
-        return 'angry'
-    elif any(word in text_lower for word in ['wow', 'incredible', 'surprised', 'unbelievable', 'shocking']):
-        return 'surprised'
-    else:
-        return 'neutral'
 def speech_to_text_with_emotion(audio_input):
-    """Convert speech to text and detect emotion"""
     try:
         if audio_input is None:
             return "", "neutral"
-        # Process audio input
         if isinstance(audio_input, tuple):
             sample_rate, audio_data = audio_input
-            # Convert to float32 and handle stereo
-            if audio_data.dtype != np.float32:
                 audio_data = audio_data.astype(np.float32)
             if len(audio_data.shape) > 1:
                 audio_data = audio_data.mean(axis=1)
         else:
             audio_data = audio_input
             sample_rate = 16000
         # Normalize audio
         if len(audio_data) > 0:
             max_val = np.max(np.abs(audio_data))
             if max_val > 0:
-                audio_data = audio_data / max_val
         # Resample to 16kHz if needed
         if sample_rate != 16000:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
-        # Speech to text
         result = asr_pipe(audio_data, sampling_rate=16000)
         transcription = result['text'].strip()
         # Detect emotion from transcription
         emotion = detect_emotion_from_text(transcription)
@@ -169,41 +247,47 @@ def speech_to_text_with_emotion(audio_input):
         return "Sorry, I couldn't understand that.", "neutral"
 def generate_contextual_response(user_input, emotion, conversation_manager):
-    """Generate contextual response using Qwen"""
     try:
         context = conversation_manager.get_context()
-        # Emotional response styles
         emotional_prompts = {
-            "happy": "Respond with enthusiasm and joy. Use positive language and show excitement.",
-            "sad": "Respond with empathy and comfort. Be gentle, understanding, and supportive.",
-            "angry": "Respond calmly and try to help. Be patient and de-escalate the situation.",
-            "surprised": "Share in the surprise and show curiosity. Be engaging and interested.",
-            "neutral": "Respond naturally and conversationally. Be helpful and friendly."
         }
-        system_prompt = f"""You are Maya, a friendly and emotionally intelligent AI assistant.
-        {emotional_prompts.get(emotion, emotional_prompts['neutral'])}
-        Previous conversation context:
-        {context}
-        Current user emotion: {emotion}
-        Guidelines:
-        - Keep responses concise (1-2 sentences maximum)
-        - Match the user's emotional tone appropriately
-        - Be natural and conversational
-        - Show empathy and understanding
-        - Provide helpful responses
-        """
         messages = [
             {"role": "system", "content": system_prompt},
             {"role": "user", "content": user_input}
         ]
-        # Generate response
         text = qwen_tokenizer.apply_chat_template(
             messages,
             tokenize=False,
@@ -217,10 +301,11 @@ def generate_contextual_response(user_input, emotion, conversation_manager):
         with torch.no_grad():
             generated_ids = qwen_model.generate(
                 model_inputs.input_ids,
-                max_new_tokens=80,
                 do_sample=True,
                 temperature=0.7,
                 top_p=0.9,
                 pad_token_id=qwen_tokenizer.eos_token_id
             )
@@ -230,14 +315,19 @@ def generate_contextual_response(user_input, emotion, conversation_manager):
         response = qwen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        return response.strip()
     except Exception as e:
         print(f"Error in response generation: {e}")
         return "I'm sorry, I'm having trouble processing that right now. Could you please try again?"
 def text_to_speech_emotional(text, emotion="neutral"):
-    """Convert text to speech with emotional context"""
     try:
         if tts_model is None:
             print(f"🔊 Maya says ({emotion}): {text}")
@@ -247,36 +337,85 @@ def text_to_speech_emotional(text, emotion="neutral"):
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
-        # Add emotional context to text
-        emotional_prefixes = {
-            "happy": "[Speaking with joy] ",
-            "sad": "[Speaking gently] ",
-            "angry": "[Speaking calmly] ",
-            "surprised": "[Speaking with excitement] ",
-            "neutral": ""
-        }
-        enhanced_text = f"{emotional_prefixes.get(emotion, '')}{text}"
-        print(f"Generating TTS for: {enhanced_text}")
-        # Generate audio
-        audio_output = tts_model.tts(text=enhanced_text)
-        # Convert to numpy array if needed
-        if isinstance(audio_output, list):
-            audio_output = np.array(audio_output, dtype=np.float32)
-        elif torch.is_tensor(audio_output):
-            audio_output = audio_output.cpu().numpy().astype(np.float32)
-        # Normalize audio
-        if len(audio_output) > 0:
-            max_val = np.max(np.abs(audio_output))
-            if max_val > 1.0:
-                audio_output = audio_output / max_val * 0.95
-        return (22050, audio_output)  # Return sample rate and audio data
     except Exception as e:
         print(f"Error in TTS: {e}")
         print(f"🔊 Maya says ({emotion}): {text}")
@@ -288,13 +427,14 @@ conv_manager = ConversationManager()
 def start_call():
     """Initialize call and return greeting"""
     conv_manager.clear()
-    greeting_text = "Hello! I'm Maya, your AI assistant. How can I help you today?"
     greeting_audio = text_to_speech_emotional(greeting_text, "happy")
-    return greeting_audio, greeting_text, "Call started! 📞 Ready to chat!"
 def process_conversation(audio_input):
-    """Main conversation processing pipeline"""
     if audio_input is None:
         return None, "Please record some audio first.", "", "❌ No audio input received."
@@ -302,8 +442,8 @@ def process_conversation(audio_input):
         # Step 1: Speech to Text + Emotion Detection
         user_text, emotion = speech_to_text_with_emotion(audio_input)
-        if not user_text or user_text.strip() == "":
-            return None, "I didn't catch that. Could you please repeat?", "", "❌ No speech detected."
         # Step 2: Generate contextual response
         ai_response = generate_contextual_response(user_text, emotion, conv_manager)
@@ -314,18 +454,19 @@ def process_conversation(audio_input):
         # Step 4: Update conversation history
         conv_manager.add_exchange(user_text, ai_response, emotion)
-        status = f"✅ Processed successfully! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5"
         return response_audio, ai_response, user_text, status
     except Exception as e:
         error_msg = f"❌ Error processing conversation: {str(e)}"
         return None, "I'm sorry, I encountered an error. Please try again.", "", error_msg
 def get_conversation_history():
     """Return formatted conversation history"""
     if not conv_manager.history:
-        return "No conversation history yet. Start a call to begin chatting!"
     history_text = "📋 **Conversation History:**\n\n"
     for i, exchange in enumerate(conv_manager.history, 1):
@@ -338,71 +479,86 @@ def get_conversation_history():
 def end_call():
     """End call and clear conversation"""
-    farewell_text = "Thank you for talking with me! Have a wonderful day!"
     farewell_audio = text_to_speech_emotional(farewell_text, "happy")
     conv_manager.clear()
-    return farewell_audio, farewell_text, "Call ended. 📞❌ Thanks for chatting!"
 def create_interface():
-    """Create the Gradio interface"""
     with gr.Blocks(
-        title="Maya AI - Speech-to-Speech Assistant",
         theme=gr.themes.Soft(),
         css="""
         .main-header {
             background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
             border-radius: 15px;
-            padding: 20px;
             text-align: center;
-            margin-bottom: 20px;
         }
-        .call-button { background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important; }
-        .process-button { background: linear-gradient(45deg, #45B7D1, #96CEB4) !important; }
-        .end-button { background: linear-gradient(45deg, #FFA07A, #FF6347) !important; }
         """
     ) as demo:
         gr.HTML("""
         <div class="main-header">
-            <h1 style="color: white; margin: 0; font-size: 2.5em;">🎙️ Maya AI</h1>
-            <p style="color: white; margin: 10px 0; font-size: 1.2em;">Advanced Speech-to-Speech Conversational AI</p>
-            <p style="color: #E8E8E8; margin: 0;">Natural • Emotional • Contextual</p>
         </div>
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 # Call Controls
-                gr.HTML("<h3>📞 Call Controls</h3>")
                 start_btn = gr.Button("📞 Start Call", elem_classes="call-button", size="lg")
                 end_btn = gr.Button("📞❌ End Call", elem_classes="end-button", size="lg")
                 # Audio Input
-                gr.HTML("<h3>🎤 Voice Input</h3>")
                 audio_input = gr.Audio(
                     label="Record Your Message",
                     sources=["microphone"],
-                    type="numpy"
                 )
                 process_btn = gr.Button("🎯 Process Message", elem_classes="process-button", variant="primary", size="lg")
                 # Status Display
                 status_display = gr.Textbox(
-                    label="📊 Status",
                     interactive=False,
-                    lines=2,
-                    value="Ready to start! Click 'Start Call' to begin."
                 )
             with gr.Column(scale=2):
                 # AI Response Audio
-                gr.HTML("<h3>🔊 Maya's Response</h3>")
                 response_audio = gr.Audio(
                     label="Maya's Voice Response",
                     type="numpy",
-                    interactive=False
                 )
                 # Text Displays
@@ -411,25 +567,25 @@ def create_interface():
                         user_text_display = gr.Textbox(
                             label="👤 What You Said",
                             interactive=False,
-                            lines=3,
-                            placeholder="Your speech will appear here..."
                         )
                     with gr.Column():
                         ai_text_display = gr.Textbox(
                             label="🤖 Maya's Response",
                             interactive=False,
-                            lines=3,
                             placeholder="Maya's response will appear here..."
                         )
         # Conversation History Section
         with gr.Row():
             with gr.Column():
-                gr.HTML("<h3>📋 Conversation History</h3>")
-                history_btn = gr.Button("📋 Show History", variant="secondary")
                 history_display = gr.Markdown(
-                    value="No conversation history yet. Start a call to begin chatting!",
                     label="Conversation Log"
                 )
@@ -455,27 +611,38 @@ def create_interface():
             outputs=[history_display]
         )
-        # Instructions
         gr.HTML("""
-        <div style="margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 10px; border-left: 5px solid #007bff;">
-            <h3>💡 How to Use Maya AI:</h3>
-            <ol>
-                <li><strong>Start Call:</strong> Click "📞 Start Call" to initialize Maya</li>
-                <li><strong>Record:</strong> Use the microphone to record your message</li>
-                <li><strong>Process:</strong> Click "🎯 Process Message" to get Maya's response</li>
-                <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
-                <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges with context)</li>
-                <li><strong>End:</strong> Click "📞❌ End Call" when finished</li>
-            </ol>
-            <h4>🎭 Features:</h4>
-            <ul>
-                <li>🎤 <strong>Speech Recognition:</strong> Powered by Whisper</li>
-                <li>🧠 <strong>Smart Responses:</strong> Using Qwen2.5-1.5B</li>
-                <li>🎭 <strong>Emotion Detection:</strong> Automatic emotion recognition</li>
-                <li>🔊 <strong>Natural Speech:</strong> High-quality TTS with emotions</li>
-                <li>💭 <strong>Context Memory:</strong> Remembers conversation flow</li>
-            </ul>
         </div>
         """)
@@ -483,16 +650,13 @@ def create_interface():
 if __name__ == "__main__":
     print("🚀 Initializing Maya AI System...")
-    print("🔧 Checking GPU availability...")
-    if torch.cuda.is_available():
-        print(f"✅ GPU detected: {torch.cuda.get_device_name()}")
-        print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
-    else:
-        print("⚠️ No GPU detected, using CPU")
     if load_models():
         print("✅ All models loaded successfully!")
         print("🌟 Launching Maya AI Interface...")
         demo = create_interface()

 import time
 from datetime import datetime
 import os
+import sys
+# Import with enhanced error handling
+try:
+    from dia.model import Dia
+    DIA_AVAILABLE = True
+    print("✅ Dia TTS library imported successfully")
+except ImportError as e:
+    print(f"⚠️ Dia TTS not available: {e}")
+    DIA_AVAILABLE = False
+# Fallback TTS import
 try:
     from TTS.api import TTS
+    COQUI_TTS_AVAILABLE = True
+    print("✅ Coqui TTS library available as fallback")
 except ImportError:
+    COQUI_TTS_AVAILABLE = False
+    print("⚠️ Coqui TTS not available")
 warnings.filterwarnings("ignore")
 qwen_model = None
 qwen_tokenizer = None
 tts_model = None
+tts_type = None  # Track which TTS model is loaded
 class ConversationManager:
     def __init__(self, max_exchanges=5):
         self.history = []
         self.current_emotion = "neutral"
def check_system_info():
    """Print a short report about the runtime environment.

    Always reports the Python and PyTorch versions. When CUDA is usable it
    also reports the GPU name, the total memory of device 0 in GB and the
    CUDA version; otherwise it notes that the CPU will be used.
    """
    print("🔍 System Information:")
    print(f"Python: {sys.version}")
    print(f"PyTorch: {torch.__version__}")

    # Guard clause: nothing GPU-related to report without CUDA.
    if not torch.cuda.is_available():
        print("⚠️ CUDA not available, using CPU")
        return

    print(f"✅ CUDA: {torch.cuda.get_device_name()}")
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"💾 GPU Memory: {total_gb:.1f} GB")
    print(f"🔥 CUDA Version: {torch.version.cuda}")
 def load_models():
+    """Load all models with enhanced error handling"""
+    global asr_pipe, qwen_model, qwen_tokenizer, tts_model, tts_type
+    print("🚀 Loading Maya AI models...")
+    # Load ASR model (Whisper)
     print("🎤 Loading Whisper for ASR...")
     try:
         asr_pipe = pipeline(
             "automatic-speech-recognition",
             model="openai/whisper-base",
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device=0 if torch.cuda.is_available() else -1,
+            return_timestamps=False
         )
         print("✅ Whisper ASR loaded successfully!")
     except Exception as e:
             model_name,
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
             device_map="auto" if torch.cuda.is_available() else None,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True
         )
         print("✅ Qwen loaded successfully!")
     except Exception as e:
         print(f"❌ Error loading Qwen: {e}")
         return False
+    # Load TTS model with priority: Dia > Coqui > Text-only
     print("🎙️ Loading TTS model...")
+    # Try Dia TTS first (preferred)
+    if DIA_AVAILABLE:
         try:
+            print("Attempting to load Dia TTS...")
+            tts_model = Dia.from_pretrained(
+                "nari-labs/Dia-1.6B",
+                compute_dtype="float16" if torch.cuda.is_available() else "float32"
+            )
+            tts_type = "dia"
+            print("✅ Dia TTS loaded successfully!")
+            return True
+        except Exception as e:
+            print(f"⚠️ Dia TTS failed to load: {e}")
+            tts_model = None
+    # Fallback to Coqui TTS
+    if COQUI_TTS_AVAILABLE:
+        try:
+            print("Attempting to load Coqui TTS as fallback...")
             tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
             if torch.cuda.is_available():
                 tts_model = tts_model.to("cuda")
+            tts_type = "coqui"
+            print("✅ Coqui TTS loaded successfully!")
+            return True
         except Exception as e:
+            print(f"⚠️ Coqui TTS failed to load: {e}")
             tts_model = None
+    # Continue without TTS (text-only mode)
+    print("⚠️ No TTS available, running in text-only mode")
+    tts_type = "none"
     return True
 def detect_emotion_from_text(text):
+    """Enhanced emotion detection from text"""
     text_lower = text.lower()
+    # Enhanced emotion keywords with weights
+    emotions = {
+        'happy': ['happy', 'great', 'awesome', 'wonderful', 'excited', 'laugh', 'amazing',
+                 'fantastic', 'excellent', 'brilliant', 'perfect', 'love', 'joy', 'cheerful',
+                 'delighted', 'thrilled', 'ecstatic'],
+        'sad': ['sad', 'upset', 'disappointed', 'cry', 'terrible', 'awful', 'depressed',
+               'miserable', 'heartbroken', 'devastated', 'gloomy', 'melancholy', 'down',
+               'blue', 'sorrowful'],
+        'angry': ['angry', 'mad', 'furious', 'annoyed', 'frustrated', 'hate', 'rage',
+                 'irritated', 'outraged', 'livid', 'enraged', 'pissed', 'irate'],
+        'surprised': ['wow', 'incredible', 'surprised', 'unbelievable', 'shocking',
+                     'astonishing', 'remarkable', 'extraordinary', 'mind-blowing',
+                     'amazing', 'stunning'],
+        'fearful': ['scared', 'afraid', 'terrified', 'worried', 'anxious', 'nervous',
+                   'frightened', 'panic', 'concerned', 'fearful'],
+        'disgusted': ['disgusting', 'gross', 'revolting', 'sick', 'nauseating', 'repulsive',
+                     'awful', 'horrible']
+    }
+    # Count emotion indicators
+    emotion_scores = {}
+    for emotion, keywords in emotions.items():
+        score = sum(1 for keyword in keywords if keyword in text_lower)
+        if score > 0:
+            emotion_scores[emotion] = score
+    # Return the emotion with highest score, or neutral if none found
+    if emotion_scores:
+        return max(emotion_scores, key=emotion_scores.get)
+    return 'neutral'
 def speech_to_text_with_emotion(audio_input):
+    """Enhanced STT with better audio processing"""
     try:
         if audio_input is None:
             return "", "neutral"
+        # Process audio input with better handling
         if isinstance(audio_input, tuple):
             sample_rate, audio_data = audio_input
+            # Handle different audio formats
+            if audio_data.dtype == np.int16:
+                audio_data = audio_data.astype(np.float32) / 32768.0
+            elif audio_data.dtype == np.int32:
+                audio_data = audio_data.astype(np.float32) / 2147483648.0
+            elif audio_data.dtype != np.float32:
                 audio_data = audio_data.astype(np.float32)
+            # Handle stereo audio
             if len(audio_data.shape) > 1:
                 audio_data = audio_data.mean(axis=1)
         else:
             audio_data = audio_input
             sample_rate = 16000
+        # Validate audio length
+        if len(audio_data) < 1600:  # Less than 0.1 seconds at 16kHz
+            return "Audio too short, please speak longer", "neutral"
         # Normalize audio
         if len(audio_data) > 0:
             max_val = np.max(np.abs(audio_data))
             if max_val > 0:
+                audio_data = audio_data / max_val * 0.95
         # Resample to 16kHz if needed
         if sample_rate != 16000:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
+        # Speech to text with Whisper
         result = asr_pipe(audio_data, sampling_rate=16000)
         transcription = result['text'].strip()
+        if not transcription:
+            return "No speech detected", "neutral"
         # Detect emotion from transcription
         emotion = detect_emotion_from_text(transcription)
         return "Sorry, I couldn't understand that.", "neutral"
 def generate_contextual_response(user_input, emotion, conversation_manager):
+    """Enhanced response generation with better emotional intelligence"""
     try:
         context = conversation_manager.get_context()
+        # Enhanced emotional response styles
         emotional_prompts = {
+            "happy": "Respond with genuine enthusiasm and joy. Use positive language, show excitement, and celebrate with them. Be warm and energetic.",
+            "sad": "Respond with deep empathy and comfort. Be gentle, understanding, and supportive. Offer comfort and hope without being dismissive.",
+            "angry": "Respond calmly and try to help. Be patient, understanding, and try to de-escalate. Don't match their anger but acknowledge their feelings.",
+            "surprised": "Share in their surprise and show curiosity. Be engaging, interested, and ask thoughtful follow-up questions.",
+            "fearful": "Respond with reassurance and support. Be calming, protective, and offer practical help or comfort.",
+            "disgusted": "Respond with understanding while being helpful. Acknowledge their feelings and try to redirect positively.",
+            "neutral": "Respond naturally and conversationally. Be helpful, friendly, and engaging."
         }
+        system_prompt = f"""You are Maya, a highly emotionally intelligent AI assistant with a warm, caring personality.
+{emotional_prompts.get(emotion, emotional_prompts['neutral'])}
+Previous conversation context:
+{context}
+Current user emotion detected: {emotion}
+Guidelines:
+- Keep responses concise but meaningful (1-2 sentences)
+- Match the user's emotional tone appropriately
+- Be natural and conversational
+- Show genuine empathy and understanding
+- Provide helpful and relevant responses
+- Use natural speech patterns
+- If they seem distressed, offer support
+- If they're happy, celebrate with them
+"""
         messages = [
             {"role": "system", "content": system_prompt},
             {"role": "user", "content": user_input}
         ]
+        # Generate response with Qwen
         text = qwen_tokenizer.apply_chat_template(
             messages,
             tokenize=False,
         with torch.no_grad():
             generated_ids = qwen_model.generate(
                 model_inputs.input_ids,
+                max_new_tokens=100,
                 do_sample=True,
                 temperature=0.7,
                 top_p=0.9,
+                repetition_penalty=1.1,
                 pad_token_id=qwen_tokenizer.eos_token_id
             )
         response = qwen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        # Clean up response
+        response = response.strip()
+        if response.startswith("Maya:"):
+            response = response[5:].strip()
+        return response
     except Exception as e:
         print(f"Error in response generation: {e}")
         return "I'm sorry, I'm having trouble processing that right now. Could you please try again?"
 def text_to_speech_emotional(text, emotion="neutral"):
+    """Enhanced TTS with support for both Dia and Coqui"""
     try:
         if tts_model is None:
             print(f"🔊 Maya says ({emotion}): {text}")
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
+        if tts_type == "dia":
+            # Dia TTS with enhanced emotional markers
+            emotional_markers = {
+                "happy": "(excited) ",
+                "sad": "(sad) ",
+                "angry": "(calm) ",  # Stay calm when user is angry
+                "surprised": "(surprised) ",
+                "fearful": "(reassuring) ",
+                "disgusted": "(understanding) ",
+                "neutral": ""
+            }
+            # Enhanced text processing for Dia
+            enhanced_text = f"[S1] {emotional_markers.get(emotion, '')}{text}"
+            # Add natural pauses for longer text
+            if len(text) > 50:
+                enhanced_text = enhanced_text.replace(". ", ". (pause) ")
+                enhanced_text = enhanced_text.replace("! ", "! (pause) ")
+                enhanced_text = enhanced_text.replace("? ", "? (pause) ")
+            print(f"Generating Dia TTS for: {enhanced_text}")
+            with torch.no_grad():
+                audio_output = tts_model.generate(
+                    enhanced_text,
+                    use_torch_compile=False,
+                    verbose=False
+                )
+            # Process Dia output
+            if isinstance(audio_output, torch.Tensor):
+                audio_output = audio_output.cpu().numpy()
+            # Normalize audio
+            if len(audio_output) > 0:
+                max_val = np.max(np.abs(audio_output))
+                if max_val > 1.0:
+                    audio_output = audio_output / max_val * 0.95
+            return (44100, audio_output)
+        elif tts_type == "coqui":
+            # Coqui TTS processing
+            emotional_prefixes = {
+                "happy": "[Speaking with joy] ",
+                "sad": "[Speaking gently] ",
+                "angry": "[Speaking calmly] ",
+                "surprised": "[Speaking with excitement] ",
+                "fearful": "[Speaking reassuringly] ",
+                "disgusted": "[Speaking understandingly] ",
+                "neutral": ""
+            }
+            enhanced_text = f"{emotional_prefixes.get(emotion, '')}{text}"
+            print(f"Generating Coqui TTS for: {enhanced_text}")
+            audio_output = tts_model.tts(text=enhanced_text)
+            # Convert to numpy array if needed
+            if isinstance(audio_output, list):
+                audio_output = np.array(audio_output, dtype=np.float32)
+            elif torch.is_tensor(audio_output):
+                audio_output = audio_output.cpu().numpy().astype(np.float32)
+            # Normalize audio
+            if len(audio_output) > 0:
+                max_val = np.max(np.abs(audio_output))
+                if max_val > 1.0:
+                    audio_output = audio_output / max_val * 0.95
+            return (22050, audio_output)
+        else:
+            # Text-only mode
+            print(f"🔊 Maya says ({emotion}): {text}")
+            return None
     except Exception as e:
         print(f"Error in TTS: {e}")
         print(f"🔊 Maya says ({emotion}): {text}")
 def start_call():
     """Initialize call and return greeting"""
     conv_manager.clear()
+    greeting_text = "Hello! I'm Maya, your AI assistant. I'm here to chat and help you with anything you need. How are you feeling today?"
     greeting_audio = text_to_speech_emotional(greeting_text, "happy")
+    tts_status = f"Using {tts_type.upper()} TTS" if tts_type != "none" else "Text-only mode"
+    return greeting_audio, greeting_text, f"📞 Call started! Maya is ready to chat. {tts_status}"
 def process_conversation(audio_input):
+    """Enhanced conversation processing pipeline"""
     if audio_input is None:
         return None, "Please record some audio first.", "", "❌ No audio input received."
         # Step 1: Speech to Text + Emotion Detection
         user_text, emotion = speech_to_text_with_emotion(audio_input)
+        if not user_text or user_text.strip() == "" or "sorry" in user_text.lower():
+            return None, "I didn't catch that clearly. Could you please speak a bit louder or closer to the microphone?", "", "❌ No clear speech detected."
         # Step 2: Generate contextual response
         ai_response = generate_contextual_response(user_text, emotion, conv_manager)
         # Step 4: Update conversation history
         conv_manager.add_exchange(user_text, ai_response, emotion)
+        status = f"✅ Processed successfully! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5 | TTS: {tts_type.upper()}"
         return response_audio, ai_response, user_text, status
     except Exception as e:
         error_msg = f"❌ Error processing conversation: {str(e)}"
+        print(error_msg)
         return None, "I'm sorry, I encountered an error. Please try again.", "", error_msg
 def get_conversation_history():
     """Return formatted conversation history"""
     if not conv_manager.history:
+        return "No conversation history yet. Start a call to begin chatting with Maya!"
     history_text = "📋 **Conversation History:**\n\n"
     for i, exchange in enumerate(conv_manager.history, 1):
 def end_call():
     """Finish the call: speak a farewell, then reset the conversation.
     Returns a 3-tuple of (farewell audio from ``text_to_speech_emotional``,
     farewell text, status line).
     """
     goodbye_line = "Thank you for our wonderful conversation! I really enjoyed talking with you. Take care and have an amazing day!"
     goodbye_audio = text_to_speech_emotional(goodbye_line, "happy")
     # The conversation is cleared only after the farewell has been synthesized.
     conv_manager.clear()
     closing_status = "📞❌ Call ended. Thank you for chatting with Maya!"
     return goodbye_audio, goodbye_line, closing_status
 def create_interface():
+    """Create enhanced Gradio interface"""
     with gr.Blocks(
+        title="Maya AI - Advanced Speech-to-Speech Assistant",
         theme=gr.themes.Soft(),
         css="""
         .main-header {
             background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
             border-radius: 15px;
+            padding: 25px;
             text-align: center;
+            margin-bottom: 25px;
+            box-shadow: 0 8px 32px rgba(0,0,0,0.1);
+        }
+        .call-button {
+            background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
+            border: none !important;
+            box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
+        }
+        .process-button {
+            background: linear-gradient(45deg, #45B7D1, #96CEB4) !important;
+            border: none !important;
+            box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
+        }
+        .end-button {
+            background: linear-gradient(45deg, #FFA07A, #FF6347) !important;
+            border: none !important;
+            box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
         }
         """
     ) as demo:
         gr.HTML("""
         <div class="main-header">
+            <h1 style="color: white; margin: 0; font-size: 2.8em; font-weight: bold;">🎙️ Maya AI</h1>
+            <p style="color: white; margin: 15px 0; font-size: 1.3em;">Advanced Speech-to-Speech Conversational AI</p>
+            <p style="color: #E8E8E8; margin: 0; font-size: 1.1em;">Natural • Emotional • Contextual • Intelligent</p>
         </div>
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 # Call Controls
+                gr.HTML("<h3 style='color: #333; margin-bottom: 15px;'>📞 Call Controls</h3>")
                 start_btn = gr.Button("📞 Start Call", elem_classes="call-button", size="lg")
                 end_btn = gr.Button("📞❌ End Call", elem_classes="end-button", size="lg")
                 # Audio Input
+                gr.HTML("<h3 style='color: #333; margin: 20px 0 15px 0;'>🎤 Voice Input</h3>")
                 audio_input = gr.Audio(
                     label="Record Your Message",
                     sources=["microphone"],
+                    type="numpy",
+                    format="wav"
                 )
                 process_btn = gr.Button("🎯 Process Message", elem_classes="process-button", variant="primary", size="lg")
                 # Status Display
                 status_display = gr.Textbox(
+                    label="📊 System Status",
                     interactive=False,
+                    lines=3,
+                    value="🚀 System ready! Click 'Start Call' to begin your conversation with Maya."
                 )
             with gr.Column(scale=2):
                 # AI Response Audio
+                gr.HTML("<h3 style='color: #333; margin-bottom: 15px;'>🔊 Maya's Response</h3>")
                 response_audio = gr.Audio(
                     label="Maya's Voice Response",
                     type="numpy",
+                    interactive=False,
+                    autoplay=True
                 )
                 # Text Displays
                         user_text_display = gr.Textbox(
                             label="👤 What You Said",
                             interactive=False,
+                            lines=4,
+                            placeholder="Your speech will appear here after processing..."
                         )
                     with gr.Column():
                         ai_text_display = gr.Textbox(
                             label="🤖 Maya's Response",
                             interactive=False,
+                            lines=4,
                             placeholder="Maya's response will appear here..."
                         )
         # Conversation History Section
         with gr.Row():
             with gr.Column():
+                gr.HTML("<h3 style='color: #333; margin: 25px 0 15px 0;'>📋 Conversation History</h3>")
+                history_btn = gr.Button("📋 Show Conversation History", variant="secondary", size="lg")
                 history_display = gr.Markdown(
+                    value="No conversation history yet. Start a call to begin chatting with Maya!",
                     label="Conversation Log"
                 )
             outputs=[history_display]
         )
+        # Enhanced Instructions
         gr.HTML("""
+        <div style="margin-top: 30px; padding: 25px; background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); border-radius: 15px; border: 1px solid #dee2e6;">
+            <h3 style="color: #495057; margin-bottom: 20px;">💡 How to Use Maya AI:</h3>
+            <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
+                <div>
+                    <h4 style="color: #007bff;">🚀 Getting Started:</h4>
+                    <ol style="color: #495057;">
+                        <li><strong>Start Call:</strong> Click "📞 Start Call" to initialize Maya</li>
+                        <li><strong>Record:</strong> Use the microphone to record your message</li>
+                        <li><strong>Process:</strong> Click "🎯 Process Message" to get Maya's response</li>
+                        <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
+                        <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges with context)</li>
+                        <li><strong>End:</strong> Click "📞❌ End Call" when finished</li>
+                    </ol>
+                </div>
+                <div>
+                    <h4 style="color: #28a745;">🎭 Advanced Features:</h4>
+                    <ul style="color: #495057;">
+                        <li>🎤 <strong>Speech Recognition:</strong> Powered by OpenAI Whisper</li>
+                        <li>🧠 <strong>Smart Responses:</strong> Using Qwen2.5-1.5B LLM</li>
+                        <li>🎭 <strong>Emotion Detection:</strong> Advanced emotion recognition from speech</li>
+                        <li>🔊 <strong>Natural TTS:</strong> High-quality speech synthesis with Dia TTS</li>
+                        <li>💭 <strong>Context Memory:</strong> Remembers conversation flow and context</li>
+                        <li>❤️ <strong>Emotional Intelligence:</strong> Responds appropriately to your emotions</li>
+                    </ul>
+                </div>
+            </div>
+            <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px; border-left: 4px solid #bee5eb;">
+                <p style="margin: 0; color: #0c5460;"><strong>💡 Pro Tip:</strong> Speak clearly and naturally. Maya can detect emotions like happiness, sadness, anger, surprise, fear, and disgust, and will respond accordingly to provide the best conversational experience!</p>
+            </div>
         </div>
         """)
 if __name__ == "__main__":
     print("🚀 Initializing Maya AI System...")
+    # Check system info
+    check_system_info()
     if load_models():
         print("✅ All models loaded successfully!")
+        print(f"🎙️ TTS Mode: {tts_type.upper()}")
         print("🌟 Launching Maya AI Interface...")
         demo = create_interface()