Update app.py
app.py
CHANGED
Removed in this update:
- `from dia.model import Dia`
- transcription and emotion detection through an Ultravox pipeline (`ultravox_pipe({'audio': ..., 'turns': ..., 'sampling_rate': 16000}, ...)` with `max_new_tokens=10` for emotion detection and `max_new_tokens=100` for transcription), the detected emotion validated against ["happy", "sad", "angry", "surprised", "neutral"] with a fallback to "neutral"
- `text_to_speech_emotional(text, emotion="neutral", speaker="S1")` built on `dia_model.generate(enhanced_text, use_torch_compile=False, verbose=False)`, with "[{speaker}]" tags, emotional markers, and "(pause)" inserted after sentence breaks when the text exceeds 50 characters

The hunks below show the new version, with added lines marked "+".

@@ -4,33 +4,27 @@ import numpy as np
 import librosa
 import soundfile as sf
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import warnings
 import json
 import time
 from datetime import datetime
 import os

+# Import TTS with fallback
 try:
+    from TTS.api import TTS
+    TTS_AVAILABLE = True
 except ImportError:
+    print("⚠️ TTS not available, using text-only mode")
+    TTS_AVAILABLE = False

 warnings.filterwarnings("ignore")

 # Global models
+asr_pipe = None
 qwen_model = None
 qwen_tokenizer = None
+tts_model = None
 conversation_history = []

 class ConversationManager:
@@ -47,13 +41,12 @@ class ConversationManager:
             "emotion": emotion
         })

         if len(self.history) > self.max_exchanges:
             self.history = self.history[-self.max_exchanges:]

     def get_context(self):
         context = ""
+        for exchange in self.history[-3:]:
             context += f"User: {exchange['user']}\nAI: {exchange['ai']}\n"
         return context

@@ -62,32 +55,37 @@ class ConversationManager:
         self.current_emotion = "neutral"

 def load_models():
+    """Load all models with proper error handling"""
+    global asr_pipe, qwen_model, qwen_tokenizer, tts_model
+
+    print("Loading models...")

+    # Load ASR model
+    print("Loading Whisper for ASR...")
     try:
+        asr_pipe = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-base",
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device=0 if torch.cuda.is_available() else -1
         )
+        print("✅ Whisper ASR loaded successfully!")
     except Exception as e:
+        print(f"❌ Error loading Whisper: {e}")
         return False

+    # Load Qwen model
     print("Loading Qwen2.5-1.5B for conversation...")
     try:
+        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
         qwen_tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
             trust_remote_code=True
         )
         qwen_model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto" if torch.cuda.is_available() else None,
             trust_remote_code=True
         )
         print("✅ Qwen loaded successfully!")
@@ -95,52 +93,39 @@ def load_models():
         print(f"❌ Error loading Qwen: {e}")
         return False

+    # Load TTS model
+    print("Loading TTS model...")
+    if TTS_AVAILABLE:
+        try:
+            # Use Coqui TTS with a good female voice
+            tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
+            if torch.cuda.is_available():
+                tts_model = tts_model.to("cuda")
+            print("✅ TTS loaded successfully!")
+        except Exception as e:
+            print(f"⚠️ TTS failed to load: {e}")
+            tts_model = None
+    else:
+        print("⚠️ TTS not available, using text-only mode")
+        tts_model = None

     return True

+def detect_emotion_from_text(text):
+    """Simple emotion detection from text"""
+    text_lower = text.lower()
+
+    # Emotion keywords
+    if any(word in text_lower for word in ['happy', 'great', 'awesome', 'wonderful', 'excited', 'laugh', 'amazing', 'fantastic']):
+        return 'happy'
+    elif any(word in text_lower for word in ['sad', 'upset', 'disappointed', 'cry', 'terrible', 'awful', 'depressed']):
+        return 'sad'
+    elif any(word in text_lower for word in ['angry', 'mad', 'furious', 'annoyed', 'frustrated', 'hate']):
+        return 'angry'
+    elif any(word in text_lower for word in ['wow', 'incredible', 'surprised', 'unbelievable', 'shocking']):
+        return 'surprised'
+    else:
+        return 'neutral'

 def speech_to_text_with_emotion(audio_input):
     """Convert speech to text and detect emotion"""
@@ -148,35 +133,34 @@ def speech_to_text_with_emotion(audio_input):
     if audio_input is None:
         return "", "neutral"

+    # Process audio input
     if isinstance(audio_input, tuple):
         sample_rate, audio_data = audio_input
+        # Convert to float32 and handle stereo
+        if audio_data.dtype != np.float32:
+            audio_data = audio_data.astype(np.float32)
         if len(audio_data.shape) > 1:
             audio_data = audio_data.mean(axis=1)
     else:
         audio_data = audio_input
         sample_rate = 16000

+    # Normalize audio
+    if len(audio_data) > 0:
+        max_val = np.max(np.abs(audio_data))
+        if max_val > 0:
+            audio_data = audio_data / max_val
+
     # Resample to 16kHz if needed
     if sample_rate != 16000:
         audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

+    # Speech to text
+    result = asr_pipe(audio_data, sampling_rate=16000)
+    transcription = result['text'].strip()

+    # Detect emotion from transcription
+    emotion = detect_emotion_from_text(transcription)

     return transcription, emotion
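A note on the new ASR call: `asr_pipe(audio_data, sampling_rate=16000)` passes the sampling rate as a keyword argument, whereas the transformers pipeline documents raw-array input as a dict that carries the waveform together with its rate. A minimal sketch of that documented form, reusing the checkpoint name from this diff (the silent one-second array is only a placeholder), not part of the commit:

import numpy as np
from transformers import pipeline

# Standalone example; openai/whisper-base is the checkpoint loaded in load_models() above.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")

audio_data = np.zeros(16000, dtype=np.float32)  # placeholder: 1 s of silence at 16 kHz
result = asr_pipe({"raw": audio_data, "sampling_rate": 16000})
print(result["text"].strip())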
@@ -189,13 +173,13 @@ def generate_contextual_response(user_input, emotion, conversation_manager):
     try:
         context = conversation_manager.get_context()

+        # Emotional response styles
         emotional_prompts = {
+            "happy": "Respond with enthusiasm and joy. Use positive language and show excitement.",
+            "sad": "Respond with empathy and comfort. Be gentle, understanding, and supportive.",
+            "angry": "Respond calmly and try to help. Be patient and de-escalate the situation.",
+            "surprised": "Share in the surprise and show curiosity. Be engaging and interested.",
+            "neutral": "Respond naturally and conversationally. Be helpful and friendly."
         }

         system_prompt = f"""You are Maya, a friendly and emotionally intelligent AI assistant.
@@ -207,10 +191,11 @@ def generate_contextual_response(user_input, emotion, conversation_manager):
 Current user emotion: {emotion}

 Guidelines:
+- Keep responses concise (1-2 sentences maximum)
+- Match the user's emotional tone appropriately
 - Be natural and conversational
+- Show empathy and understanding
+- Provide helpful responses
 """

         messages = [
@@ -225,14 +210,17 @@ def generate_contextual_response(user_input, emotion, conversation_manager):
             add_generation_prompt=True
         )

+        model_inputs = qwen_tokenizer([text], return_tensors="pt")
+        if torch.cuda.is_available():
+            model_inputs = model_inputs.to(qwen_model.device)

         with torch.no_grad():
             generated_ids = qwen_model.generate(
                 model_inputs.input_ids,
+                max_new_tokens=80,
                 do_sample=True,
                 temperature=0.7,
+                top_p=0.9,
                 pad_token_id=qwen_tokenizer.eos_token_id
             )

@@ -246,46 +234,40 @@ def generate_contextual_response(user_input, emotion, conversation_manager):

     except Exception as e:
         print(f"Error in response generation: {e}")
+        return "I'm sorry, I'm having trouble processing that right now. Could you please try again?"

+def text_to_speech_emotional(text, emotion="neutral"):
+    """Convert text to speech with emotional context"""
     try:
+        if tts_model is None:
+            print(f"Maya says ({emotion}): {text}")
+            return None
+
         # Clear GPU cache
         if torch.cuda.is_available():
             torch.cuda.empty_cache()

+        # Add emotional context to text
+        emotional_prefixes = {
+            "happy": "[Speaking with joy] ",
+            "sad": "[Speaking gently] ",
+            "angry": "[Speaking calmly] ",
+            "surprised": "[Speaking with excitement] ",
             "neutral": ""
         }

+        enhanced_text = f"{emotional_prefixes.get(emotion, '')}{text}"

+        print(f"Generating TTS for: {enhanced_text}")

         # Generate audio
+        audio_output = tts_model.tts(text=enhanced_text)

+        # Convert to numpy array if needed
+        if isinstance(audio_output, list):
+            audio_output = np.array(audio_output, dtype=np.float32)
+        elif torch.is_tensor(audio_output):
+            audio_output = audio_output.cpu().numpy().astype(np.float32)

         # Normalize audio
         if len(audio_output) > 0:
@@ -293,10 +275,11 @@ def text_to_speech_emotional(text, emotion="neutral", speaker="S1"):
             if max_val > 1.0:
                 audio_output = audio_output / max_val * 0.95

+        return (22050, audio_output)  # Return sample rate and audio data

     except Exception as e:
         print(f"Error in TTS: {e}")
+        print(f"Maya says ({emotion}): {text}")
         return None

 # Initialize conversation manager
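The new `text_to_speech_emotional` returns a hard-coded 22050 Hz rate, which matches the LJSpeech checkpoint used above but would silently mismatch if the model name ever changes. A small sketch, not part of the commit, of reading the rate from the loaded model instead; the `synthesizer.output_sample_rate` attribute is an assumption about the Coqui TTS API, so the diff's hard-coded value is kept as a fallback:

import numpy as np
from TTS.api import TTS

tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
wav = np.asarray(tts_model.tts(text="Hello from Maya."), dtype=np.float32)
# Assumed attribute path; fall back to the value hard-coded in the diff if it is missing.
sample_rate = getattr(tts_model.synthesizer, "output_sample_rate", 22050)
audio = (sample_rate, wav)  # the (rate, samples) tuple gr.Audio expects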
@@ -308,19 +291,19 @@ def start_call():
     greeting_text = "Hello! I'm Maya, your AI assistant. How can I help you today?"
     greeting_audio = text_to_speech_emotional(greeting_text, "happy")

+    return greeting_audio, greeting_text, "Call started! Ready to chat!"

 def process_conversation(audio_input):
     """Main conversation processing pipeline"""
     if audio_input is None:
+        return None, "Please record some audio first.", "", "❌ No audio input received."

     try:
         # Step 1: Speech to Text + Emotion Detection
         user_text, emotion = speech_to_text_with_emotion(audio_input)

         if not user_text or user_text.strip() == "":
+            return None, "I didn't catch that. Could you please repeat?", "", "❌ No speech detected."

         # Step 2: Generate contextual response
         ai_response = generate_contextual_response(user_text, emotion, conv_manager)
@@ -331,7 +314,7 @@ def process_conversation(audio_input):
         # Step 4: Update conversation history
         conv_manager.add_exchange(user_text, ai_response, emotion)

+        status = f"✅ Processed successfully! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5"

         return response_audio, ai_response, user_text, status

@@ -342,7 +325,7 @@ def process_conversation(audio_input):
 def get_conversation_history():
     """Return formatted conversation history"""
     if not conv_manager.history:
+        return "No conversation history yet. Start a call to begin chatting!"

     history_text = "**Conversation History:**\n\n"
     for i, exchange in enumerate(conv_manager.history, 1):
@@ -355,26 +338,33 @@ def get_conversation_history():

 def end_call():
     """End call and clear conversation"""
+    farewell_text = "Thank you for talking with me! Have a wonderful day!"
     farewell_audio = text_to_speech_emotional(farewell_text, "happy")
     conv_manager.clear()

+    return farewell_audio, farewell_text, "Call ended. Thanks for chatting!"

 def create_interface():
+    """Create the Gradio interface"""
     with gr.Blocks(
+        title="Maya AI - Speech-to-Speech Assistant",
         theme=gr.themes.Soft(),
         css="""
+        .main-header {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            border-radius: 15px;
+            padding: 20px;
+            text-align: center;
+            margin-bottom: 20px;
+        }
         .call-button { background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important; }
+        .process-button { background: linear-gradient(45deg, #45B7D1, #96CEB4) !important; }
         .end-button { background: linear-gradient(45deg, #FFA07A, #FF6347) !important; }
         """
     ) as demo:

         gr.HTML("""
+        <div class="main-header">
             <h1 style="color: white; margin: 0; font-size: 2.5em;">Maya AI</h1>
             <p style="color: white; margin: 10px 0; font-size: 1.2em;">Advanced Speech-to-Speech Conversational AI</p>
             <p style="color: #E8E8E8; margin: 0;">Natural • Emotional • Contextual</p>
@@ -393,17 +383,17 @@ def create_interface():
                 audio_input = gr.Audio(
                     label="Record Your Message",
                     sources=["microphone"],
+                    type="numpy"
                 )

+                process_btn = gr.Button("Process Message", elem_classes="process-button", variant="primary", size="lg")

+                # Status Display
                 status_display = gr.Textbox(
                     label="Status",
                     interactive=False,
+                    lines=2,
+                    value="Ready to start! Click 'Start Call' to begin."
                 )

             with gr.Column(scale=2):
@@ -421,23 +411,25 @@ def create_interface():
                     user_text_display = gr.Textbox(
                         label="What You Said",
                         interactive=False,
+                        lines=3,
+                        placeholder="Your speech will appear here..."
                     )

                 with gr.Column():
                     ai_text_display = gr.Textbox(
                         label="Maya's Response",
                         interactive=False,
+                        lines=3,
+                        placeholder="Maya's response will appear here..."
                     )

+        # Conversation History Section
         with gr.Row():
             with gr.Column():
                 gr.HTML("<h3>Conversation History</h3>")
                 history_btn = gr.Button("Show History", variant="secondary")
                 history_display = gr.Markdown(
+                    value="No conversation history yet. Start a call to begin chatting!",
                     label="Conversation Log"
                 )

@@ -463,21 +455,27 @@ def create_interface():
             outputs=[history_display]
         )

+        # Instructions
         gr.HTML("""
         <div style="margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 10px; border-left: 5px solid #007bff;">
             <h3>How to Use Maya AI:</h3>
             <ol>
+                <li><strong>Start Call:</strong> Click "Start Call" to initialize Maya</li>
                 <li><strong>Record:</strong> Use the microphone to record your message</li>
                 <li><strong>Process:</strong> Click "Process Message" to get Maya's response</li>
                 <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
+                <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges with context)</li>
                 <li><strong>End:</strong> Click "End Call" when finished</li>
             </ol>

+            <h4>Features:</h4>
+            <ul>
+                <li><strong>Speech Recognition:</strong> Powered by Whisper</li>
+                <li><strong>Smart Responses:</strong> Using Qwen2.5-1.5B</li>
+                <li><strong>Emotion Detection:</strong> Automatic emotion recognition</li>
+                <li><strong>Natural Speech:</strong> High-quality TTS with emotions</li>
+                <li><strong>Context Memory:</strong> Remembers conversation flow</li>
+            </ul>
         </div>
         """)
@@ -485,6 +483,13 @@

 if __name__ == "__main__":
     print("Initializing Maya AI System...")
+    print("Checking GPU availability...")
+
+    if torch.cuda.is_available():
+        print(f"✅ GPU detected: {torch.cuda.get_device_name()}")
+        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
+    else:
+        print("⚠️ No GPU detected, using CPU")

     if load_models():
         print("✅ All models loaded successfully!")
@@ -495,7 +500,8 @@ if __name__ == "__main__":
             server_name="0.0.0.0",
             server_port=7860,
             share=True,
+            show_error=True,
+            debug=False
         )
     else:
+        print("❌ Failed to load models. Please check the logs above for details.")