Spaces:

Devakumar868
/

my-voice-assistant

Paused

App Files Files Community

Devakumar868 committed on Jun 22

Commit

7ffc610

verified ·

1 Parent(s): f86c823

Update app.py

Browse files

Files changed (1) hide show

app.py +441 -244

app.py CHANGED Viewed

@@ -1,291 +1,488 @@
-import os, torch, numpy as np, soundfile as sf, gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
-import nemo.collections.asr as nemo_asr
-from TTS.api import TTS
-from sklearn.linear_model import LogisticRegression
-from datasets import load_dataset
-import tempfile
-import gc
-# Configuration
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-SEED = 42; SAMPLE_RATE = 22050; TEMPERATURE = 0.7
-torch.manual_seed(SEED); np.random.seed(SEED)
-print(f"🚀 System Info:")
-print(f"Device: {DEVICE}")
-print(f"NumPy: {np.__version__}")
-print(f"PyTorch: {torch.__version__}")
-if torch.cuda.is_available():
-    print(f"CUDA: {torch.version.cuda}")
-class ConversationalAI:
-    def __init__(self):
-        print("🔄 Initializing Conversational AI...")
-        self.setup_models()
-        print("✅ All models loaded successfully!")
-    def setup_models(self):
-        # 1. ASR: Parakeet RNNT
-        print("📢 Loading ASR model...")
-        try:
-            self.asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
-                "nvidia/parakeet-rnnt-1.1b"
-            ).to(DEVICE).eval()
-            print("✅ Parakeet ASR loaded")
-        except Exception as e:
-            print(f"⚠️ Parakeet failed: {e}")
-            print("🔄 Loading Whisper fallback...")
-            self.asr_pipeline = pipeline(
-                "automatic-speech-recognition",
-                model="openai/whisper-base.en",
-                device=0 if DEVICE == "cuda" else -1
-            )
-            print("✅ Whisper ASR loaded")
-        # 2. SER: Emotion classifier (simplified for demo)
-        print("🎭 Setting up emotion recognition...")
-        X_demo = np.random.rand(100, 128)
-        y_demo = np.random.randint(0, 5, 100)  # 5 emotions: neutral, happy, sad, angry, surprised
-        self.ser_clf = LogisticRegression().fit(X_demo, y_demo)
-        self.emotion_labels = ["neutral", "happy", "sad", "angry", "surprised"]
-        print("✅ SER model ready")
-        # 3. LLM: Conversational model
-        print("🧠 Loading LLM...")
-        bnb_cfg = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_quant_type="nf4"
-        )
-        model_name = "microsoft/DialoGPT-medium"
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.tokenizer.pad_token = self.tokenizer.eos_token
-        self.llm_model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            quantization_config=bnb_cfg,
-            device_map="auto",
             torch_dtype=torch.float16,
-            low_cpu_mem_usage=True
         )
-        print("✅ LLM loaded")
-        # 4. TTS: Text-to-Speech
-        print("🗣️ Loading TTS...")
-        try:
-            self.tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(DEVICE)
-            print("✅ TTS loaded")
-        except Exception as e:
-            print(f"⚠️ TTS error: {e}")
-            self.tts = None
-        # Memory cleanup
-        if DEVICE == "cuda":
-            torch.cuda.empty_cache()
-            gc.collect()
-    def transcribe(self, audio):
-        """Convert speech to text"""
-        try:
-            if hasattr(self, 'asr_model'):
-                # Use Parakeet
-                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-                sf.write(temp_file.name, audio[1], audio[0])
-                transcription = self.asr_model.transcribe([temp_file.name])[0]
-                os.unlink(temp_file.name)
-                return transcription.text if hasattr(transcription, 'text') else str(transcription)
-            else:
-                # Use Whisper
-                return self.asr_pipeline({"sampling_rate": audio[0], "raw": audio[1]})["text"]
-        except Exception as e:
-            print(f"ASR Error: {e}")
-            return "Sorry, I couldn't understand the audio."
-    def predict_emotion(self):
-        """Predict emotion from audio (simplified demo)"""
-        emotion_idx = self.ser_clf.predict(np.random.rand(1, 128))[0]
-        return self.emotion_labels[emotion_idx]
-    def generate_response(self, text, emotion):
-        """Generate conversational response"""
-        try:
-            # Create emotion-aware prompt
-            prompt = f"Human: {text}\nAssistant (feeling {emotion}):"
-            inputs = self.tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True).to(DEVICE)
-            with torch.no_grad():
-                outputs = self.llm_model.generate(
-                    inputs,
-                    max_length=inputs.shape[1] + 100,
-                    temperature=TEMPERATURE,
-                    do_sample=True,
-                    pad_token_id=self.tokenizer.eos_token_id,
-                    no_repeat_ngram_size=2,
-                    top_p=0.9
-                )
-            response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
-            response = response.split("Human:")[0].strip()
-            return response if response else "I understand. Please tell me more."
-        except Exception as e:
-            print(f"LLM Error: {e}")
-            return "I'm having trouble processing that. Could you please rephrase?"
-    def synthesize(self, text):
-        """Convert text to speech"""
-        try:
-            if self.tts:
-                wav = self.tts.tts(text=text)
-                if isinstance(wav, list):
-                    wav = np.array(wav, dtype=np.float32)
-                # Normalize audio
-                wav = wav / np.max(np.abs(wav)) if np.max(np.abs(wav)) > 0 else wav
-                return (SAMPLE_RATE, (wav * 32767).astype(np.int16))
-            else:
-                # Return silence if TTS fails
-                return (SAMPLE_RATE, np.zeros(SAMPLE_RATE, dtype=np.int16))
-        except Exception as e:
-            print(f"TTS Error: {e}")
-            return (SAMPLE_RATE, np.zeros(SAMPLE_RATE, dtype=np.int16))
-    def process_conversation(self, audio_input, chat_history):
-        """Main pipeline: Speech -> Emotion -> LLM -> Speech"""
-        if audio_input is None:
-            return chat_history, None, ""
-        try:
-            # Step 1: Speech to Text
-            user_text = self.transcribe(audio_input)
-            if not user_text.strip():
-                return chat_history, None, "No speech detected."
-            # Step 2: Emotion Recognition
-            emotion = self.predict_emotion()
-            # Step 3: Generate Response
-            ai_response = self.generate_response(user_text, emotion)
-            # Step 4: Text to Speech
-            audio_response = self.synthesize(ai_response)
-            # Update chat history
-            chat_history.append([user_text, ai_response])
-            # Memory cleanup
-            if DEVICE == "cuda":
-                torch.cuda.empty_cache()
-                gc.collect()
-            return chat_history, audio_response, f"You said: {user_text} (detected emotion: {emotion})"
-        except Exception as e:
-            error_msg = f"Error processing conversation: {e}"
-            print(error_msg)
-            return chat_history, None, error_msg
-# Initialize AI system
-print("🚀 Starting Conversational AI...")
-ai_system = ConversationalAI()
-# Gradio Interface
 def create_interface():
     with gr.Blocks(
-        title="Emotion-Aware Conversational AI",
-        theme=gr.themes.Soft()
     ) as demo:
         gr.HTML("""
-            <div style="text-align: center; margin-bottom: 2rem;">
-                <h1>🤖 Emotion-Aware Conversational AI</h1>
-                <p>Speak naturally and get intelligent responses with emotion recognition</p>
-            </div>
         """)
         with gr.Row():
-            with gr.Column(scale=2):
-                chatbot = gr.Chatbot(
-                    label="Conversation History",
-                    height=400,
-                    show_copy_button=True
-                )
                 audio_input = gr.Audio(
-                    label="🎤 Speak to AI",
                     sources=["microphone"],
                     type="numpy",
-                    format="wav"
                 )
-                with gr.Row():
-                    submit_btn = gr.Button("💬 Process Speech", variant="primary", scale=2)
-                    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary", scale=1)
-            with gr.Column(scale=1):
-                audio_output = gr.Audio(
-                    label="🔊 AI Response",
-                    type="numpy",
-                    autoplay=True
-                )
                 status_display = gr.Textbox(
                     label="📊 Status",
-                    lines=3,
                     interactive=False
                 )
-                gr.HTML(f"""
-                    <div style="padding: 1rem; background: #f0f9ff; border-radius: 0.5rem;">
-                        <h3>🔧 System Info</h3>
-                        <p><strong>Device:</strong> {DEVICE.upper()}</p>
-                        <p><strong>PyTorch:</strong> {torch.__version__}</p>
-                        <p><strong>Models:</strong> Parakeet + DialoGPT + TTS</p>
-                        <p><strong>Features:</strong> Emotion Recognition</p>
-                    </div>
-                """)
-        def process_audio(audio, history):
-            return ai_system.process_conversation(audio, history)
-        def clear_conversation():
-            if DEVICE == "cuda":
-                torch.cuda.empty_cache()
-                gc.collect()
-            return [], None, "Conversation cleared."
-        # Event handlers
-        submit_btn.click(
-            fn=process_audio,
-            inputs=[audio_input, chatbot],
-            outputs=[chatbot, audio_output, status_display]
         )
-        clear_btn.click(
-            fn=clear_conversation,
-            outputs=[chatbot, audio_output, status_display]
         )
-        audio_input.change(
-            fn=process_audio,
-            inputs=[audio_input, chatbot],
-            outputs=[chatbot, audio_output, status_display]
         )
     return demo
-# Launch application
 if __name__ == "__main__":
-    print("🌟 Creating interface...")
-    demo = create_interface()
-    print("🚀 Launching application...")
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True,
-        show_error=True
-    )

+import gradio as gr
+import torch
+import numpy as np
+import librosa
+import soundfile as sf
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+from dia.model import Dia
+import warnings
+import json
+import time
+from datetime import datetime
+import os
+# Silence every Python warning for the whole process.
+warnings.filterwarnings("ignore")
+# Global models
+# Module-level handles for the three models. They start as None and are
+# assigned by load_models(); the pipeline functions below read them as globals.
+ultravox_pipe = None
+qwen_model = None
+qwen_tokenizer = None
+dia_model = None
+# NOTE(review): not referenced anywhere else in the visible code; exchanges
+# are stored by ConversationManager instead — presumably a leftover.
+conversation_history = []
+class ConversationManager:
+    """Keep a short rolling log of user/AI exchanges for one conversation.
+
+    Attributes:
+        history: Exchange dicts, oldest first, each with the keys
+            "timestamp", "user", "ai" and "emotion".
+        max_exchanges: Maximum number of exchanges retained in history.
+        current_emotion: Emotion label, reset to "neutral" by clear().
+            NOTE(review): nothing in this class updates it after
+            construction; confirm whether add_exchange() should.
+    """
+
+    def __init__(self, max_exchanges=5):
+        self.history = []
+        self.max_exchanges = max_exchanges
+        self.current_emotion = "neutral"
+
+    def add_exchange(self, user_input, ai_response, emotion="neutral"):
+        """Append one exchange, then drop the oldest ones beyond the limit."""
+        self.history.append({
+            "timestamp": datetime.now().isoformat(),
+            "user": user_input,
+            "ai": ai_response,
+            "emotion": emotion,
+        })
+        # Count the overflow instead of slicing with -max_exchanges:
+        # history[-0:] is the whole list, so a limit of 0 never trimmed.
+        overflow = len(self.history) - max(self.max_exchanges, 0)
+        if overflow > 0:
+            self.history = self.history[overflow:]
+
+    def get_context(self):
+        """Return the last three exchanges as alternating User:/AI: lines."""
+        return "".join(
+            f"User: {exchange['user']}\nAI: {exchange['ai']}\n"
+            for exchange in self.history[-3:]  # Last 3 exchanges for context
+        )
+
+    def clear(self):
+        """Forget all exchanges and reset the emotion to "neutral"."""
+        self.history = []
+        self.current_emotion = "neutral"
+def load_models():
+    """Load the Ultravox, Qwen and Dia models into the module-level globals.
+
+    The models are loaded in that order and loading stops at the first
+    failure.
+
+    Returns:
+        True when all three loaders succeeded; False as soon as one of them
+        raised (the exception is printed, not re-raised).
+    """
+    global ultravox_pipe, qwen_model, qwen_tokenizer, dia_model
+    print("🚀 Loading Ultravox for ASR + Emotion Recognition...")
+    try:
+        # trust_remote_code=True lets transformers run the Python code that
+        # ships inside the model repository.
+        ultravox_pipe = pipeline(
+            model='fixie-ai/ultravox-v0_4',
+            trust_remote_code=True,
             torch_dtype=torch.float16,
+            device_map="auto"
         )
+        print("✅ Ultravox loaded successfully!")
+    except Exception as e:
+        print(f"❌ Error loading Ultravox: {e}")
+        return False
+    print("🧠 Loading Qwen2.5-1.5B for conversation...")
+    try:
+        # Tokenizer and model come from the same checkpoint name.
+        qwen_tokenizer = AutoTokenizer.from_pretrained(
+            "Qwen/Qwen2.5-1.5B-Instruct",
+            trust_remote_code=True
+        )
+        qwen_model = AutoModelForCausalLM.from_pretrained(
+            "Qwen/Qwen2.5-1.5B-Instruct",
+            torch_dtype=torch.float16,
+            device_map="auto",
+            trust_remote_code=True
+        )
+        print("✅ Qwen loaded successfully!")
+    except Exception as e:
+        print(f"❌ Error loading Qwen: {e}")
+        return False
+    print("🎙️ Loading Enhanced Dia TTS...")
+    try:
+        dia_model = Dia.from_pretrained(
+            "nari-labs/Dia-1.6B",
+            compute_dtype="float16"
+        )
+        print("✅ Dia TTS loaded successfully!")
+    except Exception as e:
+        print(f"❌ Error loading Dia: {e}")
+        return False
+    # Reached only when none of the three loaders raised.
+    return True
+def detect_emotion_from_speech(audio_input):
+    """Ask Ultravox for the emotional tone of an utterance.
+
+    Args:
+        audio_input: Audio samples forwarded unchanged to the Ultravox
+            pipeline together with sampling_rate=16000.
+
+    Returns:
+        "happy", "sad", "angry", "surprised" or "neutral". "neutral" is
+        also returned when the reply names none of these or when the
+        pipeline call fails.
+    """
+    valid_emotions = ("happy", "sad", "angry", "surprised", "neutral")
+    try:
+        # Use Ultravox to understand speech context
+        turns = [
+            {"role": "system", "content": "Analyze the emotional tone of the user's speech. Respond with just the emotion: happy, sad, angry, surprised, or neutral."},
+        ]
+        result = ultravox_pipe({
+            'audio': audio_input,
+            'turns': turns,
+            'sampling_rate': 16000
+        }, max_new_tokens=10)
+        # NOTE(review): the Ultravox custom pipeline appears to return the
+        # reply as a plain string; the list-of-dicts shape is accepted too.
+        reply = result if isinstance(result, str) else result[0]['generated_text']
+        # The model may answer "Happy." or "The tone is sad", so scan the
+        # reply word by word instead of requiring an exact match.
+        words = "".join(ch if ch.isalpha() else " " for ch in reply.lower()).split()
+        return next((word for word in words if word in valid_emotions), "neutral")
+    except Exception as e:
+        # Emotion detection is best-effort: report the failure and fall back.
+        print(f"Error in emotion detection: {e}")
+        return "neutral"
+def speech_to_text_with_emotion(audio_input):
+    """Transcribe an utterance with Ultravox and detect its emotion.
+
+    Args:
+        audio_input: None, a (sample_rate, samples) tuple as produced by a
+            Gradio Audio component with type="numpy", or a bare sample
+            array that is assumed to be 16 kHz already.
+
+    Returns:
+        A (transcription, emotion) tuple. ("", "neutral") when audio_input
+        is None; ("Sorry, I couldn't understand that.", "neutral") when
+        any step raises.
+    """
+    if audio_input is None:
+        return "", "neutral"
+    try:
+        # Convert audio format if needed
+        if isinstance(audio_input, tuple):
+            sample_rate, audio_data = audio_input
+            audio_data = np.asarray(audio_data)
+            if np.issubdtype(audio_data.dtype, np.integer):
+                # Integer PCM (Gradio delivers int16) must be scaled into
+                # [-1, 1]; a bare float cast would leave values up to 32767.
+                full_scale = float(np.iinfo(audio_data.dtype).max)
+                audio_data = audio_data.astype(np.float32) / full_scale
+            else:
+                audio_data = audio_data.astype(np.float32)
+            # Down-mix multi-channel recordings to mono.
+            if audio_data.ndim > 1:
+                audio_data = audio_data.mean(axis=1)
+        else:
+            audio_data = audio_input
+            sample_rate = 16000
+        # Resample to 16kHz if needed
+        if sample_rate != 16000:
+            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
+        # Speech to text using Ultravox
+        turns = [
+            {"role": "system", "content": "Transcribe the user's speech accurately. Only provide the transcription."},
+        ]
+        result = ultravox_pipe({
+            'audio': audio_data,
+            'turns': turns,
+            'sampling_rate': 16000
+        }, max_new_tokens=100)
+        # NOTE(review): the Ultravox custom pipeline appears to return the
+        # reply as a plain string; the list-of-dicts shape is accepted too.
+        reply = result if isinstance(result, str) else result[0]['generated_text']
+        transcription = reply.strip()
+        # Detect emotion
+        emotion = detect_emotion_from_speech(audio_data)
+        return transcription, emotion
+    except Exception as e:
+        print(f"Error in STT: {e}")
+        return "Sorry, I couldn't understand that.", "neutral"
+def generate_contextual_response(user_input, emotion, conversation_manager):
+    """Generate Maya's reply to the user with the Qwen chat model.
+
+    Args:
+        user_input: Text of the user's message.
+        emotion: Emotion label used to pick the tone instruction; unknown
+            labels fall back to the "neutral" instruction.
+        conversation_manager: Object whose get_context() returns the recent
+            exchanges as text for the system prompt.
+
+    Returns:
+        The stripped reply text, or a fixed apology string when anything
+        raises (the exception is printed).
+    """
+    try:
+        context = conversation_manager.get_context()
+        # Emotional system prompt
+        emotional_prompts = {
+            "happy": "Respond with enthusiasm and joy. Use exclamations and positive language.",
+            "sad": "Respond with empathy and comfort. Be gentle and understanding.",
+            "angry": "Respond calmly and try to de-escalate. Be patient and helpful.",
+            "surprised": "Share in the surprise and excitement. Be engaging and curious.",
+            "neutral": "Respond naturally and conversationally."
+        }
+        system_prompt = f"""You are Maya, a friendly and emotionally intelligent AI assistant.
+        {emotional_prompts.get(emotion, emotional_prompts['neutral'])}
+        Previous conversation context:
+        {context}
+        Current user emotion: {emotion}
+        Guidelines:
+        - Keep responses concise (1-2 sentences)
+        - Match the user's emotional tone
+        - Be natural and conversational
+        - Include emotional expressions when appropriate like (laughs), (sighs), etc.
+        """
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_input}
+        ]
+        # Generate response
+        text = qwen_tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        model_inputs = qwen_tokenizer([text], return_tensors="pt").to(qwen_model.device)
+        with torch.no_grad():
+            # Unpack the whole encoding so generate() receives attention_mask
+            # as well as input_ids; with pad_token_id set to EOS the mask
+            # cannot be inferred reliably from the ids alone.
+            generated_ids = qwen_model.generate(
+                **model_inputs,
+                max_new_tokens=100,
+                do_sample=True,
+                temperature=0.7,
+                pad_token_id=qwen_tokenizer.eos_token_id
+            )
+        # generate() returns prompt + continuation; keep only the new tokens.
+        generated_ids = [
+            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+        response = qwen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        return response.strip()
+    except Exception as e:
+        print(f"Error in response generation: {e}")
+        return "I'm sorry, I'm having trouble processing that right now."
+def text_to_speech_emotional(text, emotion="neutral", speaker="S1"):
+    """Synthesize text with Dia, prefixed by a speaker tag and emotion marker.
+
+    Args:
+        text: Sentence(s) to speak.
+        emotion: Label selecting the marker put in front of the text;
+            unknown labels add no marker.
+        speaker: Speaker id written into the leading "[...]" tag.
+
+    Returns:
+        A (44100, samples) tuple on success, or None when anything raised
+        (the exception is printed).
+    """
+    # Emotional markers for Dia
+    markers = {
+        "happy": "(excited) ",
+        "sad": "(sad) ",
+        "angry": "(frustrated) ",
+        "surprised": "(surprised) ",
+        "neutral": ""
+    }
+    try:
+        # Clear GPU cache
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        prefix = markers.get(emotion, '')
+        script = f"[{speaker}] {prefix}{text}"
+        # Longer texts get a pause marker after each sentence ending.
+        if len(text) > 50:
+            for ending in (". ", "! ", "? "):
+                script = script.replace(ending, ending + "(pause) ")
+        print(f"Generating TTS for: {script[:100]}...")
+        # Generate audio
+        with torch.no_grad():
+            waveform = dia_model.generate(
+                script,
+                use_torch_compile=False,
+                verbose=False
+            )
+        # Tensors are moved to the CPU and converted to NumPy.
+        if isinstance(waveform, torch.Tensor):
+            waveform = waveform.cpu().numpy()
+        # Scale down only when the signal exceeds full scale.
+        if len(waveform) > 0:
+            peak = np.max(np.abs(waveform))
+            if peak > 1.0:
+                waveform = waveform / peak * 0.95
+        return (44100, waveform)
+    except Exception as e:
+        print(f"Error in TTS: {e}")
+        return None
+# Initialize conversation manager
+# One module-level instance used by all call handlers below.
+# NOTE(review): being module-level, it is presumably shared by every user
+# connected to this process — confirm that is intended.
+conv_manager = ConversationManager()
+def start_call():
+    """Reset the conversation and produce Maya's spoken greeting.
+
+    Returns:
+        A (greeting audio, greeting text, status message) tuple.
+    """
+    conv_manager.clear()
+    welcome = "Hello! I'm Maya, your AI assistant. How can I help you today?"
+    spoken = text_to_speech_emotional(welcome, "happy")
+    return spoken, welcome, "Call started! 📞"
+def process_conversation(audio_input):
+    """Run one turn: speech -> text + emotion -> reply text -> reply speech.
+
+    Args:
+        audio_input: Recorded audio from the Gradio Audio component, or
+            None when nothing was recorded.
+
+    Returns:
+        A (response audio, AI reply text, user text, status message) tuple.
+        The audio is None when there was no input, no speech was
+        recognised, or an exception occurred.
+    """
+    if audio_input is None:
+        return None, "Please record some audio first.", "", "No audio input received."
+    try:
+        # Step 1: Speech to Text + Emotion Detection
+        user_text, emotion = speech_to_text_with_emotion(audio_input)
+        if not user_text or not user_text.strip():
+            return None, "I didn't catch that. Could you please repeat?", "", "No speech detected."
+        # Step 2: Generate contextual response
+        ai_response = generate_contextual_response(user_text, emotion, conv_manager)
+        # Step 3: Convert to speech
+        response_audio = text_to_speech_emotional(ai_response, emotion)
+        # Step 4: Update conversation history
+        conv_manager.add_exchange(user_text, ai_response, emotion)
+        # Show the manager's real limit rather than a hard-coded "/5".
+        status = (
+            f"✅ Processed | Emotion: {emotion} | "
+            f"Exchange: {len(conv_manager.history)}/{conv_manager.max_exchanges}"
+        )
+        return response_audio, ai_response, user_text, status
+    except Exception as e:
+        error_msg = f"❌ Error processing conversation: {str(e)}"
+        # Log as well as display, so the failure shows up in the server output.
+        print(error_msg)
+        return None, "I'm sorry, I encountered an error. Please try again.", "", error_msg
+def get_conversation_history():
+    """Render the stored exchanges as one Markdown string.
+
+    Returns:
+        "No conversation history yet." when nothing is stored; otherwise a
+        heading followed by one numbered entry per exchange.
+    """
+    exchanges = conv_manager.history
+    if not exchanges:
+        return "No conversation history yet."
+    parts = ["📋 **Conversation History:**\n\n"]
+    for number, item in enumerate(exchanges, start=1):
+        # ISO timestamp cut to seconds, with the "T" separator replaced.
+        when = item['timestamp'][:19].replace('T', ' ')
+        parts.append(f"**Exchange {number}** ({when}) - Emotion: {item['emotion']}\n")
+        parts.append(f"👤 **You:** {item['user']}\n")
+        parts.append(f"🤖 **Maya:** {item['ai']}\n\n")
+    return "".join(parts)
+def end_call():
+    """Speak a farewell and wipe the conversation state.
+
+    Returns:
+        A (farewell audio, farewell text, status message) tuple.
+    """
+    goodbye = "Thank you for talking with me! Have a great day!"
+    # The farewell is synthesized before the history is cleared.
+    spoken = text_to_speech_emotional(goodbye, "happy")
+    conv_manager.clear()
+    return spoken, goodbye, "Call ended. 📞❌"
+# Create Gradio Interface
 def create_interface():
+    """Build the Gradio Blocks UI for Maya AI and wire up its buttons.
+
+    Returns:
+        The gr.Blocks instance; it is not launched here.
+    """
     with gr.Blocks(
+        title="Maya AI - Advanced Speech-to-Speech Assistant",
+        theme=gr.themes.Soft(),
+        css="""
+        .call-button { background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important; }
+        .record-button { background: linear-gradient(45deg, #45B7D1, #96CEB4) !important; }
+        .end-button { background: linear-gradient(45deg, #FFA07A, #FF6347) !important; }
+        """
     ) as demo:
+        # Page header banner.
         gr.HTML("""
+        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 20px;">
+            <h1 style="color: white; margin: 0; font-size: 2.5em;">🎙️ Maya AI</h1>
+            <p style="color: white; margin: 10px 0; font-size: 1.2em;">Advanced Speech-to-Speech Conversational AI</p>
+            <p style="color: #E8E8E8; margin: 0;">Natural • Emotional • Contextual</p>
+        </div>
         """)
+        # Main row: controls and input in the narrow left column, Maya's
+        # output in the wider right column.
         with gr.Row():
+            with gr.Column(scale=1):
+                # Call Controls
+                gr.HTML("<h3>📞 Call Controls</h3>")
+                start_btn = gr.Button("📞 Start Call", elem_classes="call-button", size="lg")
+                end_btn = gr.Button("📞❌ End Call", elem_classes="end-button", size="lg")
+                # Audio Input
+                gr.HTML("<h3>🎤 Voice Input</h3>")
                 audio_input = gr.Audio(
+                    label="Record Your Message",
                     sources=["microphone"],
                     type="numpy",
+                    elem_classes="record-button"
                 )
+                process_btn = gr.Button("🎯 Process Message", variant="primary", size="lg")
+                # Status
                 status_display = gr.Textbox(
                     label="📊 Status",
+                    interactive=False,
+                    lines=2
+                )
+            with gr.Column(scale=2):
+                # AI Response Audio
+                gr.HTML("<h3>🔊 Maya's Response</h3>")
+                response_audio = gr.Audio(
+                    label="Maya's Voice Response",
+                    type="numpy",
                     interactive=False
                 )
+                # Text Displays
+                with gr.Row():
+                    with gr.Column():
+                        user_text_display = gr.Textbox(
+                            label="👤 What You Said",
+                            interactive=False,
+                            lines=3
+                        )
+                    with gr.Column():
+                        ai_text_display = gr.Textbox(
+                            label="🤖 Maya's Response",
+                            interactive=False,
+                            lines=3
+                        )
+        # Conversation History
+        with gr.Row():
+            with gr.Column():
+                gr.HTML("<h3>📋 Conversation History</h3>")
+                history_btn = gr.Button("📋 Show History", variant="secondary")
+                history_display = gr.Markdown(
+                    value="No conversation history yet.",
+                    label="Conversation Log"
+                )
+        # Event Handlers
+        # start_call and end_call each return (audio, text, status), which
+        # maps onto the three outputs listed for them.
+        start_btn.click(
+            fn=start_call,
+            outputs=[response_audio, ai_text_display, status_display]
         )
+        # process_conversation returns (audio, AI text, user text, status),
+        # in the same order as the four outputs here.
+        process_btn.click(
+            fn=process_conversation,
+            inputs=[audio_input],
+            outputs=[response_audio, ai_text_display, user_text_display, status_display]
         )
+        end_btn.click(
+            fn=end_call,
+            outputs=[response_audio, ai_text_display, status_display]
         )
+        # The history panel is refreshed only when this button is clicked.
+        history_btn.click(
+            fn=get_conversation_history,
+            outputs=[history_display]
+        )
+        # Usage Instructions
+        gr.HTML("""
+        <div style="margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 10px; border-left: 5px solid #007bff;">
+            <h3>💡 How to Use Maya AI:</h3>
+            <ol>
+                <li><strong>Start Call:</strong> Click "📞 Start Call" to begin your conversation</li>
+                <li><strong>Record:</strong> Use the microphone to record your message</li>
+                <li><strong>Process:</strong> Click "🎯 Process Message" to get Maya's response</li>
+                <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
+                <li><strong>Continue:</strong> Keep the conversation going (up to 5 exchanges)</li>
+                <li><strong>End:</strong> Click "📞❌ End Call" when finished</li>
+            </ol>
+            <h4>🎭 Emotional Features:</h4>
+            <p>Maya automatically detects your emotions and responds accordingly with natural expressions, breathing pauses, and contextual understanding!</p>
+        </div>
+        """)
     return demo
 if __name__ == "__main__":
+    # Script entry point: load every model first and start the UI only when
+    # load_models() reports success.
+    print("🚀 Initializing Maya AI System...")
+    if load_models():
+        print("✅ All models loaded successfully!")
+        print("🌟 Launching Maya AI Interface...")
+        demo = create_interface()
+        # NOTE(review): share=True asks Gradio for a public share link;
+        # presumably unnecessary when the app runs as a hosted Space — confirm.
+        demo.launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=True,
+            show_error=True
+        )
+    else:
+        # Nothing is raised here, so the script simply ends after the message.
+        print("❌ Failed to load models. Please check your setup.")