Devakumar868 committed on
Commit
dbc05eb
·
verified ·
1 Parent(s): 8dbae03

Update app.py

Files changed (1)
  1. app.py +142 -68
app.py CHANGED
@@ -10,6 +10,7 @@ import time
from datetime import datetime
import os
import sys
+ import gc

# Import with enhanced error handling
try:
@@ -56,6 +57,13 @@ class ConversationManager:
        self.history = []
        self.current_emotion = "neutral"

+ def optimize_gpu_memory():
+     """Optimize GPU memory usage"""
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+         torch.cuda.synchronize()
+     gc.collect()
+
def check_system_info():
    """Check system capabilities"""
    print("🔍 System Information:")
@@ -66,14 +74,20 @@ def check_system_info():
        print(f"✅ CUDA: {torch.cuda.get_device_name()}")
        print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
        print(f"🔥 CUDA Version: {torch.version.cuda}")
+
+         # Check current memory usage
+         allocated = torch.cuda.memory_allocated() / 1e9
+         cached = torch.cuda.memory_reserved() / 1e9
+         print(f"📊 Current GPU Usage: {allocated:.1f}GB allocated, {cached:.1f}GB cached")
    else:
        print("⚠️ CUDA not available, using CPU")

def load_models():
-     """Load all models with enhanced error handling"""
+     """Load all models with enhanced memory management"""
    global asr_pipe, qwen_model, qwen_tokenizer, tts_model, tts_type

    print("🚀 Loading Maya AI models...")
+     optimize_gpu_memory()

    # Load ASR model (Whisper)
    print("🎤 Loading Whisper for ASR...")
@@ -85,11 +99,12 @@ def load_models():
            device=0 if torch.cuda.is_available() else -1
        )
        print("✅ Whisper ASR loaded successfully!")
+         optimize_gpu_memory()
    except Exception as e:
        print(f"❌ Error loading Whisper: {e}")
        return False

-     # Load Qwen model
+     # Load Qwen model with memory optimization
    print("🧠 Loading Qwen2.5-1.5B for conversation...")
    try:
        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
@@ -102,23 +117,36 @@ def load_models():
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            trust_remote_code=True,
-             low_cpu_mem_usage=True
+             low_cpu_mem_usage=True,
+             max_memory={0: "6GB"} if torch.cuda.is_available() else None  # Limit Qwen memory
        )
        print("✅ Qwen loaded successfully!")
+         optimize_gpu_memory()
    except Exception as e:
        print(f"❌ Error loading Qwen: {e}")
        return False

-     # Load Dia TTS
+     # Load Dia TTS with optimized settings
    if DIA_AVAILABLE:
        try:
-             print("Attempting to load Dia TTS...")
+             print("Attempting to load Dia TTS with optimized settings...")
+
+             # Clear memory before loading Dia
+             optimize_gpu_memory()
+
            tts_model = Dia.from_pretrained(
                "nari-labs/Dia-1.6B",
-                 compute_dtype="float16" if torch.cuda.is_available() else "float32"
+                 compute_dtype="float16" if torch.cuda.is_available() else "float32",
+                 low_cpu_mem_usage=True
            )
+
+             # Move to GPU if available
+             if torch.cuda.is_available():
+                 tts_model = tts_model.cuda()
+
            tts_type = "dia"
            print("✅ Dia TTS loaded successfully!")
+             optimize_gpu_memory()
            return True
        except Exception as e:
            print(f"⚠️ Dia TTS failed to load: {e}")
@@ -199,7 +227,7 @@ def speech_to_text_with_emotion(audio_input):
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

        print("🔄 Running Whisper ASR...")
-         result = asr_pipe(audio_data)
+         result = asr_pipe(audio_data, language='en')  # Force English to avoid language detection

        transcription = result['text'].strip()
        print(f"Transcription: '{transcription}'")
@@ -217,8 +245,11 @@ def speech_to_text_with_emotion(audio_input):
        return "Sorry, I couldn't understand that. Please try again.", "neutral"

def generate_contextual_response(user_input, emotion, conversation_manager):
-     """Enhanced response generation"""
+     """Enhanced response generation with memory optimization"""
    try:
+         # Clear GPU cache before generation
+         optimize_gpu_memory()
+
        context = conversation_manager.get_context()

        emotional_prompts = {
@@ -237,7 +268,7 @@ Previous context: {context}
User emotion: {emotion}

Guidelines:
- - Keep responses concise (1-2 sentences)
+ - Keep responses very concise (1 sentence maximum)
- Be natural and conversational
- Show empathy and understanding
- Provide helpful responses
@@ -259,12 +290,13 @@ Guidelines:
        with torch.no_grad():
            generated_ids = qwen_model.generate(
                model_inputs.input_ids,
-                 max_new_tokens=100,
+                 max_new_tokens=50,  # Reduced for shorter responses
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1,
-                 pad_token_id=qwen_tokenizer.eos_token_id
+                 pad_token_id=qwen_tokenizer.eos_token_id,
+                 attention_mask=model_inputs.attention_mask  # Fix attention mask warning
            )

        generated_ids = [
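The two added keyword arguments address a common transformers warning: when a tokenizer defines no pad token, generate() cannot tell padding from content, so the commit reuses the EOS id and passes the tokenizer's attention mask explicitly. A self-contained sketch of the pattern:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
inputs = tok("Hello, Maya!", return_tensors="pt")
with torch.no_grad():
    out = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # marks real tokens vs padding
        pad_token_id=tok.eos_token_id,         # reuse EOS where no pad token exists
        max_new_tokens=50,
    )
print(tok.decode(out[0], skip_special_tokens=True))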
@@ -277,6 +309,9 @@ Guidelines:
        if response.startswith("Maya:"):
            response = response[5:].strip()

+         # Clear cache after generation
+         optimize_gpu_memory()
+
        return response

    except Exception as e:
@@ -284,64 +319,88 @@ Guidelines:
        return "I'm sorry, I'm having trouble processing that right now."

def text_to_speech_emotional(text, emotion="neutral"):
-     """FIXED TTS with proper audio format for Gradio"""
+     """FIXED TTS with enhanced Dia configuration and memory management"""
    try:
        if tts_model is None:
            print(f"🔊 Maya says ({emotion}): {text}")
            return None

-         # Clear GPU cache
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()
+         # Aggressive memory cleanup before TTS
+         optimize_gpu_memory()

        if tts_type == "dia":
+             # Simplified emotional markers for better audio quality
            emotional_markers = {
-                 "happy": "(excited) ",
-                 "sad": "(sad) ",
-                 "angry": "(calm) ",
-                 "surprised": "(surprised) ",
+                 "happy": "",  # Remove complex markers that might cause artifacts
+                 "sad": "",
+                 "angry": "",
+                 "surprised": "",
                "neutral": ""
            }

-             # Enhanced text for Dia
-             enhanced_text = f"[S1] {emotional_markers.get(emotion, '')}{text}"
-
-             # Add pauses for natural speech
-             if len(text) > 50:
-                 enhanced_text = enhanced_text.replace(". ", ". (pause) ")
-                 enhanced_text = enhanced_text.replace("! ", "! (pause) ")
-                 enhanced_text = enhanced_text.replace("? ", "? (pause) ")
-
-             print(f"Generating Dia TTS for: {enhanced_text}")
-
-             with torch.no_grad():
-                 audio_output = tts_model.generate(
-                     enhanced_text,
-                     use_torch_compile=False,
-                     verbose=False
-                 )
-
-             # FIXED: Proper audio processing for Gradio
-             if isinstance(audio_output, torch.Tensor):
-                 audio_output = audio_output.cpu().numpy()
-
-             # Ensure audio is in the right format
-             if len(audio_output.shape) > 1:
-                 audio_output = audio_output.squeeze()
-
-             # Normalize audio properly
-             if len(audio_output) > 0:
-                 max_val = np.max(np.abs(audio_output))
-                 if max_val > 0:
-                     audio_output = audio_output / max_val * 0.95
-
-             # CRITICAL FIX: Ensure audio is float32 and in correct range
-             audio_output = audio_output.astype(np.float32)
-
-             print(f"✅ Generated audio: shape={audio_output.shape}, dtype={audio_output.dtype}, range=[{audio_output.min():.3f}, {audio_output.max():.3f}]")
-
-             # Return in format Gradio expects: (sample_rate, audio_array)
-             return (44100, audio_output)
+             # Simplified text processing for Dia - NO COMPLEX MARKERS
+             # Keep it simple to avoid audio artifacts
+             enhanced_text = f"[S1] {text}"
+
+             # Remove pauses that might cause artifacts
+             # enhanced_text = enhanced_text.replace("(pause)", "")
+
+             # Limit text length to prevent memory issues
+             if len(enhanced_text) > 200:
+                 enhanced_text = enhanced_text[:200] + "..."
+
+             print(f"Generating Dia TTS for: {enhanced_text}")
+
+             try:
+                 with torch.no_grad():
+                     # Use more conservative settings for T4
+                     audio_output = tts_model.generate(
+                         enhanced_text,
+                         use_torch_compile=False,
+                         verbose=False,
+                         # Add these parameters for better quality
+                         temperature=0.7,
+                         top_p=0.9
+                     )
+
+                 # Enhanced audio processing
+                 if isinstance(audio_output, torch.Tensor):
+                     audio_output = audio_output.cpu().numpy()
+
+                 # Ensure proper audio format
+                 if len(audio_output.shape) > 1:
+                     audio_output = audio_output.squeeze()
+
+                 # More conservative normalization
+                 if len(audio_output) > 0:
+                     # Remove DC offset
+                     audio_output = audio_output - np.mean(audio_output)
+
+                     # Gentle normalization to prevent clipping
+                     max_val = np.max(np.abs(audio_output))
+                     if max_val > 0:
+                         audio_output = audio_output / max_val * 0.8  # More conservative scaling
+
+                 # Ensure correct data type
+                 audio_output = audio_output.astype(np.float32)
+
+                 # Validate audio output
+                 if np.any(np.isnan(audio_output)) or np.any(np.isinf(audio_output)):
+                     print("❌ Audio contains NaN or Inf values, regenerating...")
+                     return None
+
+                 print(f"✅ Generated audio: shape={audio_output.shape}, dtype={audio_output.dtype}, range=[{audio_output.min():.3f}, {audio_output.max():.3f}]")
+
+                 # Clear memory after generation
+                 optimize_gpu_memory()
+
+                 # Return audio with correct sample rate for Dia
+                 return (44100, audio_output)
+
+             except Exception as e:
+                 print(f"❌ Error in Dia generation: {e}")
+                 optimize_gpu_memory()
+                 return None

        else:
            print(f"🔊 Maya says ({emotion}): {text}")
@@ -349,6 +408,7 @@ def text_to_speech_emotional(text, emotion="neutral"):

    except Exception as e:
        print(f"❌ Error in TTS: {e}")
+         optimize_gpu_memory()
        print(f"🔊 Maya says ({emotion}): {text}")
        return None
@@ -358,19 +418,22 @@ conv_manager = ConversationManager()
def start_call():
    """Initialize call and return greeting"""
    conv_manager.clear()
-     greeting_text = "Hello! I'm Maya, your AI assistant. How can I help you today?"
+     optimize_gpu_memory()
+
+     greeting_text = "Hello! I'm Maya. How can I help you today?"  # Shorter greeting
    greeting_audio = text_to_speech_emotional(greeting_text, "happy")

    tts_status = f"Using {tts_type.upper()} TTS" if tts_type != "none" else "Text-only mode"
    return greeting_audio, greeting_text, f"📞 Call started! Maya is ready. {tts_status}"

def process_conversation(audio_input):
-     """Main conversation processing pipeline"""
+     """Main conversation processing pipeline with memory management"""
    if audio_input is None:
        return None, "Please record some audio first.", "", "❌ No audio input received."

    try:
        print("🔄 Processing conversation...")
+         optimize_gpu_memory()

        # STT + Emotion Detection
        user_text, emotion = speech_to_text_with_emotion(audio_input)
@@ -392,13 +455,19 @@ def process_conversation(audio_input):
        # Update history
        conv_manager.add_exchange(user_text, ai_response, emotion)

-         status = f"✅ Success! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5 | TTS: {tts_type.upper()}"
+         # Memory status
+         if torch.cuda.is_available():
+             allocated = torch.cuda.memory_allocated() / 1e9
+             status = f"✅ Success! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5 | GPU: {allocated:.1f}GB"
+         else:
+             status = f"✅ Success! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5"

        return response_audio, ai_response, user_text, status

    except Exception as e:
        error_msg = f"❌ Error: {str(e)}"
        print(error_msg)
+         optimize_gpu_memory()
        return None, "I'm sorry, I encountered an error. Please try again.", "", error_msg

def get_conversation_history():
@@ -416,15 +485,16 @@ def get_conversation_history():
    return history_text

def end_call():
-     """End call"""
+     """End call with memory cleanup"""
    farewell_text = "Thank you for talking with me! Have a wonderful day!"
    farewell_audio = text_to_speech_emotional(farewell_text, "happy")
    conv_manager.clear()
+     optimize_gpu_memory()

    return farewell_audio, farewell_text, "📞❌ Call ended. Thank you!"

def create_interface():
-     """Create Gradio interface with FIXED audio components"""
+     """Create Gradio interface with enhanced audio settings"""
    with gr.Blocks(
        title="Maya AI - Speech-to-Speech Assistant",
        theme=gr.themes.Soft()
@@ -462,14 +532,18 @@ def create_interface():

            with gr.Column(scale=2):
                gr.HTML("<h3>🔊 Maya's Response</h3>")
-                 # FIXED: Audio component with proper settings
+                 # Enhanced audio component with better settings
                response_audio = gr.Audio(
                    label="Maya's Voice Response",
                    type="numpy",
                    interactive=False,
-                     autoplay=True,  # Enable autoplay
+                     autoplay=True,
                    show_download_button=True,
-                     show_share_button=False
+                     show_share_button=False,
+                     waveform_options=gr.WaveformOptions(
+                         waveform_color="#01C6FF",
+                         waveform_progress_color="#0066CC"
+                     )
                )

        with gr.Row():
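With type="numpy", gr.Audio expects a (sample_rate, samples) tuple, and float arrays should sit in [-1, 1]; that is why the TTS path returns (44100, audio_output) as float32. A tiny self-contained check (the 440 Hz tone is an arbitrary test signal, not from the commit):

import numpy as np
import gradio as gr

def tone():
    sr = 44100
    t = np.linspace(0, 1.0, sr, endpoint=False)
    wave = 0.8 * np.sin(2 * np.pi * 440.0 * t)
    return (sr, wave.astype(np.float32))  # the tuple shape gr.Audio(type="numpy") expects

demo = gr.Interface(fn=tone, inputs=None, outputs=gr.Audio(type="numpy"))
# demo.launch()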
@@ -515,7 +589,7 @@ def create_interface():
            outputs=[history_display]
        )

-         # Instructions
+         # Enhanced instructions
        gr.HTML("""
        <div style="margin-top: 30px; padding: 25px; background: #f8f9fa; border-radius: 15px;">
            <h3>💡 How to Use Maya AI:</h3>
@@ -529,12 +603,12 @@ def create_interface():
        </ol>

        <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px;">
-             <p><strong>💡 Pro Tips:</strong></p>
+             <p><strong>🔧 Troubleshooting Audio Issues:</strong></p>
            <ul>
-                 <li>Speak clearly and close to your microphone</li>
-                 <li>Record for at least 2-3 seconds</li>
-                 <li>Use a quiet environment for best results</li>
-                 <li>Maya detects emotions and responds accordingly!</li>
+                 <li>If audio sounds weird, try refreshing the page</li>
+                 <li>Use the download button to save and test audio files</li>
+                 <li>Speak in a quiet environment for best results</li>
+                 <li>Keep responses short for better audio quality</li>
            </ul>
        </div>
    </div>