Update app.py
app.py
CHANGED
@@ -83,7 +83,7 @@ def check_system_info():
         print("⚠️ CUDA not available, using CPU")

 def load_models():
-    """Load all models with
+    """Load all models with FIXED Dia loading"""
     global asr_pipe, qwen_model, qwen_tokenizer, tts_model, tts_type

     print("🚀 Loading Maya AI models...")
@@ -104,7 +104,7 @@ def load_models():
         print(f"❌ Error loading Whisper: {e}")
         return False

-    # Load Qwen model
+    # Load Qwen model
    print("🧠 Loading Qwen2.5-1.5B for conversation...")
    try:
        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
@@ -116,9 +116,7 @@ def load_models():
            model_name,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-            max_memory={0: "6GB"} if torch.cuda.is_available() else None  # Limit Qwen memory
+            trust_remote_code=True
        )
        print("✅ Qwen loaded successfully!")
        optimize_gpu_memory()
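For reference, the corrected Qwen load reduces to a stock `transformers` call. A minimal self-contained sketch, assuming the fp16 weights fit in VRAM; names mirror the hunk above.

```python
# Minimal sketch of the corrected Qwen load (kwargs as in the diff above)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
qwen_tokenizer = AutoTokenizer.from_pretrained(model_name)
qwen_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None,
    trust_remote_code=True,
)
```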
@@ -126,18 +124,19 @@ def load_models():
        print(f"❌ Error loading Qwen: {e}")
        return False

-    # Load Dia TTS
+    # FIXED: Load Dia TTS without unsupported parameters
    if DIA_AVAILABLE:
        try:
-            print("Attempting to load Dia TTS with
+            print("Attempting to load Dia TTS with FIXED parameters...")

            # Clear memory before loading Dia
            optimize_gpu_memory()

+            # FIXED: Remove unsupported parameters
            tts_model = Dia.from_pretrained(
                "nari-labs/Dia-1.6B",
-                compute_dtype="float16" if torch.cuda.is_available() else "float32",
-                low_cpu_mem_usage=True
+                compute_dtype="float16" if torch.cuda.is_available() else "float32"
+                # Removed: low_cpu_mem_usage=True (not supported by Dia)
            )

            # Move to GPU if available
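Since this hunk exists because `Dia.from_pretrained` rejected `low_cpu_mem_usage`, a defensive variant can retry without optional kwargs when the installed release rejects them. A sketch only; the `from dia.model import Dia` import path is assumed from the nari-labs package, and the fallback is an assumption rather than Dia's documented API.

```python
import torch
from dia.model import Dia  # import path assumed from the nari-labs package

def load_dia_safely(repo: str = "nari-labs/Dia-1.6B"):
    """Retry without optional kwargs if this dia release rejects them."""
    dtype = "float16" if torch.cuda.is_available() else "float32"
    try:
        return Dia.from_pretrained(repo, compute_dtype=dtype)
    except TypeError:
        # Some releases may not accept compute_dtype either (assumption)
        return Dia.from_pretrained(repo)
```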
@@ -227,7 +226,7 @@ def speech_to_text_with_emotion(audio_input):
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

        print("🎤 Running Whisper ASR...")
-        result = asr_pipe(audio_data, language='en')  # Force English
+        result = asr_pipe(audio_data, language='en')  # Force English
        transcription = result['text'].strip()
        print(f"Transcription: '{transcription}'")
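One caveat on the forced-English call: on stock `transformers` ASR pipelines the language hint usually goes through `generate_kwargs` rather than a bare `language=` keyword, depending on the installed version. A hedged sketch of the full preprocessing path; `asr_pipe` is assumed to be the module-level Whisper pipeline.

```python
import librosa
import numpy as np

def transcribe_en(audio_data: np.ndarray, sample_rate: int, asr_pipe) -> str:
    # Whisper checkpoints expect mono float audio at 16 kHz
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    if sample_rate != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
    # Language hint via generate_kwargs (assumption; version-dependent)
    result = asr_pipe(audio_data, generate_kwargs={"language": "en"})
    return result["text"].strip()
```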
@@ -247,7 +246,6 @@ def speech_to_text_with_emotion(audio_input):
 def generate_contextual_response(user_input, emotion, conversation_manager):
     """Enhanced response generation with memory optimization"""
     try:
-        # Clear GPU cache before generation
        optimize_gpu_memory()

        context = conversation_manager.get_context()
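`optimize_gpu_memory()` is called throughout this diff but its body sits outside the changed hunks. A plausible stand-in, labeled hypothetical, is the usual cache-flush pair:

```python
# Hypothetical stand-in for the optimize_gpu_memory() helper used in this
# diff; its real implementation is not part of the changed hunks.
import gc
import torch

def optimize_gpu_memory():
    gc.collect()                      # drop unreferenced Python objects
    if torch.cuda.is_available():
        torch.cuda.empty_cache()      # return cached blocks to the driver
        torch.cuda.ipc_collect()      # reclaim inter-process memory handles
```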
@@ -290,13 +288,13 @@ Guidelines:
        with torch.no_grad():
            generated_ids = qwen_model.generate(
                model_inputs.input_ids,
-                max_new_tokens=50,
+                max_new_tokens=50,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1,
                pad_token_id=qwen_tokenizer.eos_token_id,
-                attention_mask=model_inputs.attention_mask
+                attention_mask=model_inputs.attention_mask
            )

        generated_ids = [
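The hunk cuts off at `generated_ids = [`; for context, the standard Qwen idiom this leads into strips the prompt tokens before decoding. A sketch reusing `qwen_model`/`qwen_tokenizer` from the load above; the chat-template message is illustrative.

```python
import torch

prompt = qwen_tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello"}],  # illustrative message
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = qwen_tokenizer([prompt], return_tensors="pt").to(qwen_model.device)

with torch.no_grad():
    generated_ids = qwen_model.generate(
        model_inputs.input_ids,
        max_new_tokens=50,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=qwen_tokenizer.eos_token_id,
        attention_mask=model_inputs.attention_mask,
    )

# Drop the prompt tokens so only the new reply is decoded
generated_ids = [
    out[len(inp):] for inp, out in zip(model_inputs.input_ids, generated_ids)
]
response = qwen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
```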
@@ -309,7 +307,6 @@ Guidelines:
        if response.startswith("Maya:"):
            response = response[5:].strip()

-        # Clear cache after generation
        optimize_gpu_memory()

        return response
@@ -319,32 +316,18 @@ Guidelines:
        return "I'm sorry, I'm having trouble processing that right now."

 def text_to_speech_emotional(text, emotion="neutral"):
-    """FIXED TTS with
+    """FIXED TTS with proper Dia configuration"""
    try:
        if tts_model is None:
            print(f"🔊 Maya says ({emotion}): {text}")
            return None

-        # Aggressive memory cleanup before TTS
        optimize_gpu_memory()

        if tts_type == "dia":
-            # Simplified
-            emotional_markers = {
-                "happy": "",  # Remove complex markers that might cause artifacts
-                "sad": "",
-                "angry": "",
-                "surprised": "",
-                "neutral": ""
-            }
-
-            # Simplified text processing for Dia - NO COMPLEX MARKERS
-            # Keep it simple to avoid audio artifacts
+            # Simplified text processing for Dia
            enhanced_text = f"[S1] {text}"

-            # Remove pauses that might cause artifacts
-            # enhanced_text = enhanced_text.replace("(pause)", "")
-
            # Limit text length to prevent memory issues
            if len(enhanced_text) > 200:
                enhanced_text = enhanced_text[:200] + "..."
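A small refinement worth considering (not in the diff): the 200-character cut slices mid-word, and the TTS will try to pronounce the fragment. Trimming at a word boundary avoids that; a sketch with a hypothetical helper name:

```python
# Hypothetical helper: trim at a word boundary instead of slicing mid-word
def trim_for_tts(text: str, limit: int = 200) -> str:
    if len(text) <= limit:
        return text
    cut = text[:limit].rsplit(" ", 1)[0]  # back up to the last full word
    return cut + "..."
```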
@@ -353,14 +336,10 @@ def text_to_speech_emotional(text, emotion="neutral"):

        try:
            with torch.no_grad():
-                # Use more conservative settings for T4
                audio_output = tts_model.generate(
                    enhanced_text,
                    use_torch_compile=False,
-                    verbose=False,
-                    # Add these parameters for better quality
-                    temperature=0.7,
-                    top_p=0.9
+                    verbose=False
                )

            # Enhanced audio processing
@@ -371,30 +350,28 @@ def text_to_speech_emotional(text, emotion="neutral"):
            if len(audio_output.shape) > 1:
                audio_output = audio_output.squeeze()

-            #
+            # Conservative normalization
            if len(audio_output) > 0:
                # Remove DC offset
                audio_output = audio_output - np.mean(audio_output)

-                # Gentle normalization
+                # Gentle normalization
                max_val = np.max(np.abs(audio_output))
                if max_val > 0:
-                    audio_output = audio_output / max_val * 0.8
+                    audio_output = audio_output / max_val * 0.8

                # Ensure correct data type
                audio_output = audio_output.astype(np.float32)

                # Validate audio output
                if np.any(np.isnan(audio_output)) or np.any(np.isinf(audio_output)):
-                    print("❌ Audio contains NaN or Inf values
+                    print("❌ Audio contains NaN or Inf values")
                    return None

                print(f"✅ Generated audio: shape={audio_output.shape}, dtype={audio_output.dtype}, range=[{audio_output.min():.3f}, {audio_output.max():.3f}]")

-                # Clear memory after generation
                optimize_gpu_memory()

-                # Return audio with correct sample rate for Dia
                return (44100, audio_output)

        except Exception as e:
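The post-processing steps above (squeeze, DC-offset removal, 0.8 peak normalization, float32 cast, NaN/Inf guard) factor cleanly into one helper. A sketch that just mirrors the diff's logic; the 0.8 peak leaves headroom against clipping.

```python
from typing import Optional
import numpy as np

def sanitize_audio(audio: np.ndarray) -> Optional[np.ndarray]:
    audio = np.asarray(audio).squeeze()
    if audio.size == 0:
        return None
    audio = audio - np.mean(audio)          # remove DC offset
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak * 0.8          # gentle peak normalization
    audio = audio.astype(np.float32)
    if np.any(np.isnan(audio)) or np.any(np.isinf(audio)):
        return None                         # refuse unplayable output
    return audio
```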
@@ -420,14 +397,14 @@ def start_call():
    conv_manager.clear()
    optimize_gpu_memory()

-    greeting_text = "Hello! I'm Maya. How can I help you today?"
+    greeting_text = "Hello! I'm Maya. How can I help you today?"
    greeting_audio = text_to_speech_emotional(greeting_text, "happy")

    tts_status = f"Using {tts_type.upper()} TTS" if tts_type != "none" else "Text-only mode"
    return greeting_audio, greeting_text, f"📞 Call started! Maya is ready. {tts_status}"

 def process_conversation(audio_input):
-    """Main conversation processing pipeline
+    """Main conversation processing pipeline"""
    if audio_input is None:
        return None, "Please record some audio first.", "", "❌ No audio input received."
@@ -494,7 +471,7 @@ def end_call():
    return farewell_audio, farewell_text, "📞 Call ended. Thank you!"

 def create_interface():
-    """Create Gradio interface
+    """Create Gradio interface"""
    with gr.Blocks(
        title="Maya AI - Speech-to-Speech Assistant",
        theme=gr.themes.Soft()
@@ -532,18 +509,12 @@ def create_interface():

        with gr.Column(scale=2):
            gr.HTML("<h3>🔊 Maya's Response</h3>")
-            # Enhanced audio component with better settings
            response_audio = gr.Audio(
                label="Maya's Voice Response",
                type="numpy",
                interactive=False,
                autoplay=True,
-                show_download_button=True,
-                show_share_button=False,
-                waveform_options=gr.WaveformOptions(
-                    waveform_color="#01C6FF",
-                    waveform_progress_color="#0066CC"
-                )
+                show_download_button=True
            )

        with gr.Row():
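The removed kwargs (`show_share_button`, `waveform_options`) only exist in newer Gradio releases, which is presumably why they failed here. If the app must build across Gradio versions, a feature-detecting construction is one option; a sketch, not Gradio API, that only passes kwargs the installed version accepts:

```python
import inspect
import gradio as gr

# Only pass kwargs the installed gr.Audio actually accepts (sketch)
audio_kwargs = dict(
    label="Maya's Voice Response",
    type="numpy",
    interactive=False,
    autoplay=True,
    show_download_button=True,
)
accepted = inspect.signature(gr.Audio.__init__).parameters
response_audio = gr.Audio(**{k: v for k, v in audio_kwargs.items() if k in accepted})
```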
@@ -589,7 +560,7 @@ def create_interface():
            outputs=[history_display]
        )

-        #
+        # Instructions
        gr.HTML("""
        <div style="margin-top: 30px; padding: 25px; background: #f8f9fa; border-radius: 15px;">
            <h3>💡 How to Use Maya AI:</h3>
@@ -603,12 +574,12 @@ def create_interface():
            </ol>

            <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px;">
-                <p><strong>🔧
+                <p><strong>🔧 Fixed Issues:</strong></p>
                <ul>
-                    <li
-                    <li
-                    <li
-                    <li
+                    <li>✅ Pydantic version pinned to 2.10.6 (fixes Gradio crash)</li>
+                    <li>✅ Dia TTS loading parameters corrected</li>
+                    <li>✅ Memory optimization for T4 GPU</li>
+                    <li>✅ Audio processing enhanced</li>
                </ul>
            </div>
        </div>
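The Pydantic fix in the list above lives in requirements.txt (`pydantic==2.10.6`) rather than in app.py. A small runtime guard can surface drift early; a sketch only, with the version string taken from the list above:

```python
from importlib.metadata import version

# Pin lives in requirements.txt as: pydantic==2.10.6
installed = version("pydantic")
if installed != "2.10.6":
    print(f"⚠️ pydantic {installed} installed; this Space was fixed against 2.10.6")
```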