Update app.py
app.py
CHANGED
@@ -20,15 +20,6 @@ except ImportError as e:
     print(f"⚠️ Dia TTS not available: {e}")
     DIA_AVAILABLE = False
 
-# Fallback TTS import
-try:
-    from TTS.api import TTS
-    COQUI_TTS_AVAILABLE = True
-    print("✅ Coqui TTS library available as fallback")
-except ImportError:
-    COQUI_TTS_AVAILABLE = False
-    print("⚠️ Coqui TTS not available")
-
 warnings.filterwarnings("ignore")
 
 # Global models
@@ -36,7 +27,7 @@ asr_pipe = None
 qwen_model = None
 qwen_tokenizer = None
 tts_model = None
-tts_type = None
+tts_type = None
 
 class ConversationManager:
     def __init__(self, max_exchanges=5):
@@ -84,15 +75,15 @@ def load_models():
 
     print("🚀 Loading Maya AI models...")
 
-    # Load ASR model (Whisper)
+    # Load ASR model (Whisper) - FIXED VERSION
     print("🎤 Loading Whisper for ASR...")
     try:
         asr_pipe = pipeline(
             "automatic-speech-recognition",
             model="openai/whisper-base",
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device=0 if torch.cuda.is_available() else -1,
-            return_timestamps
+            device=0 if torch.cuda.is_available() else -1
+            # Removed return_timestamps and other problematic parameters
         )
         print("✅ Whisper ASR loaded successfully!")
     except Exception as e:
@@ -119,10 +110,7 @@ def load_models():
         print(f"❌ Error loading Qwen: {e}")
         return False
 
-    # Load
-    print("🎙️ Loading TTS model...")
-
-    # Try Dia TTS first (preferred)
+    # Load Dia TTS
     if DIA_AVAILABLE:
         try:
             print("Attempting to load Dia TTS...")
@@ -137,20 +125,6 @@ def load_models():
             print(f"⚠️ Dia TTS failed to load: {e}")
             tts_model = None
 
-    # Fallback to Coqui TTS
-    if COQUI_TTS_AVAILABLE:
-        try:
-            print("Attempting to load Coqui TTS as fallback...")
-            tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
-            if torch.cuda.is_available():
-                tts_model = tts_model.to("cuda")
-            tts_type = "coqui"
-            print("✅ Coqui TTS loaded successfully!")
-            return True
-        except Exception as e:
-            print(f"⚠️ Coqui TTS failed to load: {e}")
-            tts_model = None
-
     # Continue without TTS (text-only mode)
     print("⚠️ No TTS available, running in text-only mode")
     tts_type = "none"
@@ -192,14 +166,17 @@ def detect_emotion_from_text(text):
     return 'neutral'
 
 def speech_to_text_with_emotion(audio_input):
-    """
+    """FIXED STT function with proper audio processing"""
     try:
         if audio_input is None:
             return "", "neutral"
 
-
+        print("🎤 Processing audio input...")
+
+        # Process audio input with enhanced handling
         if isinstance(audio_input, tuple):
             sample_rate, audio_data = audio_input
+            print(f"Audio input: sample_rate={sample_rate}, shape={audio_data.shape}, dtype={audio_data.dtype}")
 
         # Handle different audio formats
         if audio_data.dtype == np.int16:
@@ -218,33 +195,43 @@ def speech_to_text_with_emotion(audio_input):
 
         # Validate audio length
         if len(audio_data) < 1600:  # Less than 0.1 seconds at 16kHz
-            return "Audio too short, please speak
+            return "Audio too short, please speak for at least 1 second", "neutral"
+
+        # Check for silence (audio with very low amplitude)
+        max_amplitude = np.max(np.abs(audio_data))
+        if max_amplitude < 0.01:  # Very quiet audio
+            return "Audio too quiet, please speak louder", "neutral"
 
         # Normalize audio
-        if
-
-        if max_val > 0:
-            audio_data = audio_data / max_val * 0.95
+        if max_amplitude > 0:
+            audio_data = audio_data / max_amplitude * 0.95
 
-        # Resample to 16kHz if needed
+        # Resample to 16kHz if needed (Whisper expects 16kHz)
         if sample_rate != 16000:
+            print(f"Resampling from {sample_rate}Hz to 16000Hz...")
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
 
-
-
+        print(f"Final audio: length={len(audio_data)}, max_amplitude={np.max(np.abs(audio_data)):.3f}")
+
+        # FIXED: Call ASR pipeline without sampling_rate parameter
+        print("🎧 Running Whisper ASR...")
+        result = asr_pipe(audio_data)  # Removed sampling_rate parameter
+
         transcription = result['text'].strip()
+        print(f"Transcription: '{transcription}'")
 
-        if not transcription:
-            return "No speech detected", "neutral"
+        if not transcription or len(transcription) < 2:
+            return "No clear speech detected, please try speaking more clearly", "neutral"
 
         # Detect emotion from transcription
         emotion = detect_emotion_from_text(transcription)
+        print(f"Detected emotion: {emotion}")
 
         return transcription, emotion
 
     except Exception as e:
-        print(f"Error in STT: {e}")
-        return "Sorry, I couldn't understand that.", "neutral"
+        print(f"❌ Error in STT: {e}")
+        return "Sorry, I couldn't understand that. Please try again.", "neutral"
 
 def generate_contextual_response(user_input, emotion, conversation_manager):
     """Enhanced response generation with better emotional intelligence"""
@@ -327,7 +314,7 @@ Guidelines:
         return "I'm sorry, I'm having trouble processing that right now. Could you please try again?"
 
 def text_to_speech_emotional(text, emotion="neutral"):
-    """Enhanced TTS with
+    """Enhanced TTS with Dia support"""
     try:
         if tts_model is None:
             print(f"🔊 Maya says ({emotion}): {text}")
@@ -378,38 +365,6 @@ def text_to_speech_emotional(text, emotion="neutral"):
                 audio_output = audio_output / max_val * 0.95
 
             return (44100, audio_output)
-
-        elif tts_type == "coqui":
-            # Coqui TTS processing
-            emotional_prefixes = {
-                "happy": "[Speaking with joy] ",
-                "sad": "[Speaking gently] ",
-                "angry": "[Speaking calmly] ",
-                "surprised": "[Speaking with excitement] ",
-                "fearful": "[Speaking reassuringly] ",
-                "disgusted": "[Speaking understandingly] ",
-                "neutral": ""
-            }
-
-            enhanced_text = f"{emotional_prefixes.get(emotion, '')}{text}"
-
-            print(f"Generating Coqui TTS for: {enhanced_text}")
-
-            audio_output = tts_model.tts(text=enhanced_text)
-
-            # Convert to numpy array if needed
-            if isinstance(audio_output, list):
-                audio_output = np.array(audio_output, dtype=np.float32)
-            elif torch.is_tensor(audio_output):
-                audio_output = audio_output.cpu().numpy().astype(np.float32)
-
-            # Normalize audio
-            if len(audio_output) > 0:
-                max_val = np.max(np.abs(audio_output))
-                if max_val > 1.0:
-                    audio_output = audio_output / max_val * 0.95
-
-            return (22050, audio_output)
 
         else:
             # Text-only mode
@@ -439,11 +394,18 @@ def process_conversation(audio_input):
         return None, "Please record some audio first.", "", "❌ No audio input received."
 
     try:
+        print("🔄 Processing conversation...")
+
        # Step 1: Speech to Text + Emotion Detection
         user_text, emotion = speech_to_text_with_emotion(audio_input)
 
-
-
+        # Check for error messages from STT
+        error_phrases = ["audio too short", "audio too quiet", "no clear speech", "sorry", "couldn't understand"]
+        if any(phrase in user_text.lower() for phrase in error_phrases):
+            return None, user_text, "", f"❌ STT Issue: {user_text}"
+
+        if not user_text or user_text.strip() == "":
+            return None, "I didn't catch that clearly. Could you please speak a bit louder and closer to the microphone?", "", "❌ No speech detected."
 
         # Step 2: Generate contextual response
         ai_response = generate_contextual_response(user_text, emotion, conv_manager)
@@ -535,7 +497,7 @@ def create_interface():
             # Audio Input
             gr.HTML("<h3 style='color: #333; margin: 20px 0 15px 0;'>🎤 Voice Input</h3>")
             audio_input = gr.Audio(
-                label="Record Your Message",
+                label="Record Your Message (Speak clearly for at least 2 seconds)",
                 sources=["microphone"],
                 type="numpy",
                 format="wav"
@@ -620,7 +582,7 @@ def create_interface():
                 <h4 style="color: #007bff;">📋 Getting Started:</h4>
                 <ol style="color: #495057;">
                     <li><strong>Start Call:</strong> Click "📞 Start Call" to initialize Maya</li>
-                    <li><strong>Record:</strong>
+                    <li><strong>Record:</strong> Speak clearly for at least 2 seconds</li>
                     <li><strong>Process:</strong> Click "🎯 Process Message" to get Maya's response</li>
                     <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
                     <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges with context)</li>
@@ -628,20 +590,26 @@ def create_interface():
                 </ol>
             </div>
             <div>
-                <h4 style="color: #28a745;">🌟
+                <h4 style="color: #28a745;">🌟 Features:</h4>
                 <ul style="color: #495057;">
                     <li>🎤 <strong>Speech Recognition:</strong> Powered by OpenAI Whisper</li>
                     <li>🧠 <strong>Smart Responses:</strong> Using Qwen2.5-1.5B LLM</li>
-                    <li>🎭 <strong>Emotion Detection:</strong> Advanced emotion recognition
-                    <li>🔊 <strong>Natural TTS:</strong> High-quality
-                    <li>💭 <strong>Context Memory:</strong> Remembers conversation flow
-                    <li>❤️ <strong>Emotional Intelligence:</strong> Responds
+                    <li>🎭 <strong>Emotion Detection:</strong> Advanced emotion recognition</li>
+                    <li>🔊 <strong>Natural TTS:</strong> High-quality Dia TTS synthesis</li>
+                    <li>💭 <strong>Context Memory:</strong> Remembers conversation flow</li>
+                    <li>❤️ <strong>Emotional Intelligence:</strong> Responds to your emotions</li>
                 </ul>
             </div>
         </div>
 
         <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px; border-left: 4px solid #bee5eb;">
-            <p style="margin: 0; color: #0c5460;"><strong>💡 Pro
+            <p style="margin: 0; color: #0c5460;"><strong>💡 Pro Tips:</strong></p>
+            <ul style="color: #0c5460; margin: 10px 0;">
+                <li>Speak clearly and close to your microphone</li>
+                <li>Record for at least 2-3 seconds</li>
+                <li>Speak in a quiet environment for best results</li>
+                <li>Maya can detect emotions and respond accordingly!</li>
+            </ul>
         </div>
     </div>
     """)