Devakumar868 committed on
Commit a5f8a58 · verified · 1 Parent(s): ef9cdda

Update app.py

Files changed (1): app.py +56 -88
app.py CHANGED
@@ -20,15 +20,6 @@ except ImportError as e:
     print(f"⚠️ Dia TTS not available: {e}")
     DIA_AVAILABLE = False
 
-# Fallback TTS import
-try:
-    from TTS.api import TTS
-    COQUI_TTS_AVAILABLE = True
-    print("✅ Coqui TTS library available as fallback")
-except ImportError:
-    COQUI_TTS_AVAILABLE = False
-    print("⚠️ Coqui TTS not available")
-
 warnings.filterwarnings("ignore")
 
 # Global models
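The hunk above deletes the Coqui fallback import, leaving Dia as the only optional TTS backend. For reference, a minimal sketch of the optional-import guard the file keeps for Dia; the `dia.model` import path is assumed from nari-labs' package layout and may differ in this app:

```python
# Sketch of the optional-import pattern (import path is an assumption).
try:
    from dia.model import Dia  # optional dependency: nari-labs Dia TTS
    DIA_AVAILABLE = True
except ImportError as e:
    print(f"⚠️ Dia TTS not available: {e}")
    DIA_AVAILABLE = False
```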
@@ -36,7 +27,7 @@ asr_pipe = None
 qwen_model = None
 qwen_tokenizer = None
 tts_model = None
-tts_type = None  # Track which TTS model is loaded
+tts_type = None
 
 class ConversationManager:
     def __init__(self, max_exchanges=5):
@@ -84,15 +75,15 @@ def load_models():
 
     print("🚀 Loading Maya AI models...")
 
-    # Load ASR model (Whisper)
+    # Load ASR model (Whisper) - FIXED VERSION
     print("🎤 Loading Whisper for ASR...")
     try:
         asr_pipe = pipeline(
             "automatic-speech-recognition",
             model="openai/whisper-base",
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device=0 if torch.cuda.is_available() else -1,
-            return_timestamps=False
+            device=0 if torch.cuda.is_available() else -1
+            # Removed return_timestamps and other problematic parameters
         )
         print("✅ Whisper ASR loaded successfully!")
     except Exception as e:
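This hunk removes `return_timestamps=False` from the pipeline constructor, which the commit's comment flags as problematic. A minimal sketch of the resulting setup, assuming the standard `transformers` API (`device=0` selects the first CUDA GPU, `-1` the CPU):

```python
import numpy as np
import torch
from transformers import pipeline

# Build the ASR pipeline as the patched code does: fp16 only when a GPU exists.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device=0 if torch.cuda.is_available() else -1,
)

# Smoke test with one second of silence; real input is 16 kHz float32 audio.
print(asr(np.zeros(16000, dtype=np.float32))["text"])
```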
@@ -119,10 +110,7 @@ def load_models():
         print(f"❌ Error loading Qwen: {e}")
         return False
 
-    # Load TTS model with priority: Dia > Coqui > Text-only
-    print("🎙️ Loading TTS model...")
-    
-    # Try Dia TTS first (preferred)
+    # Load Dia TTS
     if DIA_AVAILABLE:
         try:
             print("Attempting to load Dia TTS...")
@@ -137,20 +125,6 @@
             print(f"⚠️ Dia TTS failed to load: {e}")
             tts_model = None
 
-    # Fallback to Coqui TTS
-    if COQUI_TTS_AVAILABLE:
-        try:
-            print("Attempting to load Coqui TTS as fallback...")
-            tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
-            if torch.cuda.is_available():
-                tts_model = tts_model.to("cuda")
-            tts_type = "coqui"
-            print("✅ Coqui TTS loaded successfully!")
-            return True
-        except Exception as e:
-            print(f"⚠️ Coqui TTS failed to load: {e}")
-            tts_model = None
-
     # Continue without TTS (text-only mode)
     print("⚠️ No TTS available, running in text-only mode")
     tts_type = "none"
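The deleted block implemented an ordered fallback: try Dia, then Coqui, then fall through to text-only. A sketch of that pattern in generic form; the loader list, function name, and the `nari-labs/Dia-1.6B` checkpoint id (taken from nari-labs' README) are assumptions, not this app's exact code:

```python
def load_tts():
    """Sketch of the ordered-fallback loading this commit removes:
    try each backend in priority order and record which one stuck."""
    loaders = []  # (name, callable) pairs, highest priority first
    if DIA_AVAILABLE:
        loaders.append(("dia", lambda: Dia.from_pretrained("nari-labs/Dia-1.6B")))
    for name, loader in loaders:
        try:
            return name, loader()
        except Exception as e:
            print(f"⚠️ {name} TTS failed to load: {e}")
    return "none", None  # text-only mode
```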
@@ -192,14 +166,17 @@ def detect_emotion_from_text(text):
     return 'neutral'
 
 def speech_to_text_with_emotion(audio_input):
-    """Enhanced STT with better audio processing"""
+    """FIXED STT function with proper audio processing"""
     try:
         if audio_input is None:
             return "", "neutral"
 
-        # Process audio input with better handling
+        print("🎤 Processing audio input...")
+        
+        # Process audio input with enhanced handling
         if isinstance(audio_input, tuple):
             sample_rate, audio_data = audio_input
+            print(f"Audio input: sample_rate={sample_rate}, shape={audio_data.shape}, dtype={audio_data.dtype}")
 
             # Handle different audio formats
             if audio_data.dtype == np.int16:
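With `type="numpy"`, Gradio's microphone component delivers a `(sample_rate, np.ndarray)` tuple, typically int16. A sketch of the dtype handling as a standalone helper; the helper name and the int32 branch are ours, added for completeness:

```python
import numpy as np

def to_float32_mono(audio_input):
    """Convert Gradio's (rate, integer array) tuple to float32 mono in [-1, 1]."""
    sample_rate, audio_data = audio_input
    if audio_data.dtype == np.int16:
        audio_data = audio_data.astype(np.float32) / 32768.0       # int16 full scale
    elif audio_data.dtype == np.int32:                             # our addition
        audio_data = audio_data.astype(np.float32) / 2147483648.0  # int32 full scale
    if audio_data.ndim > 1:                                        # stereo -> mono
        audio_data = audio_data.mean(axis=1)
    return sample_rate, audio_data
```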
@@ -218,33 +195,43 @@ def speech_to_text_with_emotion(audio_input):
 
         # Validate audio length
         if len(audio_data) < 1600:  # Less than 0.1 seconds at 16kHz
-            return "Audio too short, please speak longer", "neutral"
+            return "Audio too short, please speak for at least 1 second", "neutral"
+        
+        # Check for silence (audio with very low amplitude)
+        max_amplitude = np.max(np.abs(audio_data))
+        if max_amplitude < 0.01:  # Very quiet audio
+            return "Audio too quiet, please speak louder", "neutral"
 
         # Normalize audio
-        if len(audio_data) > 0:
-            max_val = np.max(np.abs(audio_data))
-            if max_val > 0:
-                audio_data = audio_data / max_val * 0.95
+        if max_amplitude > 0:
+            audio_data = audio_data / max_amplitude * 0.95
 
-        # Resample to 16kHz if needed
+        # Resample to 16kHz if needed (Whisper expects 16kHz)
         if sample_rate != 16000:
+            print(f"Resampling from {sample_rate}Hz to 16000Hz...")
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
 
-        # Speech to text with Whisper
-        result = asr_pipe(audio_data, sampling_rate=16000)
+        print(f"Final audio: length={len(audio_data)}, max_amplitude={np.max(np.abs(audio_data)):.3f}")
+        
+        # FIXED: Call ASR pipeline without sampling_rate parameter
+        print("🔄 Running Whisper ASR...")
+        result = asr_pipe(audio_data)  # Removed sampling_rate parameter
+        
         transcription = result['text'].strip()
+        print(f"Transcription: '{transcription}'")
 
-        if not transcription:
-            return "No speech detected", "neutral"
+        if not transcription or len(transcription) < 2:
+            return "No clear speech detected, please try speaking more clearly", "neutral"
 
         # Detect emotion from transcription
         emotion = detect_emotion_from_text(transcription)
+        print(f"Detected emotion: {emotion}")
 
         return transcription, emotion
 
     except Exception as e:
-        print(f"Error in STT: {e}")
-        return "Sorry, I couldn't understand that.", "neutral"
+        print(f"❌ Error in STT: {e}")
+        return "Sorry, I couldn't understand that. Please try again.", "neutral"
 
 def generate_contextual_response(user_input, emotion, conversation_manager):
     """Enhanced response generation with better emotional intelligence"""
@@ -327,7 +314,7 @@ Guidelines:
         return "I'm sorry, I'm having trouble processing that right now. Could you please try again?"
 
 def text_to_speech_emotional(text, emotion="neutral"):
-    """Enhanced TTS with support for both Dia and Coqui"""
+    """Enhanced TTS with Dia support"""
     try:
         if tts_model is None:
             print(f"🔊 Maya says ({emotion}): {text}")
@@ -378,38 +365,6 @@ def text_to_speech_emotional(text, emotion="neutral"):
                 audio_output = audio_output / max_val * 0.95
 
             return (44100, audio_output)
-        
-        elif tts_type == "coqui":
-            # Coqui TTS processing
-            emotional_prefixes = {
-                "happy": "[Speaking with joy] ",
-                "sad": "[Speaking gently] ",
-                "angry": "[Speaking calmly] ",
-                "surprised": "[Speaking with excitement] ",
-                "fearful": "[Speaking reassuringly] ",
-                "disgusted": "[Speaking understandingly] ",
-                "neutral": ""
-            }
-            
-            enhanced_text = f"{emotional_prefixes.get(emotion, '')}{text}"
-            
-            print(f"Generating Coqui TTS for: {enhanced_text}")
-            
-            audio_output = tts_model.tts(text=enhanced_text)
-            
-            # Convert to numpy array if needed
-            if isinstance(audio_output, list):
-                audio_output = np.array(audio_output, dtype=np.float32)
-            elif torch.is_tensor(audio_output):
-                audio_output = audio_output.cpu().numpy().astype(np.float32)
-            
-            # Normalize audio
-            if len(audio_output) > 0:
-                max_val = np.max(np.abs(audio_output))
-                if max_val > 1.0:
-                    audio_output = audio_output / max_val * 0.95
-            
-            return (22050, audio_output)
 
         else:
             # Text-only mode
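With the Coqui branch deleted, only the Dia path produces audio, returned as a `(44100, waveform)` tuple for the Gradio player. A sketch of that normalize-and-pair step in isolation; the function name is ours:

```python
import numpy as np
import torch

def to_gradio_audio(waveform, sample_rate=44100):
    """Peak-limit a synthesized waveform and pair it with its sample rate."""
    if torch.is_tensor(waveform):
        waveform = waveform.cpu().numpy()
    waveform = np.asarray(waveform, dtype=np.float32)
    peak = float(np.max(np.abs(waveform))) if waveform.size else 0.0
    if peak > 1.0:                      # rescale only when clipping would occur
        waveform = waveform / peak * 0.95
    return (sample_rate, waveform)      # the (rate, array) tuple gr.Audio expects
```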
@@ -439,11 +394,18 @@ def process_conversation(audio_input):
         return None, "Please record some audio first.", "", "❌ No audio input received."
 
     try:
+        print("🔄 Processing conversation...")
+        
         # Step 1: Speech to Text + Emotion Detection
         user_text, emotion = speech_to_text_with_emotion(audio_input)
 
-        if not user_text or user_text.strip() == "" or "sorry" in user_text.lower():
-            return None, "I didn't catch that clearly. Could you please speak a bit louder or closer to the microphone?", "", "❌ No clear speech detected."
+        # Check for error messages from STT
+        error_phrases = ["audio too short", "audio too quiet", "no clear speech", "sorry", "couldn't understand"]
+        if any(phrase in user_text.lower() for phrase in error_phrases):
+            return None, user_text, "", f"❌ STT Issue: {user_text}"
+        
+        if not user_text or user_text.strip() == "":
+            return None, "I didn't catch that clearly. Could you please speak a bit louder and closer to the microphone?", "", "❌ No speech detected."
 
         # Step 2: Generate contextual response
         ai_response = generate_contextual_response(user_text, emotion, conv_manager)
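Matching on error phrases works, but it is brittle: a user who literally says "sorry" would be misrouted as an STT failure. A hedged alternative, not part of this commit, is a structured STT result so errors cannot collide with real speech:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class STTResult:
    """Sketch of a structured STT return (not the app's code): an explicit
    error field cannot collide with a user who literally says 'sorry'."""
    text: str = ""
    emotion: str = "neutral"
    error: Optional[str] = None  # set instead of returning an error *as* text

def handle(stt: STTResult):
    # Early exit on failure, mirroring process_conversation's error branch.
    if stt.error:
        return None, stt.error, "", f"❌ STT Issue: {stt.error}"
    return stt  # continue with stt.text / stt.emotion
```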
@@ -535,7 +497,7 @@ def create_interface():
                 # Audio Input
                 gr.HTML("<h3 style='color: #333; margin: 20px 0 15px 0;'>🎤 Voice Input</h3>")
                 audio_input = gr.Audio(
-                    label="Record Your Message",
+                    label="Record Your Message (Speak clearly for at least 2 seconds)",
                     sources=["microphone"],
                     type="numpy",
                     format="wav"
@@ -620,7 +582,7 @@ def create_interface():
                 <h4 style="color: #007bff;">🚀 Getting Started:</h4>
                 <ol style="color: #495057;">
                     <li><strong>Start Call:</strong> Click "📞 Start Call" to initialize Maya</li>
-                    <li><strong>Record:</strong> Use the microphone to record your message</li>
+                    <li><strong>Record:</strong> Speak clearly for at least 2 seconds</li>
                     <li><strong>Process:</strong> Click "🎯 Process Message" to get Maya's response</li>
                     <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
                     <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges with context)</li>
@@ -628,20 +590,26 @@ def create_interface():
                 </ol>
             </div>
             <div>
-                <h4 style="color: #28a745;">🎭 Advanced Features:</h4>
+                <h4 style="color: #28a745;">🎭 Features:</h4>
                 <ul style="color: #495057;">
                     <li>🎤 <strong>Speech Recognition:</strong> Powered by OpenAI Whisper</li>
                     <li>🧠 <strong>Smart Responses:</strong> Using Qwen2.5-1.5B LLM</li>
-                    <li>🎭 <strong>Emotion Detection:</strong> Advanced emotion recognition from speech</li>
-                    <li>🔊 <strong>Natural TTS:</strong> High-quality speech synthesis with Dia TTS</li>
-                    <li>💭 <strong>Context Memory:</strong> Remembers conversation flow and context</li>
-                    <li>❤️ <strong>Emotional Intelligence:</strong> Responds appropriately to your emotions</li>
+                    <li>🎭 <strong>Emotion Detection:</strong> Advanced emotion recognition</li>
+                    <li>🔊 <strong>Natural TTS:</strong> High-quality Dia TTS synthesis</li>
+                    <li>💭 <strong>Context Memory:</strong> Remembers conversation flow</li>
+                    <li>❤️ <strong>Emotional Intelligence:</strong> Responds to your emotions</li>
                 </ul>
             </div>
         </div>
 
         <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px; border-left: 4px solid #bee5eb;">
-            <p style="margin: 0; color: #0c5460;"><strong>💡 Pro Tip:</strong> Speak clearly and naturally. Maya can detect emotions like happiness, sadness, anger, surprise, fear, and disgust, and will respond accordingly to provide the best conversational experience!</p>
+            <p style="margin: 0; color: #0c5460;"><strong>💡 Pro Tips:</strong></p>
+            <ul style="color: #0c5460; margin: 10px 0;">
+                <li>Speak clearly and close to your microphone</li>
+                <li>Record for at least 2-3 seconds</li>
+                <li>Speak in a quiet environment for best results</li>
+                <li>Maya can detect emotions and respond accordingly!</li>
+            </ul>
         </div>
     </div>
     """)
 