Devakumar868 committed (verified)
Commit 326e0ae Β· 1 Parent(s): 95ae54e

Update app.py

Files changed (1):
  1. app.py +105 -181
app.py CHANGED
@@ -75,7 +75,7 @@ def load_models():
 
  print("πŸš€ Loading Maya AI models...")
 
- # Load ASR model (Whisper) - FIXED VERSION
+ # Load ASR model (Whisper)
  print("🎀 Loading Whisper for ASR...")
  try:
  asr_pipe = pipeline(
@@ -83,7 +83,6 @@ def load_models():
  model="openai/whisper-base",
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
  device=0 if torch.cuda.is_available() else -1
- # Removed return_timestamps and other problematic parameters
  )
  print("βœ… Whisper ASR loaded successfully!")
  except Exception as e:
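For context on the Whisper change above: the transformers ASR pipeline accepts either a plain NumPy array (assumed to already be at the model's expected sampling rate) or a dict that carries the rate explicitly, so extra keyword arguments on the call are not needed. A minimal sketch, separate from app.py:

import numpy as np
import torch
from transformers import pipeline

# Same model as in the diff; device selection mirrors the loading code above.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device=0 if torch.cuda.is_available() else -1,
)

audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
print(asr(audio)["text"])  # bare array input: rate is taken as the model's expected 16 kHz
print(asr({"raw": audio, "sampling_rate": 16000})["text"])  # dict input with an explicit rate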
@@ -125,7 +124,6 @@ def load_models():
  print(f"⚠️ Dia TTS failed to load: {e}")
  tts_model = None
 
- # Continue without TTS (text-only mode)
  print("⚠️ No TTS available, running in text-only mode")
  tts_type = "none"
  return True
@@ -134,49 +132,39 @@ def detect_emotion_from_text(text):
  """Enhanced emotion detection from text"""
  text_lower = text.lower()
 
- # Enhanced emotion keywords with weights
  emotions = {
  'happy': ['happy', 'great', 'awesome', 'wonderful', 'excited', 'laugh', 'amazing',
- 'fantastic', 'excellent', 'brilliant', 'perfect', 'love', 'joy', 'cheerful',
- 'delighted', 'thrilled', 'ecstatic'],
+ 'fantastic', 'excellent', 'brilliant', 'perfect', 'love', 'joy', 'cheerful'],
  'sad': ['sad', 'upset', 'disappointed', 'cry', 'terrible', 'awful', 'depressed',
- 'miserable', 'heartbroken', 'devastated', 'gloomy', 'melancholy', 'down',
- 'blue', 'sorrowful'],
+ 'miserable', 'heartbroken', 'devastated', 'gloomy', 'melancholy'],
  'angry': ['angry', 'mad', 'furious', 'annoyed', 'frustrated', 'hate', 'rage',
- 'irritated', 'outraged', 'livid', 'enraged', 'pissed', 'irate'],
+ 'irritated', 'outraged', 'livid', 'enraged'],
  'surprised': ['wow', 'incredible', 'surprised', 'unbelievable', 'shocking',
- 'astonishing', 'remarkable', 'extraordinary', 'mind-blowing',
- 'amazing', 'stunning'],
- 'fearful': ['scared', 'afraid', 'terrified', 'worried', 'anxious', 'nervous',
- 'frightened', 'panic', 'concerned', 'fearful'],
- 'disgusted': ['disgusting', 'gross', 'revolting', 'sick', 'nauseating', 'repulsive',
- 'awful', 'horrible']
+ 'astonishing', 'remarkable', 'extraordinary', 'mind-blowing'],
+ 'neutral': []
  }
 
- # Count emotion indicators
  emotion_scores = {}
  for emotion, keywords in emotions.items():
  score = sum(1 for keyword in keywords if keyword in text_lower)
  if score > 0:
  emotion_scores[emotion] = score
 
- # Return the emotion with highest score, or neutral if none found
  if emotion_scores:
  return max(emotion_scores, key=emotion_scores.get)
  return 'neutral'
 
  def speech_to_text_with_emotion(audio_input):
- """FIXED STT function with proper audio processing"""
+ """Enhanced STT with proper audio processing"""
  try:
  if audio_input is None:
  return "", "neutral"
 
  print("🎀 Processing audio input...")
 
- # Process audio input with enhanced handling
  if isinstance(audio_input, tuple):
  sample_rate, audio_data = audio_input
- print(f"Audio input: sample_rate={sample_rate}, shape={audio_data.shape}, dtype={audio_data.dtype}")
+ print(f"Audio input: sample_rate={sample_rate}, shape={audio_data.shape}")
 
  # Handle different audio formats
  if audio_data.dtype == np.int16:
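The emotion detection above reduces to counting keyword hits per emotion and taking the highest score, falling back to neutral. A standalone sketch with a trimmed keyword table (illustrative only, not the full lists from app.py):

def detect_emotion(text: str) -> str:
    # Trimmed keyword table for illustration; app.py uses longer lists per emotion.
    keywords = {
        "happy": ["happy", "great", "awesome", "love"],
        "sad": ["sad", "upset", "terrible", "cry"],
        "angry": ["angry", "mad", "furious", "hate"],
        "surprised": ["wow", "incredible", "unbelievable"],
    }
    text_lower = text.lower()
    scores = {e: sum(k in text_lower for k in kws) for e, kws in keywords.items()}
    scores = {e: s for e, s in scores.items() if s > 0}
    return max(scores, key=scores.get) if scores else "neutral"

print(detect_emotion("Wow, that is awesome, I love it!"))  # "happy": two hits beat one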
@@ -193,29 +181,25 @@ def speech_to_text_with_emotion(audio_input):
  audio_data = audio_input
  sample_rate = 16000
 
- # Validate audio length
- if len(audio_data) < 1600: # Less than 0.1 seconds at 16kHz
+ # Validate audio
+ if len(audio_data) < 1600:
  return "Audio too short, please speak for at least 1 second", "neutral"
 
- # Check for silence (audio with very low amplitude)
  max_amplitude = np.max(np.abs(audio_data))
- if max_amplitude < 0.01: # Very quiet audio
+ if max_amplitude < 0.01:
  return "Audio too quiet, please speak louder", "neutral"
 
  # Normalize audio
  if max_amplitude > 0:
  audio_data = audio_data / max_amplitude * 0.95
 
- # Resample to 16kHz if needed (Whisper expects 16kHz)
+ # Resample to 16kHz if needed
  if sample_rate != 16000:
  print(f"Resampling from {sample_rate}Hz to 16000Hz...")
  audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
 
- print(f"Final audio: length={len(audio_data)}, max_amplitude={np.max(np.abs(audio_data)):.3f}")
-
- # FIXED: Call ASR pipeline without sampling_rate parameter
  print("πŸ”„ Running Whisper ASR...")
- result = asr_pipe(audio_data) # Removed sampling_rate parameter
+ result = asr_pipe(audio_data)
 
  transcription = result['text'].strip()
  print(f"Transcription: '{transcription}'")
@@ -223,7 +207,6 @@ def speech_to_text_with_emotion(audio_input):
  if not transcription or len(transcription) < 2:
  return "No clear speech detected, please try speaking more clearly", "neutral"
 
- # Detect emotion from transcription
  emotion = detect_emotion_from_text(transcription)
  print(f"Detected emotion: {emotion}")
 
@@ -234,39 +217,30 @@ def speech_to_text_with_emotion(audio_input):
  return "Sorry, I couldn't understand that. Please try again.", "neutral"
 
  def generate_contextual_response(user_input, emotion, conversation_manager):
- """Enhanced response generation with better emotional intelligence"""
+ """Enhanced response generation"""
  try:
  context = conversation_manager.get_context()
 
- # Enhanced emotional response styles
  emotional_prompts = {
- "happy": "Respond with genuine enthusiasm and joy. Use positive language, show excitement, and celebrate with them. Be warm and energetic.",
- "sad": "Respond with deep empathy and comfort. Be gentle, understanding, and supportive. Offer comfort and hope without being dismissive.",
- "angry": "Respond calmly and try to help. Be patient, understanding, and try to de-escalate. Don't match their anger but acknowledge their feelings.",
- "surprised": "Share in their surprise and show curiosity. Be engaging, interested, and ask thoughtful follow-up questions.",
- "fearful": "Respond with reassurance and support. Be calming, protective, and offer practical help or comfort.",
- "disgusted": "Respond with understanding while being helpful. Acknowledge their feelings and try to redirect positively.",
- "neutral": "Respond naturally and conversationally. Be helpful, friendly, and engaging."
+ "happy": "Respond with genuine enthusiasm and joy. Use positive language and show excitement.",
+ "sad": "Respond with empathy and comfort. Be gentle and understanding.",
+ "angry": "Respond calmly and try to help. Be patient and de-escalate.",
+ "surprised": "Share in their surprise and show curiosity. Be engaging.",
+ "neutral": "Respond naturally and conversationally. Be helpful and friendly."
  }
 
- system_prompt = f"""You are Maya, a highly emotionally intelligent AI assistant with a warm, caring personality.
+ system_prompt = f"""You are Maya, a friendly AI assistant with emotional intelligence.
 
  {emotional_prompts.get(emotion, emotional_prompts['neutral'])}
 
- Previous conversation context:
- {context}
-
- Current user emotion detected: {emotion}
+ Previous context: {context}
+ User emotion: {emotion}
 
  Guidelines:
- - Keep responses concise but meaningful (1-2 sentences)
- - Match the user's emotional tone appropriately
+ - Keep responses concise (1-2 sentences)
  - Be natural and conversational
- - Show genuine empathy and understanding
- - Provide helpful and relevant responses
- - Use natural speech patterns
- - If they seem distressed, offer support
- - If they're happy, celebrate with them
+ - Show empathy and understanding
+ - Provide helpful responses
  """
 
  messages = [
@@ -274,11 +248,8 @@ Guidelines:
  {"role": "user", "content": user_input}
  ]
 
- # Generate response with Qwen
  text = qwen_tokenizer.apply_chat_template(
- messages,
- tokenize=False,
- add_generation_prompt=True
+ messages, tokenize=False, add_generation_prompt=True
  )
 
  model_inputs = qwen_tokenizer([text], return_tensors="pt")
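The surrounding generation code follows the standard transformers chat-template flow. A condensed, hedged sketch; the exact Qwen checkpoint and generation settings used by app.py are not shown in these hunks, so the model name below is an assumption based on the interface text elsewhere in this commit:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # assumed checkpoint, per the "Qwen2.5-1.5B" mention
qwen_tokenizer = AutoTokenizer.from_pretrained(model_name)
qwen_model = AutoModelForCausalLM.from_pretrained(model_name)

messages = [
    {"role": "system", "content": "You are Maya, a friendly AI assistant."},
    {"role": "user", "content": "How are you?"},
]
text = qwen_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = qwen_tokenizer([text], return_tensors="pt")
output_ids = qwen_model.generate(**inputs, max_new_tokens=64)
# Strip the prompt tokens before decoding, as the code above does.
new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
print(qwen_tokenizer.decode(new_tokens, skip_special_tokens=True))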
@@ -301,9 +272,8 @@ Guidelines:
  ]
 
  response = qwen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
- # Clean up response
  response = response.strip()
+
  if response.startswith("Maya:"):
  response = response[5:].strip()
 
@@ -311,10 +281,10 @@ Guidelines:
 
  except Exception as e:
  print(f"Error in response generation: {e}")
- return "I'm sorry, I'm having trouble processing that right now. Could you please try again?"
+ return "I'm sorry, I'm having trouble processing that right now."
 
  def text_to_speech_emotional(text, emotion="neutral"):
- """Enhanced TTS with Dia support"""
+ """FIXED TTS with proper audio format for Gradio"""
  try:
  if tts_model is None:
  print(f"πŸ”Š Maya says ({emotion}): {text}")
@@ -325,21 +295,18 @@ def text_to_speech_emotional(text, emotion="neutral"):
  torch.cuda.empty_cache()
 
  if tts_type == "dia":
- # Dia TTS with enhanced emotional markers
  emotional_markers = {
  "happy": "(excited) ",
  "sad": "(sad) ",
- "angry": "(calm) ", # Stay calm when user is angry
+ "angry": "(calm) ",
  "surprised": "(surprised) ",
- "fearful": "(reassuring) ",
- "disgusted": "(understanding) ",
  "neutral": ""
  }
 
- # Enhanced text processing for Dia
+ # Enhanced text for Dia
  enhanced_text = f"[S1] {emotional_markers.get(emotion, '')}{text}"
 
- # Add natural pauses for longer text
+ # Add pauses for natural speech
  if len(text) > 50:
  enhanced_text = enhanced_text.replace(". ", ". (pause) ")
  enhanced_text = enhanced_text.replace("! ", "! (pause) ")
@@ -354,25 +321,34 @@ def text_to_speech_emotional(text, emotion="neutral"):
  verbose=False
  )
 
- # Process Dia output
+ # FIXED: Proper audio processing for Gradio
  if isinstance(audio_output, torch.Tensor):
  audio_output = audio_output.cpu().numpy()
 
- # Normalize audio
+ # Ensure audio is in the right format
+ if len(audio_output.shape) > 1:
+ audio_output = audio_output.squeeze()
+
+ # Normalize audio properly
  if len(audio_output) > 0:
  max_val = np.max(np.abs(audio_output))
- if max_val > 1.0:
+ if max_val > 0:
  audio_output = audio_output / max_val * 0.95
 
+ # CRITICAL FIX: Ensure audio is float32 and in correct range
+ audio_output = audio_output.astype(np.float32)
+
+ print(f"βœ… Generated audio: shape={audio_output.shape}, dtype={audio_output.dtype}, range=[{audio_output.min():.3f}, {audio_output.max():.3f}]")
+
+ # Return in format Gradio expects: (sample_rate, audio_array)
  return (44100, audio_output)
 
  else:
- # Text-only mode
  print(f"πŸ”Š Maya says ({emotion}): {text}")
  return None
 
  except Exception as e:
- print(f"Error in TTS: {e}")
+ print(f"❌ Error in TTS: {e}")
  print(f"πŸ”Š Maya says ({emotion}): {text}")
  return None
 
@@ -382,53 +358,53 @@ conv_manager = ConversationManager()
  def start_call():
  """Initialize call and return greeting"""
  conv_manager.clear()
- greeting_text = "Hello! I'm Maya, your AI assistant. I'm here to chat and help you with anything you need. How are you feeling today?"
+ greeting_text = "Hello! I'm Maya, your AI assistant. How can I help you today?"
  greeting_audio = text_to_speech_emotional(greeting_text, "happy")
 
  tts_status = f"Using {tts_type.upper()} TTS" if tts_type != "none" else "Text-only mode"
- return greeting_audio, greeting_text, f"πŸ“ž Call started! Maya is ready to chat. {tts_status}"
+ return greeting_audio, greeting_text, f"πŸ“ž Call started! Maya is ready. {tts_status}"
 
  def process_conversation(audio_input):
- """Enhanced conversation processing pipeline"""
+ """Main conversation processing pipeline"""
  if audio_input is None:
  return None, "Please record some audio first.", "", "❌ No audio input received."
 
  try:
  print("πŸ”„ Processing conversation...")
 
- # Step 1: Speech to Text + Emotion Detection
+ # STT + Emotion Detection
  user_text, emotion = speech_to_text_with_emotion(audio_input)
 
- # Check for error messages from STT
+ # Check for STT errors
  error_phrases = ["audio too short", "audio too quiet", "no clear speech", "sorry", "couldn't understand"]
  if any(phrase in user_text.lower() for phrase in error_phrases):
  return None, user_text, "", f"❌ STT Issue: {user_text}"
 
  if not user_text or user_text.strip() == "":
- return None, "I didn't catch that clearly. Could you please speak a bit louder and closer to the microphone?", "", "❌ No speech detected."
+ return None, "I didn't catch that. Please speak louder and closer to the microphone.", "", "❌ No speech detected."
 
- # Step 2: Generate contextual response
+ # Generate response
  ai_response = generate_contextual_response(user_text, emotion, conv_manager)
 
- # Step 3: Convert to speech
+ # Convert to speech
  response_audio = text_to_speech_emotional(ai_response, emotion)
 
- # Step 4: Update conversation history
+ # Update history
  conv_manager.add_exchange(user_text, ai_response, emotion)
 
- status = f"βœ… Processed successfully! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5 | TTS: {tts_type.upper()}"
+ status = f"βœ… Success! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5 | TTS: {tts_type.upper()}"
 
  return response_audio, ai_response, user_text, status
 
  except Exception as e:
- error_msg = f"❌ Error processing conversation: {str(e)}"
+ error_msg = f"❌ Error: {str(e)}"
  print(error_msg)
  return None, "I'm sorry, I encountered an error. Please try again.", "", error_msg
 
  def get_conversation_history():
- """Return formatted conversation history"""
+ """Return conversation history"""
  if not conv_manager.history:
- return "No conversation history yet. Start a call to begin chatting with Maya!"
+ return "No conversation history yet. Start a call to begin!"
 
  history_text = "πŸ“‹ **Conversation History:**\n\n"
  for i, exchange in enumerate(conv_manager.history, 1):
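ConversationManager itself is defined outside the hunks shown here. From the calls above (clear(), get_context(), add_exchange(user, reply, emotion), history capped at five exchanges), a minimal compatible sketch would be:

class ConversationManager:
    """Minimal sketch inferred from usage in this diff; the real class may differ."""

    def __init__(self, max_exchanges: int = 5):
        self.max_exchanges = max_exchanges
        self.history = []

    def add_exchange(self, user_text: str, ai_response: str, emotion: str) -> None:
        self.history.append({"user": user_text, "maya": ai_response, "emotion": emotion})
        self.history = self.history[-self.max_exchanges:]  # keep only the last N exchanges

    def get_context(self) -> str:
        return "\n".join(f"User: {h['user']}\nMaya: {h['maya']}" for h in self.history)

    def clear(self) -> None:
        self.history = []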
@@ -440,118 +416,84 @@ def get_conversation_history():
  return history_text
 
  def end_call():
- """End call and clear conversation"""
- farewell_text = "Thank you for our wonderful conversation! I really enjoyed talking with you. Take care and have an amazing day!"
+ """End call"""
+ farewell_text = "Thank you for talking with me! Have a wonderful day!"
  farewell_audio = text_to_speech_emotional(farewell_text, "happy")
  conv_manager.clear()
 
- return farewell_audio, farewell_text, "πŸ“žβŒ Call ended. Thank you for chatting with Maya!"
+ return farewell_audio, farewell_text, "πŸ“žβŒ Call ended. Thank you!"
 
  def create_interface():
- """Create enhanced Gradio interface"""
+ """Create Gradio interface with FIXED audio components"""
  with gr.Blocks(
- title="Maya AI - Advanced Speech-to-Speech Assistant",
- theme=gr.themes.Soft(),
- css="""
- .main-header {
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
- border-radius: 15px;
- padding: 25px;
- text-align: center;
- margin-bottom: 25px;
- box-shadow: 0 8px 32px rgba(0,0,0,0.1);
- }
- .call-button {
- background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
- border: none !important;
- box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
- }
- .process-button {
- background: linear-gradient(45deg, #45B7D1, #96CEB4) !important;
- border: none !important;
- box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
- }
- .end-button {
- background: linear-gradient(45deg, #FFA07A, #FF6347) !important;
- border: none !important;
- box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
- }
- """
+ title="Maya AI - Speech-to-Speech Assistant",
+ theme=gr.themes.Soft()
  ) as demo:
 
  gr.HTML("""
- <div class="main-header">
- <h1 style="color: white; margin: 0; font-size: 2.8em; font-weight: bold;">πŸŽ™οΈ Maya AI</h1>
+ <div style="text-align: center; padding: 25px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 25px;">
+ <h1 style="color: white; margin: 0; font-size: 2.8em;">πŸŽ™οΈ Maya AI</h1>
  <p style="color: white; margin: 15px 0; font-size: 1.3em;">Advanced Speech-to-Speech Conversational AI</p>
- <p style="color: #E8E8E8; margin: 0; font-size: 1.1em;">Natural β€’ Emotional β€’ Contextual β€’ Intelligent</p>
+ <p style="color: #E8E8E8; margin: 0;">Natural β€’ Emotional β€’ Contextual β€’ Intelligent</p>
  </div>
  """)
 
  with gr.Row():
  with gr.Column(scale=1):
- # Call Controls
- gr.HTML("<h3 style='color: #333; margin-bottom: 15px;'>πŸ“ž Call Controls</h3>")
- start_btn = gr.Button("πŸ“ž Start Call", elem_classes="call-button", size="lg")
- end_btn = gr.Button("πŸ“žβŒ End Call", elem_classes="end-button", size="lg")
+ gr.HTML("<h3>πŸ“ž Call Controls</h3>")
+ start_btn = gr.Button("πŸ“ž Start Call", variant="primary", size="lg")
+ end_btn = gr.Button("πŸ“žβŒ End Call", variant="secondary", size="lg")
 
- # Audio Input
- gr.HTML("<h3 style='color: #333; margin: 20px 0 15px 0;'>🎀 Voice Input</h3>")
+ gr.HTML("<h3>🎀 Voice Input</h3>")
  audio_input = gr.Audio(
- label="Record Your Message (Speak clearly for at least 2 seconds)",
+ label="Record Your Message (Speak clearly for 2+ seconds)",
  sources=["microphone"],
- type="numpy",
- format="wav"
+ type="numpy"
  )
 
- process_btn = gr.Button("🎯 Process Message", elem_classes="process-button", variant="primary", size="lg")
+ process_btn = gr.Button("🎯 Process Message", variant="primary", size="lg")
 
- # Status Display
  status_display = gr.Textbox(
  label="πŸ“Š System Status",
  interactive=False,
  lines=3,
- value="πŸš€ System ready! Click 'Start Call' to begin your conversation with Maya."
+ value="πŸš€ Ready! Click 'Start Call' to begin."
  )
 
  with gr.Column(scale=2):
- # AI Response Audio
- gr.HTML("<h3 style='color: #333; margin-bottom: 15px;'>πŸ”Š Maya's Response</h3>")
+ gr.HTML("<h3>πŸ”Š Maya's Response</h3>")
+ # FIXED: Audio component with proper settings
  response_audio = gr.Audio(
  label="Maya's Voice Response",
  type="numpy",
  interactive=False,
- autoplay=True
+ autoplay=True, # Enable autoplay
+ show_download_button=True,
+ show_share_button=False
  )
 
- # Text Displays
  with gr.Row():
  with gr.Column():
  user_text_display = gr.Textbox(
  label="πŸ‘€ What You Said",
  interactive=False,
- lines=4,
- placeholder="Your speech will appear here after processing..."
+ lines=4
  )
 
  with gr.Column():
  ai_text_display = gr.Textbox(
  label="πŸ€– Maya's Response",
  interactive=False,
- lines=4,
- placeholder="Maya's response will appear here..."
+ lines=4
  )
 
- # Conversation History Section
  with gr.Row():
  with gr.Column():
- gr.HTML("<h3 style='color: #333; margin: 25px 0 15px 0;'>πŸ“‹ Conversation History</h3>")
- history_btn = gr.Button("πŸ“‹ Show Conversation History", variant="secondary", size="lg")
- history_display = gr.Markdown(
- value="No conversation history yet. Start a call to begin chatting with Maya!",
- label="Conversation Log"
- )
+ gr.HTML("<h3>πŸ“‹ Conversation History</h3>")
+ history_btn = gr.Button("πŸ“‹ Show History", variant="secondary")
+ history_display = gr.Markdown("No conversation history yet.")
 
- # Event Handlers
+ # Event handlers
  start_btn.click(
  fn=start_call,
  outputs=[response_audio, ai_text_display, status_display]
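With type="numpy", the gr.Audio component above expects each callback to return a (sample_rate, float32 NumPy array) tuple, which is exactly what the TTS fix in this commit produces. A self-contained sketch of that contract, with a tone generator standing in for the TTS output:

import numpy as np
import gradio as gr

def make_tone():
    # Stand-in for text_to_speech_emotional(): return (sample_rate, float32 array).
    sr = 44100
    t = np.linspace(0, 1.0, sr, endpoint=False)
    tone = (0.2 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
    return sr, tone

with gr.Blocks() as demo:
    btn = gr.Button("Play test tone")
    out = gr.Audio(type="numpy", interactive=False, autoplay=True)
    btn.click(fn=make_tone, outputs=out)

# demo.launch()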
@@ -573,42 +515,26 @@ def create_interface():
  outputs=[history_display]
  )
 
- # Enhanced Instructions
+ # Instructions
  gr.HTML("""
- <div style="margin-top: 30px; padding: 25px; background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); border-radius: 15px; border: 1px solid #dee2e6;">
- <h3 style="color: #495057; margin-bottom: 20px;">πŸ’‘ How to Use Maya AI:</h3>
- <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
- <div>
- <h4 style="color: #007bff;">πŸš€ Getting Started:</h4>
- <ol style="color: #495057;">
- <li><strong>Start Call:</strong> Click "πŸ“ž Start Call" to initialize Maya</li>
- <li><strong>Record:</strong> Speak clearly for at least 2 seconds</li>
- <li><strong>Process:</strong> Click "🎯 Process Message" to get Maya's response</li>
- <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
- <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges with context)</li>
- <li><strong>End:</strong> Click "πŸ“žβŒ End Call" when finished</li>
- </ol>
- </div>
- <div>
- <h4 style="color: #28a745;">🎭 Features:</h4>
- <ul style="color: #495057;">
- <li>🎀 <strong>Speech Recognition:</strong> Powered by OpenAI Whisper</li>
- <li>🧠 <strong>Smart Responses:</strong> Using Qwen2.5-1.5B LLM</li>
- <li>🎭 <strong>Emotion Detection:</strong> Advanced emotion recognition</li>
- <li>πŸ”Š <strong>Natural TTS:</strong> High-quality Dia TTS synthesis</li>
- <li>πŸ’­ <strong>Context Memory:</strong> Remembers conversation flow</li>
- <li>❀️ <strong>Emotional Intelligence:</strong> Responds to your emotions</li>
- </ul>
- </div>
- </div>
+ <div style="margin-top: 30px; padding: 25px; background: #f8f9fa; border-radius: 15px;">
+ <h3>πŸ’‘ How to Use Maya AI:</h3>
+ <ol>
+ <li><strong>Start Call:</strong> Click "πŸ“ž Start Call" - Maya will greet you</li>
+ <li><strong>Record:</strong> Speak clearly for at least 2 seconds</li>
+ <li><strong>Process:</strong> Click "🎯 Process Message"</li>
+ <li><strong>Listen:</strong> Maya will respond with natural speech</li>
+ <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges)</li>
+ <li><strong>End:</strong> Click "πŸ“žβŒ End Call" when done</li>
+ </ol>
 
- <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px; border-left: 4px solid #bee5eb;">
- <p style="margin: 0; color: #0c5460;"><strong>πŸ’‘ Pro Tips:</strong></p>
- <ul style="color: #0c5460; margin: 10px 0;">
+ <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px;">
+ <p><strong>πŸ’‘ Pro Tips:</strong></p>
+ <ul>
  <li>Speak clearly and close to your microphone</li>
  <li>Record for at least 2-3 seconds</li>
- <li>Speak in a quiet environment for best results</li>
- <li>Maya can detect emotions and respond accordingly!</li>
+ <li>Use a quiet environment for best results</li>
+ <li>Maya detects emotions and responds accordingly!</li>
  </ul>
  </div>
  </div>
@@ -619,7 +545,6 @@ def create_interface():
  if __name__ == "__main__":
  print("πŸš€ Initializing Maya AI System...")
 
- # Check system info
  check_system_info()
 
  if load_models():
@@ -632,8 +557,7 @@ if __name__ == "__main__":
  server_name="0.0.0.0",
  server_port=7860,
  share=True,
- show_error=True,
- debug=False
+ show_error=True
  )
  else:
- print("❌ Failed to load models. Please check the logs above for details.")
+ print("❌ Failed to load models.")
 