Devakumar868 committed on
Commit ef9cdda · verified · 1 Parent(s): aa73355

Update app.py: add Dia TTS with a Coqui fallback, scored emotion detection, and a richer Gradio UI
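
This commit replaces the single Coqui TTS path with a priority chain (Dia TTS first, Coqui as a fallback, text-only mode last), widens emotion detection from four keyword branches to six scored categories, and reworks the Gradio layout. As a rough sketch of the loader's new control flow (the availability flags mirror the import guards in the diff; `pick_tts`, `load_dia`, and `load_coqui` are hypothetical stand-ins for the real `Dia.from_pretrained(...)` and `TTS(...)` calls):

```python
# Minimal sketch of the TTS selection order introduced below (not app.py itself).
def pick_tts(dia_available, coqui_available, load_dia, load_coqui):
    if dia_available:
        try:
            return load_dia(), "dia"      # preferred engine
        except Exception as e:
            print(f"Dia TTS failed to load: {e}")
    if coqui_available:
        try:
            return load_coqui(), "coqui"  # fallback engine
        except Exception as e:
            print(f"Coqui TTS failed to load: {e}")
    return None, "none"                   # text-only mode

# Example: Dia unavailable, Coqui loads, so the "coqui" path wins.
model, tts_type = pick_tts(False, True, lambda: None, lambda: object())
assert tts_type == "coqui"
```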

Files changed (1)
  1. app.py +314 -150
app.py CHANGED
@@ -9,14 +9,25 @@ import json
  import time
  from datetime import datetime
  import os
+ import sys

- # Import TTS with fallback
+ # Import with enhanced error handling
+ try:
+     from dia.model import Dia
+     DIA_AVAILABLE = True
+     print("✅ Dia TTS library imported successfully")
+ except ImportError as e:
+     print(f"⚠️ Dia TTS not available: {e}")
+     DIA_AVAILABLE = False
+
+ # Fallback TTS import
  try:
      from TTS.api import TTS
-     TTS_AVAILABLE = True
+     COQUI_TTS_AVAILABLE = True
+     print("✅ Coqui TTS library available as fallback")
  except ImportError:
-     print("⚠️ TTS not available, using text-only mode")
-     TTS_AVAILABLE = False
+     COQUI_TTS_AVAILABLE = False
+     print("⚠️ Coqui TTS not available")

  warnings.filterwarnings("ignore")
@@ -25,7 +36,7 @@ asr_pipe = None
  qwen_model = None
  qwen_tokenizer = None
  tts_model = None
- conversation_history = []
+ tts_type = None  # Track which TTS model is loaded

  class ConversationManager:
      def __init__(self, max_exchanges=5):
@@ -54,20 +65,34 @@ class ConversationManager:
          self.history = []
          self.current_emotion = "neutral"

+ def check_system_info():
+     """Check system capabilities"""
+     print("🔍 System Information:")
+     print(f"Python: {sys.version}")
+     print(f"PyTorch: {torch.__version__}")
+
+     if torch.cuda.is_available():
+         print(f"✅ CUDA: {torch.cuda.get_device_name()}")
+         print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
+         print(f"🔥 CUDA Version: {torch.version.cuda}")
+     else:
+         print("⚠️ CUDA not available, using CPU")
+
  def load_models():
-     """Load all models with proper error handling"""
-     global asr_pipe, qwen_model, qwen_tokenizer, tts_model
+     """Load all models with enhanced error handling"""
+     global asr_pipe, qwen_model, qwen_tokenizer, tts_model, tts_type

-     print("🚀 Loading models...")
+     print("🚀 Loading Maya AI models...")

-     # Load ASR model
+     # Load ASR model (Whisper)
      print("🎤 Loading Whisper for ASR...")
      try:
          asr_pipe = pipeline(
              "automatic-speech-recognition",
              model="openai/whisper-base",
              torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-             device=0 if torch.cuda.is_available() else -1
+             device=0 if torch.cuda.is_available() else -1,
+             return_timestamps=False
          )
          print("✅ Whisper ASR loaded successfully!")
      except Exception as e:
@@ -86,79 +111,132 @@ def load_models():
              model_name,
              torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
              device_map="auto" if torch.cuda.is_available() else None,
-             trust_remote_code=True
+             trust_remote_code=True,
+             low_cpu_mem_usage=True
          )
          print("✅ Qwen loaded successfully!")
      except Exception as e:
          print(f"❌ Error loading Qwen: {e}")
          return False

-     # Load TTS model
+     # Load TTS model with priority: Dia > Coqui > Text-only
      print("🎙️ Loading TTS model...")
-     if TTS_AVAILABLE:
-         try:
-             # Use Coqui TTS with a good female voice
+
+     # Try Dia TTS first (preferred)
+     if DIA_AVAILABLE:
+         try:
+             print("Attempting to load Dia TTS...")
+             tts_model = Dia.from_pretrained(
+                 "nari-labs/Dia-1.6B",
+                 compute_dtype="float16" if torch.cuda.is_available() else "float32"
+             )
+             tts_type = "dia"
+             print("✅ Dia TTS loaded successfully!")
+             return True
+         except Exception as e:
+             print(f"⚠️ Dia TTS failed to load: {e}")
+             tts_model = None
+
+     # Fallback to Coqui TTS
+     if COQUI_TTS_AVAILABLE:
+         try:
+             print("Attempting to load Coqui TTS as fallback...")
              tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
              if torch.cuda.is_available():
                  tts_model = tts_model.to("cuda")
-             print("✅ TTS loaded successfully!")
+             tts_type = "coqui"
+             print("✅ Coqui TTS loaded successfully!")
+             return True
          except Exception as e:
-             print(f"⚠️ TTS failed to load: {e}")
+             print(f"⚠️ Coqui TTS failed to load: {e}")
              tts_model = None
-     else:
-         print("⚠️ TTS not available, using text-only mode")
-         tts_model = None

+     # Continue without TTS (text-only mode)
+     print("⚠️ No TTS available, running in text-only mode")
+     tts_type = "none"
      return True

  def detect_emotion_from_text(text):
-     """Simple emotion detection from text"""
+     """Enhanced emotion detection from text"""
      text_lower = text.lower()

-     # Emotion keywords
-     if any(word in text_lower for word in ['happy', 'great', 'awesome', 'wonderful', 'excited', 'laugh', 'amazing', 'fantastic']):
-         return 'happy'
-     elif any(word in text_lower for word in ['sad', 'upset', 'disappointed', 'cry', 'terrible', 'awful', 'depressed']):
-         return 'sad'
-     elif any(word in text_lower for word in ['angry', 'mad', 'furious', 'annoyed', 'frustrated', 'hate']):
-         return 'angry'
-     elif any(word in text_lower for word in ['wow', 'incredible', 'surprised', 'unbelievable', 'shocking']):
-         return 'surprised'
-     else:
-         return 'neutral'
+     # Enhanced emotion keywords with weights
+     emotions = {
+         'happy': ['happy', 'great', 'awesome', 'wonderful', 'excited', 'laugh', 'amazing',
+                   'fantastic', 'excellent', 'brilliant', 'perfect', 'love', 'joy', 'cheerful',
+                   'delighted', 'thrilled', 'ecstatic'],
+         'sad': ['sad', 'upset', 'disappointed', 'cry', 'terrible', 'awful', 'depressed',
+                 'miserable', 'heartbroken', 'devastated', 'gloomy', 'melancholy', 'down',
+                 'blue', 'sorrowful'],
+         'angry': ['angry', 'mad', 'furious', 'annoyed', 'frustrated', 'hate', 'rage',
+                   'irritated', 'outraged', 'livid', 'enraged', 'pissed', 'irate'],
+         'surprised': ['wow', 'incredible', 'surprised', 'unbelievable', 'shocking',
+                       'astonishing', 'remarkable', 'extraordinary', 'mind-blowing',
+                       'amazing', 'stunning'],
+         'fearful': ['scared', 'afraid', 'terrified', 'worried', 'anxious', 'nervous',
+                     'frightened', 'panic', 'concerned', 'fearful'],
+         'disgusted': ['disgusting', 'gross', 'revolting', 'sick', 'nauseating', 'repulsive',
+                       'awful', 'horrible']
+     }
+
+     # Count emotion indicators
+     emotion_scores = {}
+     for emotion, keywords in emotions.items():
+         score = sum(1 for keyword in keywords if keyword in text_lower)
+         if score > 0:
+             emotion_scores[emotion] = score
+
+     # Return the emotion with the highest score, or neutral if none found
+     if emotion_scores:
+         return max(emotion_scores, key=emotion_scores.get)
+     return 'neutral'
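For reference, the scored detector added above behaves like this standalone miniature (keyword lists abbreviated). Two properties worth knowing: matching is substring-based, so "unhappy" still counts toward 'happy', and ties resolve to whichever emotion appears first in the dict:

```python
# Miniature of the scoring logic above; keyword lists are truncated on purpose.
emotions = {
    "happy": ["happy", "great", "love"],
    "sad": ["sad", "terrible", "down"],
}

def detect(text):
    text_lower = text.lower()
    scores = {e: sum(k in text_lower for k in kws) for e, kws in emotions.items()}
    scores = {e: s for e, s in scores.items() if s > 0}
    return max(scores, key=scores.get) if scores else "neutral"

print(detect("That is great, I love it"))     # happy (2 hits vs 0)
print(detect("happy but terrible and down"))  # sad   (1 hit vs 2)
print(detect("nothing matches here"))         # neutral
```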
 
  def speech_to_text_with_emotion(audio_input):
-     """Convert speech to text and detect emotion"""
+     """Enhanced STT with better audio processing"""
      try:
          if audio_input is None:
              return "", "neutral"

-         # Process audio input
+         # Process audio input with better handling
          if isinstance(audio_input, tuple):
              sample_rate, audio_data = audio_input
-             # Convert to float32 and handle stereo
-             if audio_data.dtype != np.float32:
+
+             # Handle different audio formats
+             if audio_data.dtype == np.int16:
+                 audio_data = audio_data.astype(np.float32) / 32768.0
+             elif audio_data.dtype == np.int32:
+                 audio_data = audio_data.astype(np.float32) / 2147483648.0
+             elif audio_data.dtype != np.float32:
                  audio_data = audio_data.astype(np.float32)
+
+             # Handle stereo audio
              if len(audio_data.shape) > 1:
                  audio_data = audio_data.mean(axis=1)
          else:
              audio_data = audio_input
              sample_rate = 16000

+         # Validate audio length
+         if len(audio_data) < 1600:  # Less than 0.1 seconds at 16kHz
+             return "Audio too short, please speak longer", "neutral"
+
          # Normalize audio
          if len(audio_data) > 0:
              max_val = np.max(np.abs(audio_data))
              if max_val > 0:
-                 audio_data = audio_data / max_val
+                 audio_data = audio_data / max_val * 0.95

          # Resample to 16kHz if needed
          if sample_rate != 16000:
              audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

-         # Speech to text
+         # Speech to text with Whisper
          result = asr_pipe(audio_data, sampling_rate=16000)
          transcription = result['text'].strip()

+         if not transcription:
+             return "No speech detected", "neutral"
+
          # Detect emotion from transcription
          emotion = detect_emotion_from_text(transcription)
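A quick worked check of the integer-PCM rescaling added above: dividing int16 samples by 32768 maps the most negative sample exactly to -1.0 and keeps everything inside [-1, 1), the float range the Whisper pipeline expects:

```python
import numpy as np

pcm = np.array([-32768, 0, 16384, 32767], dtype=np.int16)
floats = pcm.astype(np.float32) / 32768.0
print(floats)  # [-1.0  0.0  0.5  0.9999695]
```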
 
@@ -169,41 +247,47 @@ def speech_to_text_with_emotion(audio_input):
          return "Sorry, I couldn't understand that.", "neutral"

  def generate_contextual_response(user_input, emotion, conversation_manager):
-     """Generate contextual response using Qwen"""
+     """Enhanced response generation with better emotional intelligence"""
      try:
          context = conversation_manager.get_context()

-         # Emotional response styles
+         # Enhanced emotional response styles
          emotional_prompts = {
-             "happy": "Respond with enthusiasm and joy. Use positive language and show excitement.",
-             "sad": "Respond with empathy and comfort. Be gentle, understanding, and supportive.",
-             "angry": "Respond calmly and try to help. Be patient and de-escalate the situation.",
-             "surprised": "Share in the surprise and show curiosity. Be engaging and interested.",
-             "neutral": "Respond naturally and conversationally. Be helpful and friendly."
+             "happy": "Respond with genuine enthusiasm and joy. Use positive language, show excitement, and celebrate with them. Be warm and energetic.",
+             "sad": "Respond with deep empathy and comfort. Be gentle, understanding, and supportive. Offer comfort and hope without being dismissive.",
+             "angry": "Respond calmly and try to help. Be patient, understanding, and try to de-escalate. Don't match their anger but acknowledge their feelings.",
+             "surprised": "Share in their surprise and show curiosity. Be engaging, interested, and ask thoughtful follow-up questions.",
+             "fearful": "Respond with reassurance and support. Be calming, protective, and offer practical help or comfort.",
+             "disgusted": "Respond with understanding while being helpful. Acknowledge their feelings and try to redirect positively.",
+             "neutral": "Respond naturally and conversationally. Be helpful, friendly, and engaging."
          }

-         system_prompt = f"""You are Maya, a friendly and emotionally intelligent AI assistant.
-         {emotional_prompts.get(emotion, emotional_prompts['neutral'])}
-
-         Previous conversation context:
-         {context}
-
-         Current user emotion: {emotion}
-
-         Guidelines:
-         - Keep responses concise (1-2 sentences maximum)
-         - Match the user's emotional tone appropriately
-         - Be natural and conversational
-         - Show empathy and understanding
-         - Provide helpful responses
-         """
+         system_prompt = f"""You are Maya, a highly emotionally intelligent AI assistant with a warm, caring personality.
+
+         {emotional_prompts.get(emotion, emotional_prompts['neutral'])}
+
+         Previous conversation context:
+         {context}
+
+         Current user emotion detected: {emotion}
+
+         Guidelines:
+         - Keep responses concise but meaningful (1-2 sentences)
+         - Match the user's emotional tone appropriately
+         - Be natural and conversational
+         - Show genuine empathy and understanding
+         - Provide helpful and relevant responses
+         - Use natural speech patterns
+         - If they seem distressed, offer support
+         - If they're happy, celebrate with them
+         """

          messages = [
              {"role": "system", "content": system_prompt},
              {"role": "user", "content": user_input}
          ]

-         # Generate response
+         # Generate response with Qwen
          text = qwen_tokenizer.apply_chat_template(
              messages,
              tokenize=False,
@@ -217,10 +301,11 @@ def generate_contextual_response(user_input, emotion, conversation_manager):
          with torch.no_grad():
              generated_ids = qwen_model.generate(
                  model_inputs.input_ids,
-                 max_new_tokens=80,
+                 max_new_tokens=100,
                  do_sample=True,
                  temperature=0.7,
                  top_p=0.9,
+                 repetition_penalty=1.1,
                  pad_token_id=qwen_tokenizer.eos_token_id
              )
 
@@ -230,14 +315,19 @@ def generate_contextual_response(user_input, emotion, conversation_manager):
          response = qwen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

-         return response.strip()
+         # Clean up response
+         response = response.strip()
+         if response.startswith("Maya:"):
+             response = response[5:].strip()
+
+         return response

      except Exception as e:
          print(f"Error in response generation: {e}")
          return "I'm sorry, I'm having trouble processing that right now. Could you please try again?"

  def text_to_speech_emotional(text, emotion="neutral"):
-     """Convert text to speech with emotional context"""
+     """Enhanced TTS with support for both Dia and Coqui"""
      try:
          if tts_model is None:
              print(f"🔊 Maya says ({emotion}): {text}")
@@ -247,36 +337,85 @@ def text_to_speech_emotional(text, emotion="neutral"):
          if torch.cuda.is_available():
              torch.cuda.empty_cache()

-         # Add emotional context to text
-         emotional_prefixes = {
-             "happy": "[Speaking with joy] ",
-             "sad": "[Speaking gently] ",
-             "angry": "[Speaking calmly] ",
-             "surprised": "[Speaking with excitement] ",
-             "neutral": ""
-         }
-
-         enhanced_text = f"{emotional_prefixes.get(emotion, '')}{text}"
-
-         print(f"Generating TTS for: {enhanced_text}")
-
-         # Generate audio
-         audio_output = tts_model.tts(text=enhanced_text)
-
-         # Convert to numpy array if needed
-         if isinstance(audio_output, list):
-             audio_output = np.array(audio_output, dtype=np.float32)
-         elif torch.is_tensor(audio_output):
-             audio_output = audio_output.cpu().numpy().astype(np.float32)
-
-         # Normalize audio
-         if len(audio_output) > 0:
-             max_val = np.max(np.abs(audio_output))
-             if max_val > 1.0:
-                 audio_output = audio_output / max_val * 0.95
-
-         return (22050, audio_output)  # Return sample rate and audio data
+         if tts_type == "dia":
+             # Dia TTS with enhanced emotional markers
+             emotional_markers = {
+                 "happy": "(excited) ",
+                 "sad": "(sad) ",
+                 "angry": "(calm) ",  # Stay calm when user is angry
+                 "surprised": "(surprised) ",
+                 "fearful": "(reassuring) ",
+                 "disgusted": "(understanding) ",
+                 "neutral": ""
+             }
+
+             # Enhanced text processing for Dia
+             enhanced_text = f"[S1] {emotional_markers.get(emotion, '')}{text}"
+
+             # Add natural pauses for longer text
+             if len(text) > 50:
+                 enhanced_text = enhanced_text.replace(". ", ". (pause) ")
+                 enhanced_text = enhanced_text.replace("! ", "! (pause) ")
+                 enhanced_text = enhanced_text.replace("? ", "? (pause) ")
+
+             print(f"Generating Dia TTS for: {enhanced_text}")
+
+             with torch.no_grad():
+                 audio_output = tts_model.generate(
+                     enhanced_text,
+                     use_torch_compile=False,
+                     verbose=False
+                 )
+
+             # Process Dia output
+             if isinstance(audio_output, torch.Tensor):
+                 audio_output = audio_output.cpu().numpy()
+
+             # Normalize audio
+             if len(audio_output) > 0:
+                 max_val = np.max(np.abs(audio_output))
+                 if max_val > 1.0:
+                     audio_output = audio_output / max_val * 0.95
+
+             return (44100, audio_output)
+
+         elif tts_type == "coqui":
+             # Coqui TTS processing
+             emotional_prefixes = {
+                 "happy": "[Speaking with joy] ",
+                 "sad": "[Speaking gently] ",
+                 "angry": "[Speaking calmly] ",
+                 "surprised": "[Speaking with excitement] ",
+                 "fearful": "[Speaking reassuringly] ",
+                 "disgusted": "[Speaking understandingly] ",
+                 "neutral": ""
+             }
+
+             enhanced_text = f"{emotional_prefixes.get(emotion, '')}{text}"
+
+             print(f"Generating Coqui TTS for: {enhanced_text}")
+
+             audio_output = tts_model.tts(text=enhanced_text)
+
+             # Convert to numpy array if needed
+             if isinstance(audio_output, list):
+                 audio_output = np.array(audio_output, dtype=np.float32)
+             elif torch.is_tensor(audio_output):
+                 audio_output = audio_output.cpu().numpy().astype(np.float32)
+
+             # Normalize audio
+             if len(audio_output) > 0:
+                 max_val = np.max(np.abs(audio_output))
+                 if max_val > 1.0:
+                     audio_output = audio_output / max_val * 0.95
+
+             return (22050, audio_output)
+
+         else:
+             # Text-only mode
+             print(f"🔊 Maya says ({emotion}): {text}")
+             return None

      except Exception as e:
          print(f"Error in TTS: {e}")
          print(f"🔊 Maya says ({emotion}): {text}")
@@ -288,13 +427,14 @@ conv_manager = ConversationManager()
  def start_call():
      """Initialize call and return greeting"""
      conv_manager.clear()
-     greeting_text = "Hello! I'm Maya, your AI assistant. How can I help you today?"
+     greeting_text = "Hello! I'm Maya, your AI assistant. I'm here to chat and help you with anything you need. How are you feeling today?"
      greeting_audio = text_to_speech_emotional(greeting_text, "happy")

-     return greeting_audio, greeting_text, "Call started! 📞 Ready to chat!"
+     tts_status = f"Using {tts_type.upper()} TTS" if tts_type != "none" else "Text-only mode"
+     return greeting_audio, greeting_text, f"📞 Call started! Maya is ready to chat. {tts_status}"

  def process_conversation(audio_input):
-     """Main conversation processing pipeline"""
+     """Enhanced conversation processing pipeline"""
      if audio_input is None:
          return None, "Please record some audio first.", "", "❌ No audio input received."
@@ -302,8 +442,8 @@ def process_conversation(audio_input):
          # Step 1: Speech to Text + Emotion Detection
          user_text, emotion = speech_to_text_with_emotion(audio_input)

-         if not user_text or user_text.strip() == "":
-             return None, "I didn't catch that. Could you please repeat?", "", "❌ No speech detected."
+         if not user_text or user_text.strip() == "" or "sorry" in user_text.lower():
+             return None, "I didn't catch that clearly. Could you please speak a bit louder or closer to the microphone?", "", "❌ No clear speech detected."

          # Step 2: Generate contextual response
          ai_response = generate_contextual_response(user_text, emotion, conv_manager)
@@ -314,18 +454,19 @@ def process_conversation(audio_input):
          # Step 4: Update conversation history
          conv_manager.add_exchange(user_text, ai_response, emotion)

-         status = f"✅ Processed successfully! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5"
+         status = f"✅ Processed successfully! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5 | TTS: {tts_type.upper()}"

          return response_audio, ai_response, user_text, status

      except Exception as e:
          error_msg = f"❌ Error processing conversation: {str(e)}"
+         print(error_msg)
          return None, "I'm sorry, I encountered an error. Please try again.", "", error_msg

  def get_conversation_history():
      """Return formatted conversation history"""
      if not conv_manager.history:
-         return "No conversation history yet. Start a call to begin chatting!"
+         return "No conversation history yet. Start a call to begin chatting with Maya!"

      history_text = "📋 **Conversation History:**\n\n"
      for i, exchange in enumerate(conv_manager.history, 1):
@@ -338,71 +479,86 @@ def get_conversation_history():

  def end_call():
      """End call and clear conversation"""
-     farewell_text = "Thank you for talking with me! Have a wonderful day!"
+     farewell_text = "Thank you for our wonderful conversation! I really enjoyed talking with you. Take care and have an amazing day!"
      farewell_audio = text_to_speech_emotional(farewell_text, "happy")
      conv_manager.clear()

-     return farewell_audio, farewell_text, "Call ended. 📞❌ Thanks for chatting!"
+     return farewell_audio, farewell_text, "📞❌ Call ended. Thank you for chatting with Maya!"

  def create_interface():
-     """Create the Gradio interface"""
+     """Create enhanced Gradio interface"""
      with gr.Blocks(
-         title="Maya AI - Speech-to-Speech Assistant",
+         title="Maya AI - Advanced Speech-to-Speech Assistant",
          theme=gr.themes.Soft(),
          css="""
          .main-header {
              background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
              border-radius: 15px;
-             padding: 20px;
+             padding: 25px;
              text-align: center;
-             margin-bottom: 20px;
+             margin-bottom: 25px;
+             box-shadow: 0 8px 32px rgba(0,0,0,0.1);
+         }
+         .call-button {
+             background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
+             border: none !important;
+             box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
+         }
+         .process-button {
+             background: linear-gradient(45deg, #45B7D1, #96CEB4) !important;
+             border: none !important;
+             box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
+         }
+         .end-button {
+             background: linear-gradient(45deg, #FFA07A, #FF6347) !important;
+             border: none !important;
+             box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
          }
-         .call-button { background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important; }
-         .process-button { background: linear-gradient(45deg, #45B7D1, #96CEB4) !important; }
-         .end-button { background: linear-gradient(45deg, #FFA07A, #FF6347) !important; }
          """
      ) as demo:

          gr.HTML("""
          <div class="main-header">
-             <h1 style="color: white; margin: 0; font-size: 2.5em;">🎙️ Maya AI</h1>
-             <p style="color: white; margin: 10px 0; font-size: 1.2em;">Advanced Speech-to-Speech Conversational AI</p>
-             <p style="color: #E8E8E8; margin: 0;">Natural • Emotional • Contextual</p>
+             <h1 style="color: white; margin: 0; font-size: 2.8em; font-weight: bold;">🎙️ Maya AI</h1>
+             <p style="color: white; margin: 15px 0; font-size: 1.3em;">Advanced Speech-to-Speech Conversational AI</p>
+             <p style="color: #E8E8E8; margin: 0; font-size: 1.1em;">Natural • Emotional • Contextual • Intelligent</p>
          </div>
          """)

          with gr.Row():
              with gr.Column(scale=1):
                  # Call Controls
-                 gr.HTML("<h3>📞 Call Controls</h3>")
+                 gr.HTML("<h3 style='color: #333; margin-bottom: 15px;'>📞 Call Controls</h3>")
                  start_btn = gr.Button("📞 Start Call", elem_classes="call-button", size="lg")
                  end_btn = gr.Button("📞❌ End Call", elem_classes="end-button", size="lg")

                  # Audio Input
-                 gr.HTML("<h3>🎤 Voice Input</h3>")
+                 gr.HTML("<h3 style='color: #333; margin: 20px 0 15px 0;'>🎤 Voice Input</h3>")
                  audio_input = gr.Audio(
                      label="Record Your Message",
                      sources=["microphone"],
-                     type="numpy"
+                     type="numpy",
+                     format="wav"
                  )

                  process_btn = gr.Button("🎯 Process Message", elem_classes="process-button", variant="primary", size="lg")

                  # Status Display
                  status_display = gr.Textbox(
-                     label="📊 Status",
+                     label="📊 System Status",
                      interactive=False,
-                     lines=2,
-                     value="Ready to start! Click 'Start Call' to begin."
+                     lines=3,
+                     value="🚀 System ready! Click 'Start Call' to begin your conversation with Maya."
                  )

              with gr.Column(scale=2):
                  # AI Response Audio
-                 gr.HTML("<h3>🔊 Maya's Response</h3>")
+                 gr.HTML("<h3 style='color: #333; margin-bottom: 15px;'>🔊 Maya's Response</h3>")
                  response_audio = gr.Audio(
                      label="Maya's Voice Response",
                      type="numpy",
-                     interactive=False
+                     interactive=False,
+                     autoplay=True
                  )

                  # Text Displays
@@ -411,25 +567,25 @@ def create_interface():
                  user_text_display = gr.Textbox(
                      label="👤 What You Said",
                      interactive=False,
-                     lines=3,
-                     placeholder="Your speech will appear here..."
+                     lines=4,
+                     placeholder="Your speech will appear here after processing..."
                  )

              with gr.Column():
                  ai_text_display = gr.Textbox(
                      label="🤖 Maya's Response",
                      interactive=False,
-                     lines=3,
+                     lines=4,
                      placeholder="Maya's response will appear here..."
                  )

          # Conversation History Section
          with gr.Row():
              with gr.Column():
-                 gr.HTML("<h3>📋 Conversation History</h3>")
-                 history_btn = gr.Button("📋 Show History", variant="secondary")
+                 gr.HTML("<h3 style='color: #333; margin: 25px 0 15px 0;'>📋 Conversation History</h3>")
+                 history_btn = gr.Button("📋 Show Conversation History", variant="secondary", size="lg")
                  history_display = gr.Markdown(
-                     value="No conversation history yet. Start a call to begin chatting!",
+                     value="No conversation history yet. Start a call to begin chatting with Maya!",
                      label="Conversation Log"
                  )
@@ -455,27 +611,38 @@ def create_interface():
              outputs=[history_display]
          )

-         # Instructions
+         # Enhanced Instructions
          gr.HTML("""
-         <div style="margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 10px; border-left: 5px solid #007bff;">
-             <h3>💡 How to Use Maya AI:</h3>
-             <ol>
-                 <li><strong>Start Call:</strong> Click "📞 Start Call" to initialize Maya</li>
-                 <li><strong>Record:</strong> Use the microphone to record your message</li>
-                 <li><strong>Process:</strong> Click "🎯 Process Message" to get Maya's response</li>
-                 <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
-                 <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges with context)</li>
-                 <li><strong>End:</strong> Click "📞❌ End Call" when finished</li>
-             </ol>
+         <div style="margin-top: 30px; padding: 25px; background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); border-radius: 15px; border: 1px solid #dee2e6;">
+             <h3 style="color: #495057; margin-bottom: 20px;">💡 How to Use Maya AI:</h3>
+             <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
+                 <div>
+                     <h4 style="color: #007bff;">🚀 Getting Started:</h4>
+                     <ol style="color: #495057;">
+                         <li><strong>Start Call:</strong> Click "📞 Start Call" to initialize Maya</li>
+                         <li><strong>Record:</strong> Use the microphone to record your message</li>
+                         <li><strong>Process:</strong> Click "🎯 Process Message" to get Maya's response</li>
+                         <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
+                         <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges with context)</li>
+                         <li><strong>End:</strong> Click "📞❌ End Call" when finished</li>
+                     </ol>
+                 </div>
+                 <div>
+                     <h4 style="color: #28a745;">🎭 Advanced Features:</h4>
+                     <ul style="color: #495057;">
+                         <li>🎤 <strong>Speech Recognition:</strong> Powered by OpenAI Whisper</li>
+                         <li>🧠 <strong>Smart Responses:</strong> Using Qwen2.5-1.5B LLM</li>
+                         <li>🎭 <strong>Emotion Detection:</strong> Advanced emotion recognition from speech</li>
+                         <li>🔊 <strong>Natural TTS:</strong> High-quality speech synthesis with Dia TTS</li>
+                         <li>💭 <strong>Context Memory:</strong> Remembers conversation flow and context</li>
+                         <li>❤️ <strong>Emotional Intelligence:</strong> Responds appropriately to your emotions</li>
+                     </ul>
+                 </div>
+             </div>

-             <h4>🎭 Features:</h4>
-             <ul>
-                 <li>🎤 <strong>Speech Recognition:</strong> Powered by Whisper</li>
-                 <li>🧠 <strong>Smart Responses:</strong> Using Qwen2.5-1.5B</li>
-                 <li>🎭 <strong>Emotion Detection:</strong> Automatic emotion recognition</li>
-                 <li>🔊 <strong>Natural Speech:</strong> High-quality TTS with emotions</li>
-                 <li>💭 <strong>Context Memory:</strong> Remembers conversation flow</li>
-             </ul>
+             <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px; border-left: 4px solid #bee5eb;">
+                 <p style="margin: 0; color: #0c5460;"><strong>💡 Pro Tip:</strong> Speak clearly and naturally. Maya can detect emotions like happiness, sadness, anger, surprise, fear, and disgust, and will respond accordingly to provide the best conversational experience!</p>
+             </div>
          </div>
          """)
@@ -483,16 +650,13 @@ def create_interface():

  if __name__ == "__main__":
      print("🚀 Initializing Maya AI System...")
-     print("🔧 Checking GPU availability...")

-     if torch.cuda.is_available():
-         print(f"✅ GPU detected: {torch.cuda.get_device_name()}")
-         print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
-     else:
-         print("⚠️ No GPU detected, using CPU")
+     # Check system info
+     check_system_info()

      if load_models():
          print("✅ All models loaded successfully!")
+         print(f"🎙️ TTS Mode: {tts_type.upper()}")
          print("🌟 Launching Maya AI Interface...")

          demo = create_interface()
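With the models loaded, the text half of the pipeline can be exercised without any audio hardware. A hypothetical smoke test, assuming app.py is importable as a module named `app` (the function names are the ones defined in the diff):

```python
# Hypothetical smoke test for the text pipeline; no microphone needed.
import app

app.check_system_info()
if app.load_models():
    reply = app.generate_contextual_response("I got the job today!", "happy", app.conv_manager)
    print(reply)
    audio = app.text_to_speech_emotional(reply, "happy")  # None in text-only mode
```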