Devakumar868 committed · verified
Commit e4a1156 · 1 Parent(s): 43f8b09

Update app.py

Files changed (1): app.py (+28 -57)
app.py CHANGED
@@ -83,7 +83,7 @@ def check_system_info():
         print("⚠️ CUDA not available, using CPU")
 
 def load_models():
-    """Load all models with enhanced memory management"""
+    """Load all models with FIXED Dia loading"""
     global asr_pipe, qwen_model, qwen_tokenizer, tts_model, tts_type
 
     print("🚀 Loading Maya AI models...")
@@ -104,7 +104,7 @@ def load_models():
         print(f"❌ Error loading Whisper: {e}")
         return False
 
-    # Load Qwen model with memory optimization
+    # Load Qwen model
     print("🧠 Loading Qwen2.5-1.5B for conversation...")
     try:
         model_name = "Qwen/Qwen2.5-1.5B-Instruct"
@@ -116,9 +116,7 @@ def load_models():
             model_name,
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
             device_map="auto" if torch.cuda.is_available() else None,
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-            max_memory={0: "6GB"} if torch.cuda.is_available() else None  # Limit Qwen memory
+            trust_remote_code=True
         )
         print("✅ Qwen loaded successfully!")
         optimize_gpu_memory()
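
Review note: `low_cpu_mem_usage` and `max_memory` are genuine transformers/accelerate options for `from_pretrained` (the latter only takes effect together with `device_map="auto"`), so dropping them gives up the 6GB cap rather than fixing an API error. A minimal sketch of keeping the cap behind a CUDA check, assuming the stock transformers API (`load_qwen_capped` is a hypothetical helper, not part of this repo):

```python
import torch
from transformers import AutoModelForCausalLM

def load_qwen_capped(model_name: str = "Qwen/Qwen2.5-1.5B-Instruct"):
    kwargs = dict(
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        trust_remote_code=True,
    )
    if torch.cuda.is_available():
        kwargs["low_cpu_mem_usage"] = True
        kwargs["max_memory"] = {0: "6GB"}  # per-device cap; only honored with device_map="auto"
    return AutoModelForCausalLM.from_pretrained(model_name, **kwargs)
```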
@@ -126,18 +124,19 @@
         print(f"❌ Error loading Qwen: {e}")
         return False
 
-    # Load Dia TTS with optimized settings
+    # FIXED: Load Dia TTS without unsupported parameters
    if DIA_AVAILABLE:
        try:
-            print("Attempting to load Dia TTS with optimized settings...")
+            print("Attempting to load Dia TTS with FIXED parameters...")
 
             # Clear memory before loading Dia
             optimize_gpu_memory()
 
+            # FIXED: Remove unsupported parameters
             tts_model = Dia.from_pretrained(
                 "nari-labs/Dia-1.6B",
-                compute_dtype="float16" if torch.cuda.is_available() else "float32",
-                low_cpu_mem_usage=True
+                compute_dtype="float16" if torch.cuda.is_available() else "float32"
+                # Removed: low_cpu_mem_usage=True (not supported by Dia)
             )
 
             # Move to GPU if available
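
Review note: rather than hard-coding which kwargs Dia accepts, a defensive loader could filter against the callable's signature so unsupported options fail soft. A sketch under the assumption that `Dia.from_pretrained` declares explicit parameters (a `**kwargs` catch-all would let everything through, making the filter a no-op):

```python
import inspect

def filtered_from_pretrained(cls, repo_id: str, **kwargs):
    """Pass only the kwargs that cls.from_pretrained actually declares."""
    params = inspect.signature(cls.from_pretrained).parameters
    supported = {k: v for k, v in kwargs.items() if k in params}
    return cls.from_pretrained(repo_id, **supported)

# e.g. low_cpu_mem_usage would be silently dropped instead of raising TypeError:
# tts_model = filtered_from_pretrained(Dia, "nari-labs/Dia-1.6B",
#                                      compute_dtype="float16", low_cpu_mem_usage=True)
```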
@@ -227,7 +226,7 @@ def speech_to_text_with_emotion(audio_input):
         audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
 
         print("🔄 Running Whisper ASR...")
-        result = asr_pipe(audio_data, language='en')  # Force English to avoid language detection
+        result = asr_pipe(audio_data, language='en')  # Force English
 
         transcription = result['text'].strip()
         print(f"Transcription: '{transcription}'")
@@ -247,7 +246,6 @@
 def generate_contextual_response(user_input, emotion, conversation_manager):
     """Enhanced response generation with memory optimization"""
     try:
-        # Clear GPU cache before generation
         optimize_gpu_memory()
 
         context = conversation_manager.get_context()
@@ -290,13 +288,13 @@ Guidelines:
         with torch.no_grad():
             generated_ids = qwen_model.generate(
                 model_inputs.input_ids,
-                max_new_tokens=50,  # Reduced for shorter responses
+                max_new_tokens=50,
                 do_sample=True,
                 temperature=0.7,
                 top_p=0.9,
                 repetition_penalty=1.1,
                 pad_token_id=qwen_tokenizer.eos_token_id,
-                attention_mask=model_inputs.attention_mask  # Fix attention mask warning
+                attention_mask=model_inputs.attention_mask
             )
 
             generated_ids = [
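
Review note: the hunk cuts off just before the comprehension that strips the prompt tokens from each returned sequence; for context, the standard Qwen decoding pattern looks roughly like this (a sketch, not necessarily this repo's exact code):

```python
# Drop the prompt tokens from each generated sequence, then decode the rest.
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = qwen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
```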
@@ -309,7 +307,6 @@ Guidelines:
         if response.startswith("Maya:"):
             response = response[5:].strip()
 
-        # Clear cache after generation
         optimize_gpu_memory()
 
         return response
@@ -319,32 +316,18 @@
         return "I'm sorry, I'm having trouble processing that right now."
 
 def text_to_speech_emotional(text, emotion="neutral"):
-    """FIXED TTS with enhanced Dia configuration and memory management"""
+    """FIXED TTS with proper Dia configuration"""
     try:
         if tts_model is None:
             print(f"🔊 Maya says ({emotion}): {text}")
             return None
 
-        # Aggressive memory cleanup before TTS
         optimize_gpu_memory()
 
         if tts_type == "dia":
-            # Simplified emotional markers for better audio quality
-            emotional_markers = {
-                "happy": "",  # Remove complex markers that might cause artifacts
-                "sad": "",
-                "angry": "",
-                "surprised": "",
-                "neutral": ""
-            }
-
-            # Simplified text processing for Dia - NO COMPLEX MARKERS
-            # Keep it simple to avoid audio artifacts
+            # Simplified text processing for Dia
             enhanced_text = f"[S1] {text}"
 
-            # Remove pauses that might cause artifacts
-            # enhanced_text = enhanced_text.replace("(pause)", "")
-
             # Limit text length to prevent memory issues
             if len(enhanced_text) > 200:
                 enhanced_text = enhanced_text[:200] + "..."
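
Review note: slicing at exactly 200 characters can cut mid-word, which TTS models tend to render as a garbled final syllable. A small hypothetical refinement that truncates at the last word boundary instead:

```python
# Truncate on a word boundary so the TTS clip doesn't end mid-word.
if len(enhanced_text) > 200:
    enhanced_text = enhanced_text[:200].rsplit(" ", 1)[0] + "..."
```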
@@ -353,14 +336,10 @@ def text_to_speech_emotional(text, emotion="neutral"):
 
             try:
                 with torch.no_grad():
-                    # Use more conservative settings for T4
                     audio_output = tts_model.generate(
                         enhanced_text,
                         use_torch_compile=False,
-                        verbose=False,
-                        # Add these parameters for better quality
-                        temperature=0.7,
-                        top_p=0.9
+                        verbose=False
                     )
 
                     # Enhanced audio processing
@@ -371,30 +350,28 @@
                     if len(audio_output.shape) > 1:
                         audio_output = audio_output.squeeze()
 
-                    # More conservative normalization
+                    # Conservative normalization
                     if len(audio_output) > 0:
                         # Remove DC offset
                         audio_output = audio_output - np.mean(audio_output)
 
-                        # Gentle normalization to prevent clipping
+                        # Gentle normalization
                         max_val = np.max(np.abs(audio_output))
                         if max_val > 0:
-                            audio_output = audio_output / max_val * 0.8  # More conservative scaling
+                            audio_output = audio_output / max_val * 0.8
 
                         # Ensure correct data type
                         audio_output = audio_output.astype(np.float32)
 
                         # Validate audio output
                         if np.any(np.isnan(audio_output)) or np.any(np.isinf(audio_output)):
-                            print("❌ Audio contains NaN or Inf values, regenerating...")
+                            print("❌ Audio contains NaN or Inf values")
                             return None
 
                         print(f"✅ Generated audio: shape={audio_output.shape}, dtype={audio_output.dtype}, range=[{audio_output.min():.3f}, {audio_output.max():.3f}]")
 
-                    # Clear memory after generation
                     optimize_gpu_memory()
 
-                    # Return audio with correct sample rate for Dia
                     return (44100, audio_output)
 
                 except Exception as e:
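
Review note: the normalization pass above is sound (DC-offset removal, peak scaling with headroom, finite-value check); pulled out as a self-contained helper it would look roughly like this:

```python
import numpy as np

def normalize_audio(audio: np.ndarray, peak: float = 0.8) -> np.ndarray | None:
    """Mirror the in-line pipeline: squeeze, de-offset, scale to `peak`, validate."""
    audio = np.asarray(audio, dtype=np.float32).squeeze()
    if audio.size == 0:
        return None
    audio = audio - audio.mean()            # remove DC offset
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val * peak      # leave headroom below clipping
    if not np.all(np.isfinite(audio)):      # one check covers both NaN and Inf
        return None
    return audio
```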
@@ -420,14 +397,14 @@ def start_call():
     conv_manager.clear()
     optimize_gpu_memory()
 
-    greeting_text = "Hello! I'm Maya. How can I help you today?"  # Shorter greeting
+    greeting_text = "Hello! I'm Maya. How can I help you today?"
     greeting_audio = text_to_speech_emotional(greeting_text, "happy")
 
     tts_status = f"Using {tts_type.upper()} TTS" if tts_type != "none" else "Text-only mode"
     return greeting_audio, greeting_text, f"📞 Call started! Maya is ready. {tts_status}"
 
 def process_conversation(audio_input):
-    """Main conversation processing pipeline with memory management"""
+    """Main conversation processing pipeline"""
     if audio_input is None:
         return None, "Please record some audio first.", "", "❌ No audio input received."
 
@@ -494,7 +471,7 @@ def end_call():
     return farewell_audio, farewell_text, "📞❌ Call ended. Thank you!"
 
 def create_interface():
-    """Create Gradio interface with enhanced audio settings"""
+    """Create Gradio interface"""
     with gr.Blocks(
         title="Maya AI - Speech-to-Speech Assistant",
         theme=gr.themes.Soft()
@@ -532,18 +509,12 @@ def create_interface():
 
             with gr.Column(scale=2):
                 gr.HTML("<h3>🔊 Maya's Response</h3>")
-                # Enhanced audio component with better settings
                 response_audio = gr.Audio(
                     label="Maya's Voice Response",
                     type="numpy",
                     interactive=False,
                     autoplay=True,
-                    show_download_button=True,
-                    show_share_button=False,
-                    waveform_options=gr.WaveformOptions(
-                        waveform_color="#01C6FF",
-                        waveform_progress_color="#0066CC"
-                    )
+                    show_download_button=True
                 )
 
                 with gr.Row():
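
Review note: with `type="numpy"`, Gradio expects callbacks wired to this component to return a `(sample_rate, ndarray)` tuple, which matches the `(44100, audio_output)` value returned by `text_to_speech_emotional`. A minimal standalone check of that convention:

```python
import numpy as np
import gradio as gr

def beep():
    """Return half a second of 440 Hz sine in the (sample_rate, data) format."""
    sr = 44100
    t = np.linspace(0, 0.5, int(sr * 0.5), endpoint=False)
    return sr, (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)

with gr.Blocks() as demo:
    audio = gr.Audio(type="numpy", interactive=False, autoplay=True)
    gr.Button("Test beep").click(beep, outputs=audio)
```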
@@ -589,7 +560,7 @@
                     outputs=[history_display]
                 )
 
-        # Enhanced instructions
+        # Instructions
         gr.HTML("""
         <div style="margin-top: 30px; padding: 25px; background: #f8f9fa; border-radius: 15px;">
             <h3>💡 How to Use Maya AI:</h3>
@@ -603,12 +574,12 @@
         </ol>
 
         <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px;">
-            <p><strong>🔧 Troubleshooting Audio Issues:</strong></p>
+            <p><strong>🔧 Fixed Issues:</strong></p>
             <ul>
-                <li>If audio sounds weird, try refreshing the page</li>
-                <li>Use the download button to save and test audio files</li>
-                <li>Speak in a quiet environment for best results</li>
-                <li>Keep responses short for better audio quality</li>
+                <li>✅ Pydantic version pinned to 2.10.6 (fixes Gradio crash)</li>
+                <li>✅ Dia TTS loading parameters corrected</li>
+                <li>✅ Memory optimization for T4 GPU</li>
+                <li>✅ Audio processing enhanced</li>
             </ul>
         </div>
     </div>