Update app.py
app.py
CHANGED
@@ -83,7 +83,7 @@ def check_system_info():
         print("⚠️ CUDA not available, using CPU")

 def load_models():
-    """Load all models with
+    """Load all models with FIXED Dia loading"""
     global asr_pipe, qwen_model, qwen_tokenizer, tts_model, tts_type

     print("🚀 Loading Maya AI models...")
@@ -104,7 +104,7 @@ def load_models():
         print(f"❌ Error loading Whisper: {e}")
         return False

-    # Load Qwen model
+    # Load Qwen model
    print("🧠 Loading Qwen2.5-1.5B for conversation...")
    try:
        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
@@ -116,9 +116,7 @@ def load_models():
            model_name,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-            max_memory={0: "6GB"} if torch.cuda.is_available() else None  # Limit Qwen memory
+            trust_remote_code=True
        )
        print("✅ Qwen loaded successfully!")
        optimize_gpu_memory()
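For reference, the corrected Qwen load reduces to a stock `transformers` call. A minimal self-contained sketch, assuming the fp16 weights fit in VRAM; names mirror the hunk above.

```python
# Minimal sketch of the corrected Qwen load (kwargs as in the diff above)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
qwen_tokenizer = AutoTokenizer.from_pretrained(model_name)
qwen_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None,
    trust_remote_code=True,
)
```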
@@ -126,18 +124,19 @@ def load_models():
        print(f"❌ Error loading Qwen: {e}")
        return False

-    # Load Dia TTS
+    # FIXED: Load Dia TTS without unsupported parameters
    if DIA_AVAILABLE:
        try:
-            print("Attempting to load Dia TTS with
+            print("Attempting to load Dia TTS with FIXED parameters...")

            # Clear memory before loading Dia
            optimize_gpu_memory()

+            # FIXED: Remove unsupported parameters
            tts_model = Dia.from_pretrained(
                "nari-labs/Dia-1.6B",
-                compute_dtype="float16" if torch.cuda.is_available() else "float32",
-                low_cpu_mem_usage=True
+                compute_dtype="float16" if torch.cuda.is_available() else "float32"
+                # Removed: low_cpu_mem_usage=True (not supported by Dia)
            )

            # Move to GPU if available
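Since this hunk exists because `Dia.from_pretrained` rejected `low_cpu_mem_usage`, a defensive variant can retry without optional kwargs when the installed release rejects them. A sketch only; the `from dia.model import Dia` import path is assumed from the nari-labs package, and the fallback is an assumption rather than Dia's documented API.

```python
import torch
from dia.model import Dia  # import path assumed from the nari-labs package

def load_dia_safely(repo: str = "nari-labs/Dia-1.6B"):
    """Retry without optional kwargs if this dia release rejects them."""
    dtype = "float16" if torch.cuda.is_available() else "float32"
    try:
        return Dia.from_pretrained(repo, compute_dtype=dtype)
    except TypeError:
        # Some releases may not accept compute_dtype either (assumption)
        return Dia.from_pretrained(repo)
```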
@@ -227,7 +226,7 @@ def speech_to_text_with_emotion(audio_input):
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

        print("🎤 Running Whisper ASR...")
-        result = asr_pipe(audio_data, language='en')  # Force English
+        result = asr_pipe(audio_data, language='en')  # Force English
        transcription = result['text'].strip()
        print(f"Transcription: '{transcription}'")
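One caveat on the forced-English call: on stock `transformers` ASR pipelines the language hint usually goes through `generate_kwargs` rather than a bare `language=` keyword, depending on the installed version. A hedged sketch of the full preprocessing path; `asr_pipe` is assumed to be the module-level Whisper pipeline.

```python
import librosa
import numpy as np

def transcribe_en(audio_data: np.ndarray, sample_rate: int, asr_pipe) -> str:
    # Whisper checkpoints expect mono float audio at 16 kHz
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    if sample_rate != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
    # Language hint via generate_kwargs (assumption; version-dependent)
    result = asr_pipe(audio_data, generate_kwargs={"language": "en"})
    return result["text"].strip()
```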
@@ -247,7 +246,6 @@ def speech_to_text_with_emotion(audio_input):
 def generate_contextual_response(user_input, emotion, conversation_manager):
     """Enhanced response generation with memory optimization"""
     try:
-        # Clear GPU cache before generation
        optimize_gpu_memory()

        context = conversation_manager.get_context()
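`optimize_gpu_memory()` is called throughout this diff but its body sits outside the changed hunks. A plausible stand-in, labeled hypothetical, is the usual cache-flush pair:

```python
# Hypothetical stand-in for the optimize_gpu_memory() helper used in this
# diff; its real implementation is not part of the changed hunks.
import gc
import torch

def optimize_gpu_memory():
    gc.collect()                      # drop unreferenced Python objects
    if torch.cuda.is_available():
        torch.cuda.empty_cache()      # return cached blocks to the driver
        torch.cuda.ipc_collect()      # reclaim inter-process memory handles
```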
@@ -290,13 +288,13 @@ Guidelines:
        with torch.no_grad():
            generated_ids = qwen_model.generate(
                model_inputs.input_ids,
-                max_new_tokens=50,
+                max_new_tokens=50,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1,
                pad_token_id=qwen_tokenizer.eos_token_id,
-                attention_mask=model_inputs.attention_mask
+                attention_mask=model_inputs.attention_mask
            )

        generated_ids = [
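The hunk cuts off at `generated_ids = [`; for context, the standard Qwen idiom this leads into strips the prompt tokens before decoding. A sketch reusing `qwen_model`/`qwen_tokenizer` from the load above; the chat-template message is illustrative.

```python
import torch

prompt = qwen_tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello"}],  # illustrative message
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = qwen_tokenizer([prompt], return_tensors="pt").to(qwen_model.device)

with torch.no_grad():
    generated_ids = qwen_model.generate(
        model_inputs.input_ids,
        max_new_tokens=50,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=qwen_tokenizer.eos_token_id,
        attention_mask=model_inputs.attention_mask,
    )

# Drop the prompt tokens so only the new reply is decoded
generated_ids = [
    out[len(inp):] for inp, out in zip(model_inputs.input_ids, generated_ids)
]
response = qwen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
```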
@@ -309,7 +307,6 @@ Guidelines:
        if response.startswith("Maya:"):
            response = response[5:].strip()

-        # Clear cache after generation
        optimize_gpu_memory()

        return response
@@ -319,32 +316,18 @@ Guidelines:
        return "I'm sorry, I'm having trouble processing that right now."

 def text_to_speech_emotional(text, emotion="neutral"):
-    """FIXED TTS with
+    """FIXED TTS with proper Dia configuration"""
    try:
        if tts_model is None:
            print(f"🔊 Maya says ({emotion}): {text}")
            return None

-        # Aggressive memory cleanup before TTS
        optimize_gpu_memory()

        if tts_type == "dia":
-            # Simplified
-            emotional_markers = {
-                "happy": "",  # Remove complex markers that might cause artifacts
-                "sad": "",
-                "angry": "",
-                "surprised": "",
-                "neutral": ""
-            }
-
-            # Simplified text processing for Dia - NO COMPLEX MARKERS
-            # Keep it simple to avoid audio artifacts
+            # Simplified text processing for Dia
            enhanced_text = f"[S1] {text}"

-            # Remove pauses that might cause artifacts
-            # enhanced_text = enhanced_text.replace("(pause)", "")
-
            # Limit text length to prevent memory issues
            if len(enhanced_text) > 200:
                enhanced_text = enhanced_text[:200] + "..."
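A small refinement worth considering (not in the diff): the 200-character cut slices mid-word, and the TTS will try to pronounce the fragment. Trimming at a word boundary avoids that; a sketch with a hypothetical helper name:

```python
# Hypothetical helper: trim at a word boundary instead of slicing mid-word
def trim_for_tts(text: str, limit: int = 200) -> str:
    if len(text) <= limit:
        return text
    cut = text[:limit].rsplit(" ", 1)[0]  # back up to the last full word
    return cut + "..."
```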
@@ -353,14 +336,10 @@ def text_to_speech_emotional(text, emotion="neutral"):

        try:
            with torch.no_grad():
-                # Use more conservative settings for T4
                audio_output = tts_model.generate(
                    enhanced_text,
                    use_torch_compile=False,
-                    verbose=False,
-                    # Add these parameters for better quality
-                    temperature=0.7,
-                    top_p=0.9
+                    verbose=False
                )

            # Enhanced audio processing
@@ -371,30 +350,28 @@ def text_to_speech_emotional(text, emotion="neutral"):
            if len(audio_output.shape) > 1:
                audio_output = audio_output.squeeze()

-            #
+            # Conservative normalization
            if len(audio_output) > 0:
                # Remove DC offset
                audio_output = audio_output - np.mean(audio_output)

-                # Gentle normalization
+                # Gentle normalization
                max_val = np.max(np.abs(audio_output))
                if max_val > 0:
-                    audio_output = audio_output / max_val * 0.8
+                    audio_output = audio_output / max_val * 0.8

                # Ensure correct data type
                audio_output = audio_output.astype(np.float32)

                # Validate audio output
                if np.any(np.isnan(audio_output)) or np.any(np.isinf(audio_output)):
-                    print("❌ Audio contains NaN or Inf values
+                    print("❌ Audio contains NaN or Inf values")
                    return None

                print(f"✅ Generated audio: shape={audio_output.shape}, dtype={audio_output.dtype}, range=[{audio_output.min():.3f}, {audio_output.max():.3f}]")

-                # Clear memory after generation
                optimize_gpu_memory()

-                # Return audio with correct sample rate for Dia
                return (44100, audio_output)

        except Exception as e:
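The post-processing steps above (squeeze, DC-offset removal, 0.8 peak normalization, float32 cast, NaN/Inf guard) factor cleanly into one helper. A sketch that just mirrors the diff's logic; the 0.8 peak leaves headroom against clipping.

```python
from typing import Optional
import numpy as np

def sanitize_audio(audio: np.ndarray) -> Optional[np.ndarray]:
    audio = np.asarray(audio).squeeze()
    if audio.size == 0:
        return None
    audio = audio - np.mean(audio)          # remove DC offset
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak * 0.8          # gentle peak normalization
    audio = audio.astype(np.float32)
    if np.any(np.isnan(audio)) or np.any(np.isinf(audio)):
        return None                         # refuse unplayable output
    return audio
```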
@@ -420,14 +397,14 @@ def start_call():
    conv_manager.clear()
    optimize_gpu_memory()

-    greeting_text = "Hello! I'm Maya. How can I help you today?"
+    greeting_text = "Hello! I'm Maya. How can I help you today?"
    greeting_audio = text_to_speech_emotional(greeting_text, "happy")

    tts_status = f"Using {tts_type.upper()} TTS" if tts_type != "none" else "Text-only mode"
    return greeting_audio, greeting_text, f"📞 Call started! Maya is ready. {tts_status}"

 def process_conversation(audio_input):
-    """Main conversation processing pipeline
+    """Main conversation processing pipeline"""
    if audio_input is None:
        return None, "Please record some audio first.", "", "❌ No audio input received."
@@ -494,7 +471,7 @@ def end_call():
    return farewell_audio, farewell_text, "📞 Call ended. Thank you!"

 def create_interface():
-    """Create Gradio interface
+    """Create Gradio interface"""
    with gr.Blocks(
        title="Maya AI - Speech-to-Speech Assistant",
        theme=gr.themes.Soft()
@@ -532,18 +509,12 @@ def create_interface():

        with gr.Column(scale=2):
            gr.HTML("<h3>🔊 Maya's Response</h3>")
-            # Enhanced audio component with better settings
            response_audio = gr.Audio(
                label="Maya's Voice Response",
                type="numpy",
                interactive=False,
                autoplay=True,
-                show_download_button=True,
-                show_share_button=False,
-                waveform_options=gr.WaveformOptions(
-                    waveform_color="#01C6FF",
-                    waveform_progress_color="#0066CC"
-                )
+                show_download_button=True
            )

        with gr.Row():
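The removed kwargs (`show_share_button`, `waveform_options`) only exist in newer Gradio releases, which is presumably why they failed here. If the app must build across Gradio versions, a feature-detecting construction is one option; a sketch, not Gradio API, that only passes kwargs the installed version accepts:

```python
import inspect
import gradio as gr

# Only pass kwargs the installed gr.Audio actually accepts (sketch)
audio_kwargs = dict(
    label="Maya's Voice Response",
    type="numpy",
    interactive=False,
    autoplay=True,
    show_download_button=True,
)
accepted = inspect.signature(gr.Audio.__init__).parameters
response_audio = gr.Audio(**{k: v for k, v in audio_kwargs.items() if k in accepted})
```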
@@ -589,7 +560,7 @@ def create_interface():
            outputs=[history_display]
        )

-        #
+        # Instructions
        gr.HTML("""
        <div style="margin-top: 30px; padding: 25px; background: #f8f9fa; border-radius: 15px;">
            <h3>💡 How to Use Maya AI:</h3>
@@ -603,12 +574,12 @@ def create_interface():
            </ol>

            <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px;">
-                <p><strong>🔧
+                <p><strong>🔧 Fixed Issues:</strong></p>
                <ul>
-                    <li
-                    <li
-                    <li
-                    <li
+                    <li>✅ Pydantic version pinned to 2.10.6 (fixes Gradio crash)</li>
+                    <li>✅ Dia TTS loading parameters corrected</li>
+                    <li>✅ Memory optimization for T4 GPU</li>
+                    <li>✅ Audio processing enhanced</li>
                </ul>
            </div>
        </div>
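The Pydantic fix in the list above lives in requirements.txt (`pydantic==2.10.6`) rather than in app.py. A small runtime guard can surface drift early; a sketch only, with the version string taken from the list above:

```python
from importlib.metadata import version

# Pin lives in requirements.txt as: pydantic==2.10.6
installed = version("pydantic")
if installed != "2.10.6":
    print(f"⚠️ pydantic {installed} installed; this Space was fixed against 2.10.6")
```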