Update app.py
app.py (changed)
@@ -75,7 +75,7 @@ def load_models():
 
     print("🚀 Loading Maya AI models...")
 
-    # Load ASR model (Whisper)
+    # Load ASR model (Whisper)
     print("🎤 Loading Whisper for ASR...")
     try:
         asr_pipe = pipeline(
@@ -83,7 +83,6 @@ def load_models():
             model="openai/whisper-base",
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
             device=0 if torch.cuda.is_available() else -1
-            # Removed return_timestamps and other problematic parameters
         )
         print("✅ Whisper ASR loaded successfully!")
     except Exception as e:
@@ -125,7 +124,6 @@ def load_models():
         print(f"⚠️ Dia TTS failed to load: {e}")
         tts_model = None
 
-    # Continue without TTS (text-only mode)
     print("⚠️ No TTS available, running in text-only mode")
     tts_type = "none"
     return True
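The fallback path above follows a simple pattern: attempt to load an optional model, and drop to a text-only mode flag when it fails. A minimal sketch of that pattern; the init_tts helper is illustrative and not part of the app:

    # Sketch of the graceful-degradation pattern above: try an optional TTS loader,
    # and fall back to a text-only mode flag on any failure.
    tts_model = None
    tts_type = "none"

    def init_tts(loader):
        """loader is any zero-argument callable that returns a TTS model."""
        global tts_model, tts_type
        try:
            tts_model = loader()
            tts_type = "dia"
        except Exception as exc:
            print(f"TTS unavailable, running text-only: {exc}")
            tts_model = None
            tts_type = "none"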
@@ -134,49 +132,39 @@ def detect_emotion_from_text(text):
     """Enhanced emotion detection from text"""
     text_lower = text.lower()
 
-    # Enhanced emotion keywords with weights
     emotions = {
         'happy': ['happy', 'great', 'awesome', 'wonderful', 'excited', 'laugh', 'amazing',
-                  'fantastic', 'excellent', 'brilliant', 'perfect', 'love', 'joy', 'cheerful',
-                  'delighted', 'thrilled', 'ecstatic'],
+                  'fantastic', 'excellent', 'brilliant', 'perfect', 'love', 'joy', 'cheerful'],
         'sad': ['sad', 'upset', 'disappointed', 'cry', 'terrible', 'awful', 'depressed',
-                'miserable', 'heartbroken', 'devastated', 'gloomy', 'melancholy',
-                'blue', 'sorrowful'],
+                'miserable', 'heartbroken', 'devastated', 'gloomy', 'melancholy'],
         'angry': ['angry', 'mad', 'furious', 'annoyed', 'frustrated', 'hate', 'rage',
-                  'irritated', 'outraged', 'livid', 'enraged'
+                  'irritated', 'outraged', 'livid', 'enraged'],
         'surprised': ['wow', 'incredible', 'surprised', 'unbelievable', 'shocking',
-                      'astonishing', 'remarkable', 'extraordinary', 'mind-blowing',
-
-        'fearful': ['scared', 'afraid', 'terrified', 'worried', 'anxious', 'nervous',
-                    'frightened', 'panic', 'concerned', 'fearful'],
-        'disgusted': ['disgusting', 'gross', 'revolting', 'sick', 'nauseating', 'repulsive',
-                      'awful', 'horrible']
+                      'astonishing', 'remarkable', 'extraordinary', 'mind-blowing'],
+        'neutral': []
     }
 
-    # Count emotion indicators
     emotion_scores = {}
     for emotion, keywords in emotions.items():
        score = sum(1 for keyword in keywords if keyword in text_lower)
        if score > 0:
            emotion_scores[emotion] = score
 
-    # Return the emotion with highest score, or neutral if none found
     if emotion_scores:
         return max(emotion_scores, key=emotion_scores.get)
     return 'neutral'
 
 def speech_to_text_with_emotion(audio_input):
-    """
+    """Enhanced STT with proper audio processing"""
     try:
         if audio_input is None:
             return "", "neutral"
 
         print("🎤 Processing audio input...")
 
-        # Process audio input with enhanced handling
         if isinstance(audio_input, tuple):
             sample_rate, audio_data = audio_input
-            print(f"Audio input: sample_rate={sample_rate}, shape={audio_data.shape}
+            print(f"Audio input: sample_rate={sample_rate}, shape={audio_data.shape}")
 
             # Handle different audio formats
             if audio_data.dtype == np.int16:
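The emotion detector being trimmed here is a plain keyword-scoring function. A self-contained sketch of the same idea, using shortened illustrative keyword lists rather than the app's exact ones:

    # Minimal sketch of the keyword-scoring approach used above (illustrative lists).
    EMOTION_KEYWORDS = {
        "happy": ["happy", "great", "awesome", "love", "joy"],
        "sad": ["sad", "upset", "terrible", "depressed"],
        "angry": ["angry", "mad", "furious", "annoyed"],
        "surprised": ["wow", "incredible", "unbelievable"],
    }

    def score_emotion(text: str) -> str:
        """Count keyword hits per emotion and return the best match, or 'neutral'."""
        text_lower = text.lower()
        scores = {
            emotion: sum(1 for kw in keywords if kw in text_lower)
            for emotion, keywords in EMOTION_KEYWORDS.items()
        }
        best = max(scores, key=scores.get)
        return best if scores[best] > 0 else "neutral"

    print(score_emotion("Wow, that is incredible news!"))  # -> "surprised"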
@@ -193,29 +181,25 @@ def speech_to_text_with_emotion(audio_input):
             audio_data = audio_input
             sample_rate = 16000
 
-        # Validate audio
-        if len(audio_data) < 1600:
+        # Validate audio
+        if len(audio_data) < 1600:
             return "Audio too short, please speak for at least 1 second", "neutral"
 
-        # Check for silence (audio with very low amplitude)
         max_amplitude = np.max(np.abs(audio_data))
-        if max_amplitude < 0.01:
+        if max_amplitude < 0.01:
             return "Audio too quiet, please speak louder", "neutral"
 
         # Normalize audio
         if max_amplitude > 0:
             audio_data = audio_data / max_amplitude * 0.95
 
-        # Resample to 16kHz if needed
+        # Resample to 16kHz if needed
         if sample_rate != 16000:
             print(f"Resampling from {sample_rate}Hz to 16000Hz...")
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
 
-        print(f"Final audio: length={len(audio_data)}, max_amplitude={np.max(np.abs(audio_data)):.3f}")
-
-        # FIXED: Call ASR pipeline without sampling_rate parameter
         print("🔄 Running Whisper ASR...")
-        result = asr_pipe(audio_data)
+        result = asr_pipe(audio_data)
 
         transcription = result['text'].strip()
         print(f"Transcription: '{transcription}'")
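The change above hands a bare NumPy array to the Whisper pipeline, which assumes the audio is already float32 and resampled to 16 kHz. A minimal sketch of that preprocessing path, using the openai/whisper-base checkpoint named in the diff; the transcribe helper is illustrative:

    # Sketch of the preprocessing assumed above: int16 -> float32, normalize,
    # resample to 16 kHz, then pass the raw array straight to the ASR pipeline.
    import numpy as np
    import librosa
    from transformers import pipeline

    asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")

    def transcribe(sample_rate: int, audio: np.ndarray) -> str:
        if audio.dtype == np.int16:
            audio = audio.astype(np.float32) / 32768.0   # int16 PCM -> [-1, 1] floats
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak * 0.95                  # leave headroom, avoid clipping
        if sample_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
        # A bare float32 array at 16 kHz can be passed directly to the pipeline.
        return asr(audio)["text"].strip()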
@@ -223,7 +207,6 @@ def speech_to_text_with_emotion(audio_input):
         if not transcription or len(transcription) < 2:
             return "No clear speech detected, please try speaking more clearly", "neutral"
 
-        # Detect emotion from transcription
         emotion = detect_emotion_from_text(transcription)
         print(f"Detected emotion: {emotion}")
 
@@ -234,39 +217,30 @@ def speech_to_text_with_emotion(audio_input):
         return "Sorry, I couldn't understand that. Please try again.", "neutral"
 
 def generate_contextual_response(user_input, emotion, conversation_manager):
-    """Enhanced response generation
+    """Enhanced response generation"""
     try:
         context = conversation_manager.get_context()
 
-        # Enhanced emotional response styles
         emotional_prompts = {
-            "happy": "Respond with genuine enthusiasm and joy. Use positive language
-            "sad": "Respond with
-            "angry": "Respond calmly and try to help. Be patient
-            "surprised": "Share in their surprise and show curiosity. Be engaging
-            "
-            "disgusted": "Respond with understanding while being helpful. Acknowledge their feelings and try to redirect positively.",
-            "neutral": "Respond naturally and conversationally. Be helpful, friendly, and engaging."
+            "happy": "Respond with genuine enthusiasm and joy. Use positive language and show excitement.",
+            "sad": "Respond with empathy and comfort. Be gentle and understanding.",
+            "angry": "Respond calmly and try to help. Be patient and de-escalate.",
+            "surprised": "Share in their surprise and show curiosity. Be engaging.",
+            "neutral": "Respond naturally and conversationally. Be helpful and friendly."
         }
 
-        system_prompt = f"""You are Maya, a
+        system_prompt = f"""You are Maya, a friendly AI assistant with emotional intelligence.
 
 {emotional_prompts.get(emotion, emotional_prompts['neutral'])}
 
-Previous
-{
-
-Current user emotion detected: {emotion}
+Previous context: {context}
+User emotion: {emotion}
 
 Guidelines:
-- Keep responses concise
-- Match the user's emotional tone appropriately
+- Keep responses concise (1-2 sentences)
 - Be natural and conversational
-- Show
-- Provide helpful
-- Use natural speech patterns
-- If they seem distressed, offer support
-- If they're happy, celebrate with them
+- Show empathy and understanding
+- Provide helpful responses
 """
 
         messages = [
@@ -274,11 +248,8 @@ Guidelines:
             {"role": "user", "content": user_input}
         ]
 
-        # Generate response with Qwen
         text = qwen_tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
+            messages, tokenize=False, add_generation_prompt=True
         )
 
         model_inputs = qwen_tokenizer([text], return_tensors="pt")
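The collapsed call above is part of the usual chat-template flow for an instruct model. A compact sketch of that flow, assuming the Qwen2.5-1.5B-Instruct checkpoint mentioned in the page's feature list; the generation settings are illustrative, not the app's:

    # Sketch of the chat-template flow condensed above (model name per the feature
    # list on this page; generation settings are illustrative).
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "Qwen/Qwen2.5-1.5B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")

    messages = [
        {"role": "system", "content": "You are Maya, a friendly AI assistant."},
        {"role": "user", "content": "How are you today?"},
    ]
    # Render the chat into the model's prompt format, then tokenize.
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=64)

    # Strip the prompt tokens so only the newly generated reply is decoded.
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    print(tokenizer.decode(new_tokens, skip_special_tokens=True).strip())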
@@ -301,9 +272,8 @@ Guidelines:
         ]
 
         response = qwen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        # Clean up response
         response = response.strip()
+
         if response.startswith("Maya:"):
             response = response[5:].strip()
 
@@ -311,10 +281,10 @@ Guidelines:
 
     except Exception as e:
         print(f"Error in response generation: {e}")
-        return "I'm sorry, I'm having trouble processing that right now.
+        return "I'm sorry, I'm having trouble processing that right now."
 
 def text_to_speech_emotional(text, emotion="neutral"):
-    """
+    """FIXED TTS with proper audio format for Gradio"""
     try:
         if tts_model is None:
             print(f"🔊 Maya says ({emotion}): {text}")
@@ -325,21 +295,18 @@ def text_to_speech_emotional(text, emotion="neutral"):
             torch.cuda.empty_cache()
 
         if tts_type == "dia":
-            # Dia TTS with enhanced emotional markers
             emotional_markers = {
                 "happy": "(excited) ",
                 "sad": "(sad) ",
-                "angry": "(calm) ",
+                "angry": "(calm) ",
                 "surprised": "(surprised) ",
-                "fearful": "(reassuring) ",
-                "disgusted": "(understanding) ",
                 "neutral": ""
             }
 
-            # Enhanced text
+            # Enhanced text for Dia
             enhanced_text = f"[S1] {emotional_markers.get(emotion, '')}{text}"
 
-            # Add
+            # Add pauses for natural speech
             if len(text) > 50:
                 enhanced_text = enhanced_text.replace(". ", ". (pause) ")
                 enhanced_text = enhanced_text.replace("! ", "! (pause) ")
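The markers kept here are plain text prefixes spliced into the prompt before synthesis. A small sketch of that text-preparation step, using only the marker strings and speaker tag visible in the diff:

    # Sketch of the text preparation above: speaker tag + emotion marker, with
    # "(pause)" hints inserted into longer text for more natural pacing.
    EMOTION_MARKERS = {
        "happy": "(excited) ",
        "sad": "(sad) ",
        "angry": "(calm) ",
        "surprised": "(surprised) ",
        "neutral": "",
    }

    def prepare_tts_text(text: str, emotion: str = "neutral") -> str:
        marker = EMOTION_MARKERS.get(emotion, "")
        enhanced = f"[S1] {marker}{text}"
        if len(text) > 50:
            enhanced = enhanced.replace(". ", ". (pause) ").replace("! ", "! (pause) ")
        return enhanced

    print(prepare_tts_text("Hello there! I have great news for you today, my friend.", "happy"))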
@@ -354,25 +321,34 @@ def text_to_speech_emotional(text, emotion="neutral"):
                 verbose=False
             )
 
-            #
+            # FIXED: Proper audio processing for Gradio
             if isinstance(audio_output, torch.Tensor):
                 audio_output = audio_output.cpu().numpy()
 
-            #
+            # Ensure audio is in the right format
+            if len(audio_output.shape) > 1:
+                audio_output = audio_output.squeeze()
+
+            # Normalize audio properly
             if len(audio_output) > 0:
                 max_val = np.max(np.abs(audio_output))
-                if max_val >
+                if max_val > 0:
                     audio_output = audio_output / max_val * 0.95
 
+            # CRITICAL FIX: Ensure audio is float32 and in correct range
+            audio_output = audio_output.astype(np.float32)
+
+            print(f"✅ Generated audio: shape={audio_output.shape}, dtype={audio_output.dtype}, range=[{audio_output.min():.3f}, {audio_output.max():.3f}]")
+
+            # Return in format Gradio expects: (sample_rate, audio_array)
             return (44100, audio_output)
 
         else:
-            # Text-only mode
             print(f"🔊 Maya says ({emotion}): {text}")
             return None
 
     except Exception as e:
-        print(f"Error in TTS: {e}")
+        print(f"❌ Error in TTS: {e}")
         print(f"🔊 Maya says ({emotion}): {text}")
         return None
 
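The fixes above target the return format Gradio expects from a numpy-typed audio component: a (sample_rate, one-dimensional float32 array) tuple. A minimal sketch with a generated tone standing in for TTS output; the 44100 Hz rate matches the diff:

    # Sketch of the return format the change above targets: gr.Audio(type="numpy")
    # accepts a (sample_rate, 1-D float32 array) tuple. The sine tone is a stand-in.
    import numpy as np
    import gradio as gr

    def make_tone():
        sr = 44100
        t = np.linspace(0, 1.0, sr, endpoint=False)
        audio = 0.5 * np.sin(2 * np.pi * 440 * t)      # 1 second of A440
        audio = audio.astype(np.float32)               # float32, roughly in [-1, 1]
        return (sr, audio)

    demo = gr.Interface(fn=make_tone, inputs=None, outputs=gr.Audio(type="numpy"))
    # demo.launch()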
@@ -382,53 +358,53 @@ conv_manager = ConversationManager()
 def start_call():
     """Initialize call and return greeting"""
     conv_manager.clear()
-    greeting_text = "Hello! I'm Maya, your AI assistant.
+    greeting_text = "Hello! I'm Maya, your AI assistant. How can I help you today?"
     greeting_audio = text_to_speech_emotional(greeting_text, "happy")
 
     tts_status = f"Using {tts_type.upper()} TTS" if tts_type != "none" else "Text-only mode"
-    return greeting_audio, greeting_text, f"📞 Call started! Maya is ready
+    return greeting_audio, greeting_text, f"📞 Call started! Maya is ready. {tts_status}"
 
 def process_conversation(audio_input):
-    """
+    """Main conversation processing pipeline"""
     if audio_input is None:
         return None, "Please record some audio first.", "", "❌ No audio input received."
 
     try:
         print("🔄 Processing conversation...")
 
-        #
+        # STT + Emotion Detection
         user_text, emotion = speech_to_text_with_emotion(audio_input)
 
-        # Check for
+        # Check for STT errors
         error_phrases = ["audio too short", "audio too quiet", "no clear speech", "sorry", "couldn't understand"]
         if any(phrase in user_text.lower() for phrase in error_phrases):
             return None, user_text, "", f"❌ STT Issue: {user_text}"
 
         if not user_text or user_text.strip() == "":
-            return None, "I didn't catch that
+            return None, "I didn't catch that. Please speak louder and closer to the microphone.", "", "❌ No speech detected."
 
-        #
+        # Generate response
         ai_response = generate_contextual_response(user_text, emotion, conv_manager)
 
-        #
+        # Convert to speech
         response_audio = text_to_speech_emotional(ai_response, emotion)
 
-        #
+        # Update history
         conv_manager.add_exchange(user_text, ai_response, emotion)
 
-        status = f"✅
+        status = f"✅ Success! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5 | TTS: {tts_type.upper()}"
 
         return response_audio, ai_response, user_text, status
 
     except Exception as e:
-        error_msg = f"❌ Error
+        error_msg = f"❌ Error: {str(e)}"
         print(error_msg)
         return None, "I'm sorry, I encountered an error. Please try again.", "", error_msg
 
 def get_conversation_history():
-    """Return
+    """Return conversation history"""
     if not conv_manager.history:
-        return "No conversation history yet. Start a call to begin
+        return "No conversation history yet. Start a call to begin!"
 
     history_text = "📝 **Conversation History:**\n\n"
     for i, exchange in enumerate(conv_manager.history, 1):
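The handlers defined above are wired to the UI through Blocks click events, where a function's return values are mapped in order onto the listed output components. A stripped-down sketch of that wiring with illustrative component names:

    # Sketch of the Blocks wiring pattern used by the interface code: a click
    # handler's return values fill the listed output components, in order.
    import gradio as gr

    def greet():
        return "Hello! I'm Maya.", "Call started."

    with gr.Blocks() as demo:
        start = gr.Button("Start Call")
        reply = gr.Textbox(label="Maya's Response", interactive=False)
        status = gr.Textbox(label="Status", interactive=False)
        start.click(fn=greet, outputs=[reply, status])

    # demo.launch()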
@@ -440,118 +416,84 @@ def get_conversation_history():
     return history_text
 
 def end_call():
-    """End call
-    farewell_text = "Thank you for
+    """End call"""
+    farewell_text = "Thank you for talking with me! Have a wonderful day!"
     farewell_audio = text_to_speech_emotional(farewell_text, "happy")
     conv_manager.clear()
 
-    return farewell_audio, farewell_text, "📞❌ Call ended. Thank you
+    return farewell_audio, farewell_text, "📞❌ Call ended. Thank you!"
 
 def create_interface():
-    """Create
+    """Create Gradio interface with FIXED audio components"""
     with gr.Blocks(
-        title="Maya AI -
-        theme=gr.themes.Soft()
-        css="""
-        .main-header {
-            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-            border-radius: 15px;
-            padding: 25px;
-            text-align: center;
-            margin-bottom: 25px;
-            box-shadow: 0 8px 32px rgba(0,0,0,0.1);
-        }
-        .call-button {
-            background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
-            border: none !important;
-            box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
-        }
-        .process-button {
-            background: linear-gradient(45deg, #45B7D1, #96CEB4) !important;
-            border: none !important;
-            box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
-        }
-        .end-button {
-            background: linear-gradient(45deg, #FFA07A, #FF6347) !important;
-            border: none !important;
-            box-shadow: 0 4px 15px rgba(0,0,0,0.2) !important;
-        }
-        """
+        title="Maya AI - Speech-to-Speech Assistant",
+        theme=gr.themes.Soft()
     ) as demo:
 
         gr.HTML("""
-        <div
-            <h1 style="color: white; margin: 0; font-size: 2.8em;
+        <div style="text-align: center; padding: 25px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 25px;">
+            <h1 style="color: white; margin: 0; font-size: 2.8em;">🎙️ Maya AI</h1>
             <p style="color: white; margin: 15px 0; font-size: 1.3em;">Advanced Speech-to-Speech Conversational AI</p>
-            <p style="color: #E8E8E8; margin: 0;
+            <p style="color: #E8E8E8; margin: 0;">Natural • Emotional • Contextual • Intelligent</p>
         </div>
         """)
 
         with gr.Row():
             with gr.Column(scale=1):
-
-                gr.
-
-                end_btn = gr.Button("📞❌ End Call", elem_classes="end-button", size="lg")
+                gr.HTML("<h3>📞 Call Controls</h3>")
+                start_btn = gr.Button("📞 Start Call", variant="primary", size="lg")
+                end_btn = gr.Button("📞❌ End Call", variant="secondary", size="lg")
 
-
-                gr.HTML("<h3 style='color: #333; margin: 20px 0 15px 0;'>🎤 Voice Input</h3>")
+                gr.HTML("<h3>🎤 Voice Input</h3>")
                 audio_input = gr.Audio(
-                    label="Record Your Message (Speak clearly for
+                    label="Record Your Message (Speak clearly for 2+ seconds)",
                     sources=["microphone"],
-                    type="numpy"
-                    format="wav"
+                    type="numpy"
                 )
 
-                process_btn = gr.Button("🎯 Process Message",
+                process_btn = gr.Button("🎯 Process Message", variant="primary", size="lg")
 
-                # Status Display
                 status_display = gr.Textbox(
                     label="📊 System Status",
                     interactive=False,
                     lines=3,
-                    value="🎉
+                    value="🎉 Ready! Click 'Start Call' to begin."
                 )
 
             with gr.Column(scale=2):
-
-
+                gr.HTML("<h3>🔊 Maya's Response</h3>")
+                # FIXED: Audio component with proper settings
                 response_audio = gr.Audio(
                     label="Maya's Voice Response",
                     type="numpy",
                     interactive=False,
-                    autoplay=True
+                    autoplay=True,  # Enable autoplay
+                    show_download_button=True,
+                    show_share_button=False
                 )
 
-        # Text Displays
         with gr.Row():
             with gr.Column():
                 user_text_display = gr.Textbox(
                     label="🎤 What You Said",
                     interactive=False,
-                    lines=4
-                    placeholder="Your speech will appear here after processing..."
+                    lines=4
                 )
 
            with gr.Column():
                ai_text_display = gr.Textbox(
                    label="🤖 Maya's Response",
                    interactive=False,
-                    lines=4
-                    placeholder="Maya's response will appear here..."
+                    lines=4
                )
 
-        # Conversation History Section
         with gr.Row():
             with gr.Column():
-                gr.HTML("<h3
-                history_btn = gr.Button("📝 Show
-                history_display = gr.Markdown(
-                    value="No conversation history yet. Start a call to begin chatting with Maya!",
-                    label="Conversation Log"
-                )
+                gr.HTML("<h3>📝 Conversation History</h3>")
+                history_btn = gr.Button("📝 Show History", variant="secondary")
+                history_display = gr.Markdown("No conversation history yet.")
 
-        # Event
+        # Event handlers
         start_btn.click(
             fn=start_call,
             outputs=[response_audio, ai_text_display, status_display]
@@ -573,42 +515,26 @@ def create_interface():
             outputs=[history_display]
         )
 
-        #
+        # Instructions
         gr.HTML("""
-        <div style="margin-top: 30px; padding: 25px; background:
-            <h3
-            <
-            <
-
-
-
-
-
-
-                <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges with context)</li>
-                <li><strong>End:</strong> Click "📞❌ End Call" when finished</li>
-            </ol>
-            </div>
-            <div>
-            <h4 style="color: #28a745;">🌟 Features:</h4>
-            <ul style="color: #495057;">
-                <li>🎤 <strong>Speech Recognition:</strong> Powered by OpenAI Whisper</li>
-                <li>🧠 <strong>Smart Responses:</strong> Using Qwen2.5-1.5B LLM</li>
-                <li>🎭 <strong>Emotion Detection:</strong> Advanced emotion recognition</li>
-                <li>🔊 <strong>Natural TTS:</strong> High-quality Dia TTS synthesis</li>
-                <li>💭 <strong>Context Memory:</strong> Remembers conversation flow</li>
-                <li>❤️ <strong>Emotional Intelligence:</strong> Responds to your emotions</li>
-            </ul>
-            </div>
-        </div>
+        <div style="margin-top: 30px; padding: 25px; background: #f8f9fa; border-radius: 15px;">
+            <h3>💡 How to Use Maya AI:</h3>
+            <ol>
+                <li><strong>Start Call:</strong> Click "📞 Start Call" - Maya will greet you</li>
+                <li><strong>Record:</strong> Speak clearly for at least 2 seconds</li>
+                <li><strong>Process:</strong> Click "🎯 Process Message"</li>
+                <li><strong>Listen:</strong> Maya will respond with natural speech</li>
+                <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges)</li>
+                <li><strong>End:</strong> Click "📞❌ End Call" when done</li>
+            </ol>
 
-            <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px;
-            <p
-            <ul
+            <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px;">
+                <p><strong>💡 Pro Tips:</strong></p>
+                <ul>
                     <li>Speak clearly and close to your microphone</li>
                     <li>Record for at least 2-3 seconds</li>
-                    <li>
-                    <li>Maya
+                    <li>Use a quiet environment for best results</li>
+                    <li>Maya detects emotions and responds accordingly!</li>
                 </ul>
             </div>
         </div>
@@ -619,7 +545,6 @@ def create_interface():
 if __name__ == "__main__":
     print("🚀 Initializing Maya AI System...")
 
-    # Check system info
     check_system_info()
 
     if load_models():
@@ -632,8 +557,7 @@ if __name__ == "__main__":
             server_name="0.0.0.0",
             server_port=7860,
             share=True,
-            show_error=True
-            debug=False
+            show_error=True
         )
     else:
-        print("❌ Failed to load models.
+        print("❌ Failed to load models.")
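For reference, the launch options kept by the final hunk, shown on a throwaway Blocks app; all four keyword arguments are standard Gradio launch parameters:

    # The launch options kept by the change above, applied to a placeholder app.
    import gradio as gr

    with gr.Blocks() as demo:
        gr.Markdown("Maya AI placeholder")

    demo.launch(
        server_name="0.0.0.0",   # listen on all interfaces
        server_port=7860,
        share=True,              # request a public share link
        show_error=True,         # surface exceptions in the UI
    )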