Update app.py
app.py
CHANGED
@@ -20,15 +20,6 @@ except ImportError as e:
     print(f"⚠️ Dia TTS not available: {e}")
     DIA_AVAILABLE = False
 
-# Fallback TTS import
-try:
-    from TTS.api import TTS
-    COQUI_TTS_AVAILABLE = True
-    print("✅ Coqui TTS library available as fallback")
-except ImportError:
-    COQUI_TTS_AVAILABLE = False
-    print("⚠️ Coqui TTS not available")
-
 warnings.filterwarnings("ignore")
 
 # Global models
@@ -36,7 +27,7 @@ asr_pipe = None
 qwen_model = None
 qwen_tokenizer = None
 tts_model = None
-tts_type = None
+tts_type = None
 
 class ConversationManager:
     def __init__(self, max_exchanges=5):
@@ -84,15 +75,15 @@ def load_models():
 
     print("🚀 Loading Maya AI models...")
 
-    # Load ASR model (Whisper)
+    # Load ASR model (Whisper) - FIXED VERSION
     print("🎤 Loading Whisper for ASR...")
     try:
         asr_pipe = pipeline(
             "automatic-speech-recognition",
             model="openai/whisper-base",
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device=0 if torch.cuda.is_available() else -1,
-            return_timestamps
+            device=0 if torch.cuda.is_available() else -1
+            # Removed return_timestamps and other problematic parameters
         )
         print("✅ Whisper ASR loaded successfully!")
     except Exception as e:
@@ -119,10 +110,7 @@ def load_models():
         print(f"❌ Error loading Qwen: {e}")
         return False
 
-    # Load
-    print("🎙️ Loading TTS model...")
-
-    # Try Dia TTS first (preferred)
+    # Load Dia TTS
     if DIA_AVAILABLE:
         try:
             print("Attempting to load Dia TTS...")
@@ -137,20 +125,6 @@ def load_models():
             print(f"⚠️ Dia TTS failed to load: {e}")
             tts_model = None
 
-    # Fallback to Coqui TTS
-    if COQUI_TTS_AVAILABLE:
-        try:
-            print("Attempting to load Coqui TTS as fallback...")
-            tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
-            if torch.cuda.is_available():
-                tts_model = tts_model.to("cuda")
-            tts_type = "coqui"
-            print("✅ Coqui TTS loaded successfully!")
-            return True
-        except Exception as e:
-            print(f"⚠️ Coqui TTS failed to load: {e}")
-            tts_model = None
-
     # Continue without TTS (text-only mode)
     print("⚠️ No TTS available, running in text-only mode")
     tts_type = "none"
@@ -192,14 +166,17 @@ def detect_emotion_from_text(text):
     return 'neutral'
 
 def speech_to_text_with_emotion(audio_input):
-    """
+    """FIXED STT function with proper audio processing"""
     try:
         if audio_input is None:
             return "", "neutral"
 
-
+        print("🎤 Processing audio input...")
+
+        # Process audio input with enhanced handling
         if isinstance(audio_input, tuple):
             sample_rate, audio_data = audio_input
+            print(f"Audio input: sample_rate={sample_rate}, shape={audio_data.shape}, dtype={audio_data.dtype}")
 
         # Handle different audio formats
         if audio_data.dtype == np.int16:
@@ -218,33 +195,43 @@ def speech_to_text_with_emotion(audio_input):
 
         # Validate audio length
         if len(audio_data) < 1600:  # Less than 0.1 seconds at 16kHz
-            return "Audio too short, please speak
+            return "Audio too short, please speak for at least 1 second", "neutral"
+
+        # Check for silence (audio with very low amplitude)
+        max_amplitude = np.max(np.abs(audio_data))
+        if max_amplitude < 0.01:  # Very quiet audio
+            return "Audio too quiet, please speak louder", "neutral"
 
         # Normalize audio
-        if
-
-        if max_val > 0:
-            audio_data = audio_data / max_val * 0.95
+        if max_amplitude > 0:
+            audio_data = audio_data / max_amplitude * 0.95
 
-        # Resample to 16kHz if needed
+        # Resample to 16kHz if needed (Whisper expects 16kHz)
         if sample_rate != 16000:
+            print(f"Resampling from {sample_rate}Hz to 16000Hz...")
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
 
-
-
+        print(f"Final audio: length={len(audio_data)}, max_amplitude={np.max(np.abs(audio_data)):.3f}")
+
+        # FIXED: Call ASR pipeline without sampling_rate parameter
+        print("🎧 Running Whisper ASR...")
+        result = asr_pipe(audio_data)  # Removed sampling_rate parameter
+
         transcription = result['text'].strip()
+        print(f"Transcription: '{transcription}'")
 
-        if not transcription:
-            return "No speech detected", "neutral"
+        if not transcription or len(transcription) < 2:
+            return "No clear speech detected, please try speaking more clearly", "neutral"
 
         # Detect emotion from transcription
         emotion = detect_emotion_from_text(transcription)
+        print(f"Detected emotion: {emotion}")
 
         return transcription, emotion
 
     except Exception as e:
-        print(f"Error in STT: {e}")
-        return "Sorry, I couldn't understand that.", "neutral"
+        print(f"❌ Error in STT: {e}")
+        return "Sorry, I couldn't understand that. Please try again.", "neutral"
 
 def generate_contextual_response(user_input, emotion, conversation_manager):
     """Enhanced response generation with better emotional intelligence"""
@@ -327,7 +314,7 @@ Guidelines:
         return "I'm sorry, I'm having trouble processing that right now. Could you please try again?"
 
 def text_to_speech_emotional(text, emotion="neutral"):
-    """Enhanced TTS with
+    """Enhanced TTS with Dia support"""
     try:
         if tts_model is None:
             print(f"🔊 Maya says ({emotion}): {text}")
@@ -378,38 +365,6 @@ def text_to_speech_emotional(text, emotion="neutral"):
                 audio_output = audio_output / max_val * 0.95
 
             return (44100, audio_output)
-
-        elif tts_type == "coqui":
-            # Coqui TTS processing
-            emotional_prefixes = {
-                "happy": "[Speaking with joy] ",
-                "sad": "[Speaking gently] ",
-                "angry": "[Speaking calmly] ",
-                "surprised": "[Speaking with excitement] ",
-                "fearful": "[Speaking reassuringly] ",
-                "disgusted": "[Speaking understandingly] ",
-                "neutral": ""
-            }
-
-            enhanced_text = f"{emotional_prefixes.get(emotion, '')}{text}"
-
-            print(f"Generating Coqui TTS for: {enhanced_text}")
-
-            audio_output = tts_model.tts(text=enhanced_text)
-
-            # Convert to numpy array if needed
-            if isinstance(audio_output, list):
-                audio_output = np.array(audio_output, dtype=np.float32)
-            elif torch.is_tensor(audio_output):
-                audio_output = audio_output.cpu().numpy().astype(np.float32)
-
-            # Normalize audio
-            if len(audio_output) > 0:
-                max_val = np.max(np.abs(audio_output))
-                if max_val > 1.0:
-                    audio_output = audio_output / max_val * 0.95
-
-            return (22050, audio_output)
 
         else:
             # Text-only mode
@@ -439,11 +394,18 @@ def process_conversation(audio_input):
         return None, "Please record some audio first.", "", "❌ No audio input received."
 
     try:
+        print("🔄 Processing conversation...")
+
        # Step 1: Speech to Text + Emotion Detection
         user_text, emotion = speech_to_text_with_emotion(audio_input)
 
-
-
+        # Check for error messages from STT
+        error_phrases = ["audio too short", "audio too quiet", "no clear speech", "sorry", "couldn't understand"]
+        if any(phrase in user_text.lower() for phrase in error_phrases):
+            return None, user_text, "", f"❌ STT Issue: {user_text}"
+
+        if not user_text or user_text.strip() == "":
+            return None, "I didn't catch that clearly. Could you please speak a bit louder and closer to the microphone?", "", "❌ No speech detected."
 
         # Step 2: Generate contextual response
         ai_response = generate_contextual_response(user_text, emotion, conv_manager)
@@ -535,7 +497,7 @@ def create_interface():
             # Audio Input
             gr.HTML("<h3 style='color: #333; margin: 20px 0 15px 0;'>🎤 Voice Input</h3>")
             audio_input = gr.Audio(
-                label="Record Your Message",
+                label="Record Your Message (Speak clearly for at least 2 seconds)",
                 sources=["microphone"],
                 type="numpy",
                 format="wav"
@@ -620,7 +582,7 @@ def create_interface():
                 <h4 style="color: #007bff;">📋 Getting Started:</h4>
                 <ol style="color: #495057;">
                     <li><strong>Start Call:</strong> Click "📞 Start Call" to initialize Maya</li>
-                    <li><strong>Record:</strong>
+                    <li><strong>Record:</strong> Speak clearly for at least 2 seconds</li>
                     <li><strong>Process:</strong> Click "🎯 Process Message" to get Maya's response</li>
                     <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
                     <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges with context)</li>
@@ -628,20 +590,26 @@ def create_interface():
                 </ol>
             </div>
             <div>
-                <h4 style="color: #28a745;">🌟
+                <h4 style="color: #28a745;">🌟 Features:</h4>
                 <ul style="color: #495057;">
                     <li>🎤 <strong>Speech Recognition:</strong> Powered by OpenAI Whisper</li>
                     <li>🧠 <strong>Smart Responses:</strong> Using Qwen2.5-1.5B LLM</li>
-                    <li>🎭 <strong>Emotion Detection:</strong> Advanced emotion recognition
-                    <li>🔊 <strong>Natural TTS:</strong> High-quality
-                    <li>💭 <strong>Context Memory:</strong> Remembers conversation flow
-                    <li>❤️ <strong>Emotional Intelligence:</strong> Responds
+                    <li>🎭 <strong>Emotion Detection:</strong> Advanced emotion recognition</li>
+                    <li>🔊 <strong>Natural TTS:</strong> High-quality Dia TTS synthesis</li>
+                    <li>💭 <strong>Context Memory:</strong> Remembers conversation flow</li>
+                    <li>❤️ <strong>Emotional Intelligence:</strong> Responds to your emotions</li>
                 </ul>
             </div>
         </div>
 
         <div style="margin-top: 20px; padding: 15px; background: #d1ecf1; border-radius: 8px; border-left: 4px solid #bee5eb;">
-            <p style="margin: 0; color: #0c5460;"><strong>💡 Pro
+            <p style="margin: 0; color: #0c5460;"><strong>💡 Pro Tips:</strong></p>
+            <ul style="color: #0c5460; margin: 10px 0;">
+                <li>Speak clearly and close to your microphone</li>
+                <li>Record for at least 2-3 seconds</li>
+                <li>Speak in a quiet environment for best results</li>
+                <li>Maya can detect emotions and respond accordingly!</li>
+            </ul>
         </div>
     </div>
     """)