Update app.py
app.py
CHANGED
Removed in this update:
- `from dia.model import Dia`
- transcription and emotion detection through an Ultravox pipeline (`ultravox_pipe({'audio': ..., 'turns': ..., 'sampling_rate': 16000}, ...)` with `max_new_tokens=10` for emotion detection and `max_new_tokens=100` for transcription), the detected emotion validated against ["happy", "sad", "angry", "surprised", "neutral"] with a fallback to "neutral"
- `text_to_speech_emotional(text, emotion="neutral", speaker="S1")` built on `dia_model.generate(enhanced_text, use_torch_compile=False, verbose=False)`, with "[{speaker}]" tags, emotional markers, and "(pause)" inserted after sentence breaks when the text exceeds 50 characters

The hunks below show the new version, with added lines marked "+".

@@ -4,33 +4,27 @@ import numpy as np
 import librosa
 import soundfile as sf
 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import warnings
 import json
 import time
 from datetime import datetime
 import os

+# Import TTS with fallback
 try:
+    from TTS.api import TTS
+    TTS_AVAILABLE = True
 except ImportError:
+    print("⚠️ TTS not available, using text-only mode")
+    TTS_AVAILABLE = False

 warnings.filterwarnings("ignore")

 # Global models
+asr_pipe = None
 qwen_model = None
 qwen_tokenizer = None
+tts_model = None
 conversation_history = []

 class ConversationManager:
@@ -47,13 +41,12 @@ class ConversationManager:
             "emotion": emotion
         })

         if len(self.history) > self.max_exchanges:
             self.history = self.history[-self.max_exchanges:]

     def get_context(self):
         context = ""
+        for exchange in self.history[-3:]:
             context += f"User: {exchange['user']}\nAI: {exchange['ai']}\n"
         return context

@@ -62,32 +55,37 @@ class ConversationManager:
         self.current_emotion = "neutral"

 def load_models():
+    """Load all models with proper error handling"""
+    global asr_pipe, qwen_model, qwen_tokenizer, tts_model
+
+    print("Loading models...")

+    # Load ASR model
+    print("Loading Whisper for ASR...")
     try:
+        asr_pipe = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-base",
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device=0 if torch.cuda.is_available() else -1
         )
+        print("✅ Whisper ASR loaded successfully!")
     except Exception as e:
+        print(f"❌ Error loading Whisper: {e}")
         return False

+    # Load Qwen model
     print("Loading Qwen2.5-1.5B for conversation...")
     try:
+        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
         qwen_tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
             trust_remote_code=True
         )
         qwen_model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto" if torch.cuda.is_available() else None,
             trust_remote_code=True
         )
         print("✅ Qwen loaded successfully!")
@@ -95,52 +93,39 @@ def load_models():
         print(f"❌ Error loading Qwen: {e}")
         return False

+    # Load TTS model
+    print("Loading TTS model...")
+    if TTS_AVAILABLE:
+        try:
+            # Use Coqui TTS with a good female voice
+            tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
+            if torch.cuda.is_available():
+                tts_model = tts_model.to("cuda")
+            print("✅ TTS loaded successfully!")
+        except Exception as e:
+            print(f"⚠️ TTS failed to load: {e}")
+            tts_model = None
+    else:
+        print("⚠️ TTS not available, using text-only mode")
+        tts_model = None

     return True

+def detect_emotion_from_text(text):
+    """Simple emotion detection from text"""
+    text_lower = text.lower()
+
+    # Emotion keywords
+    if any(word in text_lower for word in ['happy', 'great', 'awesome', 'wonderful', 'excited', 'laugh', 'amazing', 'fantastic']):
+        return 'happy'
+    elif any(word in text_lower for word in ['sad', 'upset', 'disappointed', 'cry', 'terrible', 'awful', 'depressed']):
+        return 'sad'
+    elif any(word in text_lower for word in ['angry', 'mad', 'furious', 'annoyed', 'frustrated', 'hate']):
+        return 'angry'
+    elif any(word in text_lower for word in ['wow', 'incredible', 'surprised', 'unbelievable', 'shocking']):
+        return 'surprised'
+    else:
+        return 'neutral'

 def speech_to_text_with_emotion(audio_input):
     """Convert speech to text and detect emotion"""
@@ -148,35 +133,34 @@ def speech_to_text_with_emotion(audio_input):
     if audio_input is None:
         return "", "neutral"

+    # Process audio input
     if isinstance(audio_input, tuple):
         sample_rate, audio_data = audio_input
+        # Convert to float32 and handle stereo
+        if audio_data.dtype != np.float32:
+            audio_data = audio_data.astype(np.float32)
         if len(audio_data.shape) > 1:
             audio_data = audio_data.mean(axis=1)
     else:
         audio_data = audio_input
         sample_rate = 16000

+    # Normalize audio
+    if len(audio_data) > 0:
+        max_val = np.max(np.abs(audio_data))
+        if max_val > 0:
+            audio_data = audio_data / max_val
+
     # Resample to 16kHz if needed
     if sample_rate != 16000:
         audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

+    # Speech to text
+    result = asr_pipe(audio_data, sampling_rate=16000)
+    transcription = result['text'].strip()

+    # Detect emotion from transcription
+    emotion = detect_emotion_from_text(transcription)

     return transcription, emotion
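A note on the new ASR call: `asr_pipe(audio_data, sampling_rate=16000)` passes the sampling rate as a keyword argument, whereas the transformers pipeline documents raw-array input as a dict that carries the waveform together with its rate. A minimal sketch of that documented form, reusing the checkpoint name from this diff (the silent one-second array is only a placeholder), not part of the commit:

import numpy as np
from transformers import pipeline

# Standalone example; openai/whisper-base is the checkpoint loaded in load_models() above.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")

audio_data = np.zeros(16000, dtype=np.float32)  # placeholder: 1 s of silence at 16 kHz
result = asr_pipe({"raw": audio_data, "sampling_rate": 16000})
print(result["text"].strip())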
@@ -189,13 +173,13 @@ def generate_contextual_response(user_input, emotion, conversation_manager):
     try:
         context = conversation_manager.get_context()

+        # Emotional response styles
         emotional_prompts = {
+            "happy": "Respond with enthusiasm and joy. Use positive language and show excitement.",
+            "sad": "Respond with empathy and comfort. Be gentle, understanding, and supportive.",
+            "angry": "Respond calmly and try to help. Be patient and de-escalate the situation.",
+            "surprised": "Share in the surprise and show curiosity. Be engaging and interested.",
+            "neutral": "Respond naturally and conversationally. Be helpful and friendly."
         }

         system_prompt = f"""You are Maya, a friendly and emotionally intelligent AI assistant.
@@ -207,10 +191,11 @@ def generate_contextual_response(user_input, emotion, conversation_manager):
 Current user emotion: {emotion}

 Guidelines:
+- Keep responses concise (1-2 sentences maximum)
+- Match the user's emotional tone appropriately
 - Be natural and conversational
+- Show empathy and understanding
+- Provide helpful responses
 """

         messages = [
@@ -225,14 +210,17 @@ def generate_contextual_response(user_input, emotion, conversation_manager):
             add_generation_prompt=True
         )

+        model_inputs = qwen_tokenizer([text], return_tensors="pt")
+        if torch.cuda.is_available():
+            model_inputs = model_inputs.to(qwen_model.device)

         with torch.no_grad():
             generated_ids = qwen_model.generate(
                 model_inputs.input_ids,
+                max_new_tokens=80,
                 do_sample=True,
                 temperature=0.7,
+                top_p=0.9,
                 pad_token_id=qwen_tokenizer.eos_token_id
             )

@@ -246,46 +234,40 @@ def generate_contextual_response(user_input, emotion, conversation_manager):

     except Exception as e:
         print(f"Error in response generation: {e}")
+        return "I'm sorry, I'm having trouble processing that right now. Could you please try again?"

+def text_to_speech_emotional(text, emotion="neutral"):
+    """Convert text to speech with emotional context"""
     try:
+        if tts_model is None:
+            print(f"Maya says ({emotion}): {text}")
+            return None
+
         # Clear GPU cache
         if torch.cuda.is_available():
             torch.cuda.empty_cache()

+        # Add emotional context to text
+        emotional_prefixes = {
+            "happy": "[Speaking with joy] ",
+            "sad": "[Speaking gently] ",
+            "angry": "[Speaking calmly] ",
+            "surprised": "[Speaking with excitement] ",
             "neutral": ""
         }

+        enhanced_text = f"{emotional_prefixes.get(emotion, '')}{text}"

+        print(f"Generating TTS for: {enhanced_text}")

         # Generate audio
+        audio_output = tts_model.tts(text=enhanced_text)

+        # Convert to numpy array if needed
+        if isinstance(audio_output, list):
+            audio_output = np.array(audio_output, dtype=np.float32)
+        elif torch.is_tensor(audio_output):
+            audio_output = audio_output.cpu().numpy().astype(np.float32)

         # Normalize audio
         if len(audio_output) > 0:
@@ -293,10 +275,11 @@ def text_to_speech_emotional(text, emotion="neutral", speaker="S1"):
             if max_val > 1.0:
                 audio_output = audio_output / max_val * 0.95

+        return (22050, audio_output)  # Return sample rate and audio data

     except Exception as e:
         print(f"Error in TTS: {e}")
+        print(f"Maya says ({emotion}): {text}")
         return None

 # Initialize conversation manager
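The new `text_to_speech_emotional` returns a hard-coded 22050 Hz rate, which matches the LJSpeech checkpoint used above but would silently mismatch if the model name ever changes. A small sketch, not part of the commit, of reading the rate from the loaded model instead; the `synthesizer.output_sample_rate` attribute is an assumption about the Coqui TTS API, so the diff's hard-coded value is kept as a fallback:

import numpy as np
from TTS.api import TTS

tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
wav = np.asarray(tts_model.tts(text="Hello from Maya."), dtype=np.float32)
# Assumed attribute path; fall back to the value hard-coded in the diff if it is missing.
sample_rate = getattr(tts_model.synthesizer, "output_sample_rate", 22050)
audio = (sample_rate, wav)  # the (rate, samples) tuple gr.Audio expects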
@@ -308,19 +291,19 @@ def start_call():
     greeting_text = "Hello! I'm Maya, your AI assistant. How can I help you today?"
     greeting_audio = text_to_speech_emotional(greeting_text, "happy")

+    return greeting_audio, greeting_text, "Call started! Ready to chat!"

 def process_conversation(audio_input):
     """Main conversation processing pipeline"""
     if audio_input is None:
+        return None, "Please record some audio first.", "", "❌ No audio input received."

     try:
         # Step 1: Speech to Text + Emotion Detection
         user_text, emotion = speech_to_text_with_emotion(audio_input)

         if not user_text or user_text.strip() == "":
+            return None, "I didn't catch that. Could you please repeat?", "", "❌ No speech detected."

         # Step 2: Generate contextual response
         ai_response = generate_contextual_response(user_text, emotion, conv_manager)
@@ -331,7 +314,7 @@ def process_conversation(audio_input):
         # Step 4: Update conversation history
         conv_manager.add_exchange(user_text, ai_response, emotion)

+        status = f"✅ Processed successfully! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5"

         return response_audio, ai_response, user_text, status

@@ -342,7 +325,7 @@ def process_conversation(audio_input):
 def get_conversation_history():
     """Return formatted conversation history"""
     if not conv_manager.history:
+        return "No conversation history yet. Start a call to begin chatting!"

     history_text = "**Conversation History:**\n\n"
     for i, exchange in enumerate(conv_manager.history, 1):
@@ -355,26 +338,33 @@ def get_conversation_history():

 def end_call():
     """End call and clear conversation"""
+    farewell_text = "Thank you for talking with me! Have a wonderful day!"
     farewell_audio = text_to_speech_emotional(farewell_text, "happy")
     conv_manager.clear()

+    return farewell_audio, farewell_text, "Call ended. Thanks for chatting!"

 def create_interface():
+    """Create the Gradio interface"""
     with gr.Blocks(
+        title="Maya AI - Speech-to-Speech Assistant",
         theme=gr.themes.Soft(),
         css="""
+        .main-header {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            border-radius: 15px;
+            padding: 20px;
+            text-align: center;
+            margin-bottom: 20px;
+        }
         .call-button { background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important; }
+        .process-button { background: linear-gradient(45deg, #45B7D1, #96CEB4) !important; }
         .end-button { background: linear-gradient(45deg, #FFA07A, #FF6347) !important; }
         """
     ) as demo:

         gr.HTML("""
+        <div class="main-header">
             <h1 style="color: white; margin: 0; font-size: 2.5em;">Maya AI</h1>
             <p style="color: white; margin: 10px 0; font-size: 1.2em;">Advanced Speech-to-Speech Conversational AI</p>
             <p style="color: #E8E8E8; margin: 0;">Natural • Emotional • Contextual</p>
@@ -393,17 +383,17 @@ def create_interface():
                 audio_input = gr.Audio(
                     label="Record Your Message",
                     sources=["microphone"],
+                    type="numpy"
                 )

+                process_btn = gr.Button("Process Message", elem_classes="process-button", variant="primary", size="lg")

+                # Status Display
                 status_display = gr.Textbox(
                     label="Status",
                     interactive=False,
+                    lines=2,
+                    value="Ready to start! Click 'Start Call' to begin."
                 )

             with gr.Column(scale=2):
@@ -421,23 +411,25 @@ def create_interface():
                     user_text_display = gr.Textbox(
                         label="What You Said",
                         interactive=False,
+                        lines=3,
+                        placeholder="Your speech will appear here..."
                     )

                 with gr.Column():
                     ai_text_display = gr.Textbox(
                         label="Maya's Response",
                         interactive=False,
+                        lines=3,
+                        placeholder="Maya's response will appear here..."
                     )

+        # Conversation History Section
         with gr.Row():
             with gr.Column():
                 gr.HTML("<h3>Conversation History</h3>")
                 history_btn = gr.Button("Show History", variant="secondary")
                 history_display = gr.Markdown(
+                    value="No conversation history yet. Start a call to begin chatting!",
                     label="Conversation Log"
                 )

@@ -463,21 +455,27 @@ def create_interface():
             outputs=[history_display]
         )

+        # Instructions
         gr.HTML("""
         <div style="margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 10px; border-left: 5px solid #007bff;">
             <h3>How to Use Maya AI:</h3>
             <ol>
+                <li><strong>Start Call:</strong> Click "Start Call" to initialize Maya</li>
                 <li><strong>Record:</strong> Use the microphone to record your message</li>
                 <li><strong>Process:</strong> Click "Process Message" to get Maya's response</li>
                 <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
+                <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges with context)</li>
                 <li><strong>End:</strong> Click "End Call" when finished</li>
             </ol>

+            <h4>Features:</h4>
+            <ul>
+                <li><strong>Speech Recognition:</strong> Powered by Whisper</li>
+                <li><strong>Smart Responses:</strong> Using Qwen2.5-1.5B</li>
+                <li><strong>Emotion Detection:</strong> Automatic emotion recognition</li>
+                <li><strong>Natural Speech:</strong> High-quality TTS with emotions</li>
+                <li><strong>Context Memory:</strong> Remembers conversation flow</li>
+            </ul>
         </div>
         """)
@@ -485,6 +483,13 @@

 if __name__ == "__main__":
     print("Initializing Maya AI System...")
+    print("Checking GPU availability...")
+
+    if torch.cuda.is_available():
+        print(f"✅ GPU detected: {torch.cuda.get_device_name()}")
+        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
+    else:
+        print("⚠️ No GPU detected, using CPU")

     if load_models():
         print("✅ All models loaded successfully!")
@@ -495,7 +500,8 @@ if __name__ == "__main__":
             server_name="0.0.0.0",
             server_port=7860,
             share=True,
+            show_error=True,
+            debug=False
         )
     else:
+        print("❌ Failed to load models. Please check the logs above for details.")