Devakumar868 committed
Commit ab6af92 · verified · 1 Parent(s): 20ec756

Update app.py

Files changed (1):
  1. app.py +160 -154

app.py CHANGED
@@ -4,33 +4,27 @@ import numpy as np
  import librosa
  import soundfile as sf
  from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
- from dia.model import Dia
  import warnings
  import json
  import time
  from datetime import datetime
  import os

  try:
- from nari_tts import Dia
- DIA_AVAILABLE = True
  except ImportError:
- print("⚠️ Dia TTS not available, using fallback TTS")
- DIA_AVAILABLE = False
-
- warnings.filterwarnings("ignore")
-
-
-
-

  warnings.filterwarnings("ignore")

  # Global models
- ultravox_pipe = None
  qwen_model = None
  qwen_tokenizer = None
- dia_model = None
  conversation_history = []

  class ConversationManager:
@@ -47,13 +41,12 @@ class ConversationManager:
  "emotion": emotion
  })

- # Keep only last max_exchanges
  if len(self.history) > self.max_exchanges:
  self.history = self.history[-self.max_exchanges:]

  def get_context(self):
  context = ""
- for exchange in self.history[-3:]: # Last 3 exchanges for context
  context += f"User: {exchange['user']}\nAI: {exchange['ai']}\n"
  return context

@@ -62,32 +55,37 @@ class ConversationManager:
  self.current_emotion = "neutral"

  def load_models():
- """Load all models with optimized memory usage"""
- global ultravox_pipe, qwen_model, qwen_tokenizer, dia_model

- print("🚀 Loading Ultravox for ASR + Emotion Recognition...")
  try:
- ultravox_pipe = pipeline(
- model='fixie-ai/ultravox-v0_4',
- trust_remote_code=True,
- torch_dtype=torch.float16,
- device_map="auto"
  )
- print("✅ Ultravox loaded successfully!")
  except Exception as e:
- print(f"❌ Error loading Ultravox: {e}")
  return False

  print("🧠 Loading Qwen2.5-1.5B for conversation...")
  try:
  qwen_tokenizer = AutoTokenizer.from_pretrained(
- "Qwen/Qwen2.5-1.5B-Instruct",
  trust_remote_code=True
  )
  qwen_model = AutoModelForCausalLM.from_pretrained(
- "Qwen/Qwen2.5-1.5B-Instruct",
- torch_dtype=torch.float16,
- device_map="auto",
  trust_remote_code=True
  )
  print("✅ Qwen loaded successfully!")
@@ -95,52 +93,39 @@ def load_models():
  print(f"❌ Error loading Qwen: {e}")
  return False

- print("🎙️ Loading Enhanced Dia TTS...")
- try:
- dia_model = Dia.from_pretrained(
- "nari-labs/Dia-1.6B",
- compute_dtype="float16"
- )
- print("✅ Dia TTS loaded successfully!")
- except Exception as e:
- print(f"❌ Error loading Dia: {e}")
- return False

  return True

- def detect_emotion_from_speech(audio_input):
- """Extract emotion from speech using Ultravox understanding"""
- try:
- # Emotional keywords mapping
- emotion_keywords = {
- "happy": ["laugh", "excited", "joy", "great", "awesome", "wonderful"],
- "sad": ["cry", "upset", "disappointed", "sorry", "terrible"],
- "angry": ["mad", "furious", "annoyed", "frustrated"],
- "surprised": ["wow", "amazing", "incredible", "unbelievable"],
- "neutral": []
- }
-
- # Use Ultravox to understand speech context
- turns = [
- {"role": "system", "content": "Analyze the emotional tone of the user's speech. Respond with just the emotion: happy, sad, angry, surprised, or neutral."},
- ]
-
- result = ultravox_pipe({
- 'audio': audio_input,
- 'turns': turns,
- 'sampling_rate': 16000
- }, max_new_tokens=10)
-
- detected_emotion = result[0]['generated_text'].lower().strip()
-
- # Validate emotion
- valid_emotions = ["happy", "sad", "angry", "surprised", "neutral"]
- if detected_emotion not in valid_emotions:
- detected_emotion = "neutral"
-
- return detected_emotion
- except:
- return "neutral"

  def speech_to_text_with_emotion(audio_input):
  """Convert speech to text and detect emotion"""
@@ -148,35 +133,34 @@ def speech_to_text_with_emotion(audio_input):
  if audio_input is None:
  return "", "neutral"

- # Convert audio format if needed
  if isinstance(audio_input, tuple):
  sample_rate, audio_data = audio_input
- audio_data = audio_data.astype(np.float32)
  if len(audio_data.shape) > 1:
  audio_data = audio_data.mean(axis=1)
  else:
  audio_data = audio_input
  sample_rate = 16000

  # Resample to 16kHz if needed
  if sample_rate != 16000:
  audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

- # Speech to text using Ultravox
- turns = [
- {"role": "system", "content": "Transcribe the user's speech accurately. Only provide the transcription."},
- ]
-
- result = ultravox_pipe({
- 'audio': audio_data,
- 'turns': turns,
- 'sampling_rate': 16000
- }, max_new_tokens=100)
-
- transcription = result[0]['generated_text'].strip()

- # Detect emotion
- emotion = detect_emotion_from_speech(audio_data)

  return transcription, emotion

@@ -189,13 +173,13 @@ def generate_contextual_response(user_input, emotion, conversation_manager):
  try:
  context = conversation_manager.get_context()

- # Emotional system prompt
  emotional_prompts = {
- "happy": "Respond with enthusiasm and joy. Use exclamations and positive language.",
- "sad": "Respond with empathy and comfort. Be gentle and understanding.",
- "angry": "Respond calmly and try to de-escalate. Be patient and helpful.",
- "surprised": "Share in the surprise and excitement. Be engaging and curious.",
- "neutral": "Respond naturally and conversationally."
  }

  system_prompt = f"""You are Maya, a friendly and emotionally intelligent AI assistant.
@@ -207,10 +191,11 @@ def generate_contextual_response(user_input, emotion, conversation_manager):
  Current user emotion: {emotion}

  Guidelines:
- - Keep responses concise (1-2 sentences)
- - Match the user's emotional tone
  - Be natural and conversational
- - Include emotional expressions when appropriate like (laughs), (sighs), etc.
  """

  messages = [
@@ -225,14 +210,17 @@ def generate_contextual_response(user_input, emotion, conversation_manager):
  add_generation_prompt=True
  )

- model_inputs = qwen_tokenizer([text], return_tensors="pt").to(qwen_model.device)

  with torch.no_grad():
  generated_ids = qwen_model.generate(
  model_inputs.input_ids,
- max_new_tokens=100,
  do_sample=True,
  temperature=0.7,
  pad_token_id=qwen_tokenizer.eos_token_id
  )

@@ -246,46 +234,40 @@ def generate_contextual_response(user_input, emotion, conversation_manager):

  except Exception as e:
  print(f"Error in response generation: {e}")
- return "I'm sorry, I'm having trouble processing that right now."

- def text_to_speech_emotional(text, emotion="neutral", speaker="S1"):
- """Convert text to emotional speech using enhanced Dia"""
  try:
  # Clear GPU cache
  if torch.cuda.is_available():
  torch.cuda.empty_cache()

- # Emotional markers for Dia
- emotional_markers = {
- "happy": "(excited) ",
- "sad": "(sad) ",
- "angry": "(frustrated) ",
- "surprised": "(surprised) ",
  "neutral": ""
  }

- # Add emotional context and natural pauses
- enhanced_text = f"[{speaker}] {emotional_markers.get(emotion, '')}{text}"
-
- # Add natural breathing pauses for longer text
- if len(text) > 50:
- enhanced_text = enhanced_text.replace(". ", ". (pause) ")
- enhanced_text = enhanced_text.replace("! ", "! (pause) ")
- enhanced_text = enhanced_text.replace("? ", "? (pause) ")

- print(f"Generating TTS for: {enhanced_text[:100]}...")

  # Generate audio
- with torch.no_grad():
- audio_output = dia_model.generate(
- enhanced_text,
- use_torch_compile=False,
- verbose=False
- )

- # Process audio output
- if isinstance(audio_output, torch.Tensor):
- audio_output = audio_output.cpu().numpy()

  # Normalize audio
  if len(audio_output) > 0:
@@ -293,10 +275,11 @@ def text_to_speech_emotional(text, emotion="neutral", speaker="S1"):
  if max_val > 1.0:
  audio_output = audio_output / max_val * 0.95

- return (44100, audio_output)

  except Exception as e:
  print(f"Error in TTS: {e}")
  return None

  # Initialize conversation manager
@@ -308,19 +291,19 @@ def start_call():
  greeting_text = "Hello! I'm Maya, your AI assistant. How can I help you today?"
  greeting_audio = text_to_speech_emotional(greeting_text, "happy")

- return greeting_audio, greeting_text, "Call started! 📞"

  def process_conversation(audio_input):
  """Main conversation processing pipeline"""
  if audio_input is None:
- return None, "Please record some audio first.", "", "No audio input received."

  try:
  # Step 1: Speech to Text + Emotion Detection
  user_text, emotion = speech_to_text_with_emotion(audio_input)

  if not user_text or user_text.strip() == "":
- return None, "I didn't catch that. Could you please repeat?", "", "No speech detected."

  # Step 2: Generate contextual response
  ai_response = generate_contextual_response(user_text, emotion, conv_manager)
@@ -331,7 +314,7 @@ def process_conversation(audio_input):
  # Step 4: Update conversation history
  conv_manager.add_exchange(user_text, ai_response, emotion)

- status = f"✅ Processed | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5"

  return response_audio, ai_response, user_text, status

@@ -342,7 +325,7 @@ def process_conversation(audio_input):
  def get_conversation_history():
  """Return formatted conversation history"""
  if not conv_manager.history:
- return "No conversation history yet."

  history_text = "📋 **Conversation History:**\n\n"
  for i, exchange in enumerate(conv_manager.history, 1):
@@ -355,26 +338,33 @@ def get_conversation_history():

  def end_call():
  """End call and clear conversation"""
- farewell_text = "Thank you for talking with me! Have a great day!"
  farewell_audio = text_to_speech_emotional(farewell_text, "happy")
  conv_manager.clear()

- return farewell_audio, farewell_text, "Call ended. 📞❌"

- # Create Gradio Interface
  def create_interface():
  with gr.Blocks(
- title="Maya AI - Advanced Speech-to-Speech Assistant",
  theme=gr.themes.Soft(),
  css="""
  .call-button { background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important; }
- .record-button { background: linear-gradient(45deg, #45B7D1, #96CEB4) !important; }
  .end-button { background: linear-gradient(45deg, #FFA07A, #FF6347) !important; }
  """
  ) as demo:

  gr.HTML("""
- <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 20px;">
  <h1 style="color: white; margin: 0; font-size: 2.5em;">🎙️ Maya AI</h1>
  <p style="color: white; margin: 10px 0; font-size: 1.2em;">Advanced Speech-to-Speech Conversational AI</p>
  <p style="color: #E8E8E8; margin: 0;">Natural • Emotional • Contextual</p>
@@ -393,17 +383,17 @@ def create_interface():
  audio_input = gr.Audio(
  label="Record Your Message",
  sources=["microphone"],
- type="numpy",
- elem_classes="record-button"
  )

- process_btn = gr.Button("🎯 Process Message", variant="primary", size="lg")

- # Status
  status_display = gr.Textbox(
  label="📊 Status",
  interactive=False,
- lines=2
  )

  with gr.Column(scale=2):
@@ -421,23 +411,25 @@ def create_interface():
  user_text_display = gr.Textbox(
  label="👤 What You Said",
  interactive=False,
- lines=3
  )

  with gr.Column():
  ai_text_display = gr.Textbox(
  label="🤖 Maya's Response",
  interactive=False,
- lines=3
  )

- # Conversation History
  with gr.Row():
  with gr.Column():
  gr.HTML("<h3>📋 Conversation History</h3>")
  history_btn = gr.Button("📋 Show History", variant="secondary")
  history_display = gr.Markdown(
- value="No conversation history yet.",
  label="Conversation Log"
  )

@@ -463,21 +455,27 @@ def create_interface():
  outputs=[history_display]
  )

- # Usage Instructions
  gr.HTML("""
  <div style="margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 10px; border-left: 5px solid #007bff;">
  <h3>💡 How to Use Maya AI:</h3>
  <ol>
- <li><strong>Start Call:</strong> Click "📞 Start Call" to begin your conversation</li>
  <li><strong>Record:</strong> Use the microphone to record your message</li>
  <li><strong>Process:</strong> Click "🎯 Process Message" to get Maya's response</li>
  <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
- <li><strong>Continue:</strong> Keep the conversation going (up to 5 exchanges)</li>
  <li><strong>End:</strong> Click "📞❌ End Call" when finished</li>
  </ol>

- <h4>🎭 Emotional Features:</h4>
- <p>Maya automatically detects your emotions and responds accordingly with natural expressions, breathing pauses, and contextual understanding!</p>
  </div>
  """)

@@ -485,6 +483,13 @@ def create_interface():

  if __name__ == "__main__":
  print("🚀 Initializing Maya AI System...")

  if load_models():
  print("✅ All models loaded successfully!")
@@ -495,7 +500,8 @@ if __name__ == "__main__":
  server_name="0.0.0.0",
  server_port=7860,
  share=True,
- show_error=True
  )
  else:
- print("❌ Failed to load models. Please check your setup.")
  import librosa
  import soundfile as sf
  from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
  import warnings
  import json
  import time
  from datetime import datetime
  import os

+ # Import TTS with fallback
  try:
+ from TTS.api import TTS
+ TTS_AVAILABLE = True
  except ImportError:
+ print("⚠️ TTS not available, using text-only mode")
+ TTS_AVAILABLE = False

  warnings.filterwarnings("ignore")

  # Global models
+ asr_pipe = None
  qwen_model = None
  qwen_tokenizer = None
+ tts_model = None
  conversation_history = []

  class ConversationManager:

  "emotion": emotion
  })

  if len(self.history) > self.max_exchanges:
  self.history = self.history[-self.max_exchanges:]

  def get_context(self):
  context = ""
+ for exchange in self.history[-3:]:
  context += f"User: {exchange['user']}\nAI: {exchange['ai']}\n"
  return context

  self.current_emotion = "neutral"

  def load_models():
+ """Load all models with proper error handling"""
+ global asr_pipe, qwen_model, qwen_tokenizer, tts_model
+
+ print("🚀 Loading models...")

+ # Load ASR model
+ print("🎀 Loading Whisper for ASR...")
  try:
+ asr_pipe = pipeline(
+ "automatic-speech-recognition",
+ model="openai/whisper-base",
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+ device=0 if torch.cuda.is_available() else -1
  )
+ print("✅ Whisper ASR loaded successfully!")
  except Exception as e:
+ print(f"❌ Error loading Whisper: {e}")
  return False

+ # Load Qwen model
  print("🧠 Loading Qwen2.5-1.5B for conversation...")
  try:
+ model_name = "Qwen/Qwen2.5-1.5B-Instruct"
  qwen_tokenizer = AutoTokenizer.from_pretrained(
+ model_name,
  trust_remote_code=True
  )
  qwen_model = AutoModelForCausalLM.from_pretrained(
+ model_name,
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+ device_map="auto" if torch.cuda.is_available() else None,
  trust_remote_code=True
  )
  print("✅ Qwen loaded successfully!")

  print(f"❌ Error loading Qwen: {e}")
  return False

+ # Load TTS model
+ print("🎙️ Loading TTS model...")
+ if TTS_AVAILABLE:
+ try:
+ # Use Coqui TTS with a good female voice
+ tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
+ if torch.cuda.is_available():
+ tts_model = tts_model.to("cuda")
+ print("✅ TTS loaded successfully!")
+ except Exception as e:
+ print(f"⚠️ TTS failed to load: {e}")
+ tts_model = None
+ else:
+ print("⚠️ TTS not available, using text-only mode")
+ tts_model = None

  return True

+ def detect_emotion_from_text(text):
+ """Simple emotion detection from text"""
+ text_lower = text.lower()
+
+ # Emotion keywords
+ if any(word in text_lower for word in ['happy', 'great', 'awesome', 'wonderful', 'excited', 'laugh', 'amazing', 'fantastic']):
+ return 'happy'
+ elif any(word in text_lower for word in ['sad', 'upset', 'disappointed', 'cry', 'terrible', 'awful', 'depressed']):
+ return 'sad'
+ elif any(word in text_lower for word in ['angry', 'mad', 'furious', 'annoyed', 'frustrated', 'hate']):
+ return 'angry'
+ elif any(word in text_lower for word in ['wow', 'incredible', 'surprised', 'unbelievable', 'shocking']):
+ return 'surprised'
+ else:
+ return 'neutral'

  def speech_to_text_with_emotion(audio_input):
  """Convert speech to text and detect emotion"""

  if audio_input is None:
  return "", "neutral"

+ # Process audio input
  if isinstance(audio_input, tuple):
  sample_rate, audio_data = audio_input
+ # Convert to float32 and handle stereo
+ if audio_data.dtype != np.float32:
+ audio_data = audio_data.astype(np.float32)
  if len(audio_data.shape) > 1:
  audio_data = audio_data.mean(axis=1)
  else:
  audio_data = audio_input
  sample_rate = 16000

+ # Normalize audio
+ if len(audio_data) > 0:
+ max_val = np.max(np.abs(audio_data))
+ if max_val > 0:
+ audio_data = audio_data / max_val
+
  # Resample to 16kHz if needed
  if sample_rate != 16000:
  audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

+ # Speech to text
+ result = asr_pipe(audio_data, sampling_rate=16000)
+ transcription = result['text'].strip()

+ # Detect emotion from transcription
+ emotion = detect_emotion_from_text(transcription)

  return transcription, emotion

  try:
  context = conversation_manager.get_context()

+ # Emotional response styles
  emotional_prompts = {
+ "happy": "Respond with enthusiasm and joy. Use positive language and show excitement.",
+ "sad": "Respond with empathy and comfort. Be gentle, understanding, and supportive.",
+ "angry": "Respond calmly and try to help. Be patient and de-escalate the situation.",
+ "surprised": "Share in the surprise and show curiosity. Be engaging and interested.",
+ "neutral": "Respond naturally and conversationally. Be helpful and friendly."
  }

  system_prompt = f"""You are Maya, a friendly and emotionally intelligent AI assistant.

  Current user emotion: {emotion}

  Guidelines:
+ - Keep responses concise (1-2 sentences maximum)
+ - Match the user's emotional tone appropriately
  - Be natural and conversational
+ - Show empathy and understanding
+ - Provide helpful responses
  """

  messages = [

  add_generation_prompt=True
  )

+ model_inputs = qwen_tokenizer([text], return_tensors="pt")
+ if torch.cuda.is_available():
+ model_inputs = model_inputs.to(qwen_model.device)

  with torch.no_grad():
  generated_ids = qwen_model.generate(
  model_inputs.input_ids,
+ max_new_tokens=80,
  do_sample=True,
  temperature=0.7,
+ top_p=0.9,
  pad_token_id=qwen_tokenizer.eos_token_id
  )

  except Exception as e:
  print(f"Error in response generation: {e}")
+ return "I'm sorry, I'm having trouble processing that right now. Could you please try again?"

+ def text_to_speech_emotional(text, emotion="neutral"):
+ """Convert text to speech with emotional context"""
  try:
+ if tts_model is None:
+ print(f"🔊 Maya says ({emotion}): {text}")
+ return None
+
  # Clear GPU cache
  if torch.cuda.is_available():
  torch.cuda.empty_cache()

+ # Add emotional context to text
+ emotional_prefixes = {
+ "happy": "[Speaking with joy] ",
+ "sad": "[Speaking gently] ",
+ "angry": "[Speaking calmly] ",
+ "surprised": "[Speaking with excitement] ",
  "neutral": ""
  }

+ enhanced_text = f"{emotional_prefixes.get(emotion, '')}{text}"

+ print(f"Generating TTS for: {enhanced_text}")

  # Generate audio
+ audio_output = tts_model.tts(text=enhanced_text)

+ # Convert to numpy array if needed
+ if isinstance(audio_output, list):
+ audio_output = np.array(audio_output, dtype=np.float32)
+ elif torch.is_tensor(audio_output):
+ audio_output = audio_output.cpu().numpy().astype(np.float32)

  # Normalize audio
  if len(audio_output) > 0:

  if max_val > 1.0:
  audio_output = audio_output / max_val * 0.95

+ return (22050, audio_output) # Return sample rate and audio data

  except Exception as e:
  print(f"Error in TTS: {e}")
+ print(f"🔊 Maya says ({emotion}): {text}")
  return None

  # Initialize conversation manager

  greeting_text = "Hello! I'm Maya, your AI assistant. How can I help you today?"
  greeting_audio = text_to_speech_emotional(greeting_text, "happy")

+ return greeting_audio, greeting_text, "Call started! 📞 Ready to chat!"

  def process_conversation(audio_input):
  """Main conversation processing pipeline"""
  if audio_input is None:
+ return None, "Please record some audio first.", "", "❌ No audio input received."

  try:
  # Step 1: Speech to Text + Emotion Detection
  user_text, emotion = speech_to_text_with_emotion(audio_input)

  if not user_text or user_text.strip() == "":
+ return None, "I didn't catch that. Could you please repeat?", "", "❌ No speech detected."

  # Step 2: Generate contextual response
  ai_response = generate_contextual_response(user_text, emotion, conv_manager)

  # Step 4: Update conversation history
  conv_manager.add_exchange(user_text, ai_response, emotion)

+ status = f"✅ Processed successfully! | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5"

  return response_audio, ai_response, user_text, status

  def get_conversation_history():
  """Return formatted conversation history"""
  if not conv_manager.history:
+ return "No conversation history yet. Start a call to begin chatting!"

  history_text = "📋 **Conversation History:**\n\n"
  for i, exchange in enumerate(conv_manager.history, 1):

  def end_call():
  """End call and clear conversation"""
+ farewell_text = "Thank you for talking with me! Have a wonderful day!"
  farewell_audio = text_to_speech_emotional(farewell_text, "happy")
  conv_manager.clear()

+ return farewell_audio, farewell_text, "Call ended. 📞❌ Thanks for chatting!"

  def create_interface():
+ """Create the Gradio interface"""
  with gr.Blocks(
+ title="Maya AI - Speech-to-Speech Assistant",
  theme=gr.themes.Soft(),
  css="""
+ .main-header {
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+ border-radius: 15px;
+ padding: 20px;
+ text-align: center;
+ margin-bottom: 20px;
+ }
  .call-button { background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important; }
+ .process-button { background: linear-gradient(45deg, #45B7D1, #96CEB4) !important; }
  .end-button { background: linear-gradient(45deg, #FFA07A, #FF6347) !important; }
  """
  ) as demo:

  gr.HTML("""
+ <div class="main-header">
  <h1 style="color: white; margin: 0; font-size: 2.5em;">🎙️ Maya AI</h1>
  <p style="color: white; margin: 10px 0; font-size: 1.2em;">Advanced Speech-to-Speech Conversational AI</p>
  <p style="color: #E8E8E8; margin: 0;">Natural • Emotional • Contextual</p>

  audio_input = gr.Audio(
  label="Record Your Message",
  sources=["microphone"],
+ type="numpy"
  )

+ process_btn = gr.Button("🎯 Process Message", elem_classes="process-button", variant="primary", size="lg")

+ # Status Display
  status_display = gr.Textbox(
  label="📊 Status",
  interactive=False,
+ lines=2,
+ value="Ready to start! Click 'Start Call' to begin."
  )

  with gr.Column(scale=2):

  user_text_display = gr.Textbox(
  label="👤 What You Said",
  interactive=False,
+ lines=3,
+ placeholder="Your speech will appear here..."
  )

  with gr.Column():
  ai_text_display = gr.Textbox(
  label="🤖 Maya's Response",
  interactive=False,
+ lines=3,
+ placeholder="Maya's response will appear here..."
  )

+ # Conversation History Section
  with gr.Row():
  with gr.Column():
  gr.HTML("<h3>📋 Conversation History</h3>")
  history_btn = gr.Button("📋 Show History", variant="secondary")
  history_display = gr.Markdown(
+ value="No conversation history yet. Start a call to begin chatting!",
  label="Conversation Log"
  )

  outputs=[history_display]
  )

+ # Instructions
  gr.HTML("""
  <div style="margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 10px; border-left: 5px solid #007bff;">
  <h3>💡 How to Use Maya AI:</h3>
  <ol>
+ <li><strong>Start Call:</strong> Click "📞 Start Call" to initialize Maya</li>
  <li><strong>Record:</strong> Use the microphone to record your message</li>
  <li><strong>Process:</strong> Click "🎯 Process Message" to get Maya's response</li>
  <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
+ <li><strong>Continue:</strong> Keep chatting (up to 5 exchanges with context)</li>
  <li><strong>End:</strong> Click "📞❌ End Call" when finished</li>
  </ol>

+ <h4>🎭 Features:</h4>
+ <ul>
+ <li>🎀 <strong>Speech Recognition:</strong> Powered by Whisper</li>
+ <li>🧠 <strong>Smart Responses:</strong> Using Qwen2.5-1.5B</li>
+ <li>🎭 <strong>Emotion Detection:</strong> Automatic emotion recognition</li>
+ <li>🔊 <strong>Natural Speech:</strong> High-quality TTS with emotions</li>
+ <li>💭 <strong>Context Memory:</strong> Remembers conversation flow</li>
+ </ul>
  </div>
  """)

  if __name__ == "__main__":
  print("🚀 Initializing Maya AI System...")
+ print("🔧 Checking GPU availability...")
+
+ if torch.cuda.is_available():
+ print(f"✅ GPU detected: {torch.cuda.get_device_name()}")
+ print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
+ else:
+ print("⚠️ No GPU detected, using CPU")

  if load_models():
  print("✅ All models loaded successfully!")

  server_name="0.0.0.0",
  server_port=7860,
  share=True,
+ show_error=True,
+ debug=False
  )
  else:
+ print("❌ Failed to load models. Please check the logs above for details.")