Devakumar868 committed on
Commit 7ffc610 · verified · 1 Parent(s): f86c823

Update app.py

Files changed (1):
  1. app.py +441 -244
app.py CHANGED
@@ -1,291 +1,488 @@
1
- import os, torch, numpy as np, soundfile as sf, gradio as gr
2
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
3
- import nemo.collections.asr as nemo_asr
4
- from TTS.api import TTS
5
- from sklearn.linear_model import LogisticRegression
6
- from datasets import load_dataset
7
- import tempfile
8
- import gc
9
 
10
- # Configuration
11
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
12
- SEED = 42; SAMPLE_RATE = 22050; TEMPERATURE = 0.7
13
- torch.manual_seed(SEED); np.random.seed(SEED)
14
 
15
- print(f"πŸš€ System Info:")
16
- print(f"Device: {DEVICE}")
17
- print(f"NumPy: {np.__version__}")
18
- print(f"PyTorch: {torch.__version__}")
19
- if torch.cuda.is_available():
20
- print(f"CUDA: {torch.version.cuda}")
21
 
22
- class ConversationalAI:
23
- def __init__(self):
24
- print("πŸ”„ Initializing Conversational AI...")
25
- self.setup_models()
26
- print("βœ… All models loaded successfully!")
27
-
28
- def setup_models(self):
29
- # 1. ASR: Parakeet RNNT
30
- print("πŸ“’ Loading ASR model...")
31
- try:
32
- self.asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
33
- "nvidia/parakeet-rnnt-1.1b"
34
- ).to(DEVICE).eval()
35
- print("βœ… Parakeet ASR loaded")
36
- except Exception as e:
37
- print(f"⚠️ Parakeet failed: {e}")
38
- print("πŸ”„ Loading Whisper fallback...")
39
- self.asr_pipeline = pipeline(
40
- "automatic-speech-recognition",
41
- model="openai/whisper-base.en",
42
- device=0 if DEVICE == "cuda" else -1
43
- )
44
- print("βœ… Whisper ASR loaded")
45
-
46
- # 2. SER: Emotion classifier (simplified for demo)
47
- print("🎭 Setting up emotion recognition...")
48
- X_demo = np.random.rand(100, 128)
49
- y_demo = np.random.randint(0, 5, 100) # 5 emotions: neutral, happy, sad, angry, surprised
50
- self.ser_clf = LogisticRegression().fit(X_demo, y_demo)
51
- self.emotion_labels = ["neutral", "happy", "sad", "angry", "surprised"]
52
- print("βœ… SER model ready")
53
-
54
- # 3. LLM: Conversational model
55
- print("🧠 Loading LLM...")
56
- bnb_cfg = BitsAndBytesConfig(
57
- load_in_4bit=True,
58
- bnb_4bit_compute_dtype=torch.float16,
59
- bnb_4bit_use_double_quant=True,
60
- bnb_4bit_quant_type="nf4"
61
- )
62
 
63
- model_name = "microsoft/DialoGPT-medium"
64
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
65
- self.tokenizer.pad_token = self.tokenizer.eos_token
66
 
67
- self.llm_model = AutoModelForCausalLM.from_pretrained(
68
- model_name,
69
- quantization_config=bnb_cfg,
70
- device_map="auto",
71
  torch_dtype=torch.float16,
72
- low_cpu_mem_usage=True
73
  )
74
- print("βœ… LLM loaded")
75
-
76
- # 4. TTS: Text-to-Speech
77
- print("πŸ—£οΈ Loading TTS...")
78
- try:
79
- self.tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(DEVICE)
80
- print("βœ… TTS loaded")
81
- except Exception as e:
82
- print(f"⚠️ TTS error: {e}")
83
- self.tts = None
84
-
85
- # Memory cleanup
86
- if DEVICE == "cuda":
87
- torch.cuda.empty_cache()
88
- gc.collect()
89
 
90
- def transcribe(self, audio):
91
- """Convert speech to text"""
92
- try:
93
- if hasattr(self, 'asr_model'):
94
- # Use Parakeet
95
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
96
- sf.write(temp_file.name, audio[1], audio[0])
97
- transcription = self.asr_model.transcribe([temp_file.name])[0]
98
- os.unlink(temp_file.name)
99
- return transcription.text if hasattr(transcription, 'text') else str(transcription)
100
- else:
101
- # Use Whisper
102
- return self.asr_pipeline({"sampling_rate": audio[0], "raw": audio[1]})["text"]
103
- except Exception as e:
104
- print(f"ASR Error: {e}")
105
- return "Sorry, I couldn't understand the audio."
106
 
107
- def predict_emotion(self):
108
- """Predict emotion from audio (simplified demo)"""
109
- emotion_idx = self.ser_clf.predict(np.random.rand(1, 128))[0]
110
- return self.emotion_labels[emotion_idx]
111
 
112
- def generate_response(self, text, emotion):
113
- """Generate conversational response"""
114
- try:
115
- # Create emotion-aware prompt
116
- prompt = f"Human: {text}\nAssistant (feeling {emotion}):"
117
-
118
- inputs = self.tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True).to(DEVICE)
119
-
120
- with torch.no_grad():
121
- outputs = self.llm_model.generate(
122
- inputs,
123
- max_length=inputs.shape[1] + 100,
124
- temperature=TEMPERATURE,
125
- do_sample=True,
126
- pad_token_id=self.tokenizer.eos_token_id,
127
- no_repeat_ngram_size=2,
128
- top_p=0.9
129
- )
130
-
131
- response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
132
- response = response.split("Human:")[0].strip()
133
 
134
- return response if response else "I understand. Please tell me more."
135
- except Exception as e:
136
- print(f"LLM Error: {e}")
137
- return "I'm having trouble processing that. Could you please rephrase?"
138
 
139
- def synthesize(self, text):
140
- """Convert text to speech"""
141
- try:
142
- if self.tts:
143
- wav = self.tts.tts(text=text)
144
- if isinstance(wav, list):
145
- wav = np.array(wav, dtype=np.float32)
146
- # Normalize audio
147
- wav = wav / np.max(np.abs(wav)) if np.max(np.abs(wav)) > 0 else wav
148
- return (SAMPLE_RATE, (wav * 32767).astype(np.int16))
149
- else:
150
- # Return silence if TTS fails
151
- return (SAMPLE_RATE, np.zeros(SAMPLE_RATE, dtype=np.int16))
152
- except Exception as e:
153
- print(f"TTS Error: {e}")
154
- return (SAMPLE_RATE, np.zeros(SAMPLE_RATE, dtype=np.int16))
155
 
156
- def process_conversation(self, audio_input, chat_history):
157
- """Main pipeline: Speech -> Emotion -> LLM -> Speech"""
158
- if audio_input is None:
159
- return chat_history, None, ""
160
 
161
- try:
162
- # Step 1: Speech to Text
163
- user_text = self.transcribe(audio_input)
164
- if not user_text.strip():
165
- return chat_history, None, "No speech detected."
166
-
167
- # Step 2: Emotion Recognition
168
- emotion = self.predict_emotion()
169
-
170
- # Step 3: Generate Response
171
- ai_response = self.generate_response(user_text, emotion)
172
-
173
- # Step 4: Text to Speech
174
- audio_response = self.synthesize(ai_response)
175
-
176
- # Update chat history
177
- chat_history.append([user_text, ai_response])
178
-
179
- # Memory cleanup
180
- if DEVICE == "cuda":
181
- torch.cuda.empty_cache()
182
- gc.collect()
183
-
184
- return chat_history, audio_response, f"You said: {user_text} (detected emotion: {emotion})"
185
 
186
- except Exception as e:
187
- error_msg = f"Error processing conversation: {e}"
188
- print(error_msg)
189
- return chat_history, None, error_msg
190
 
191
- # Initialize AI system
192
- print("πŸš€ Starting Conversational AI...")
193
- ai_system = ConversationalAI()
194
 
195
- # Gradio Interface
196
  def create_interface():
197
  with gr.Blocks(
198
- title="Emotion-Aware Conversational AI",
199
- theme=gr.themes.Soft()
200
  ) as demo:
201
 
202
  gr.HTML("""
203
- <div style="text-align: center; margin-bottom: 2rem;">
204
- <h1>πŸ€– Emotion-Aware Conversational AI</h1>
205
- <p>Speak naturally and get intelligent responses with emotion recognition</p>
206
- </div>
 
207
  """)
208
 
209
  with gr.Row():
210
- with gr.Column(scale=2):
211
- chatbot = gr.Chatbot(
212
- label="Conversation History",
213
- height=400,
214
- show_copy_button=True
215
- )
216
 
217
  audio_input = gr.Audio(
218
- label="🎀 Speak to AI",
219
  sources=["microphone"],
220
  type="numpy",
221
- format="wav"
222
  )
223
 
224
- with gr.Row():
225
- submit_btn = gr.Button("πŸ’¬ Process Speech", variant="primary", scale=2)
226
- clear_btn = gr.Button("πŸ—‘οΈ Clear Chat", variant="secondary", scale=1)
227
-
228
- with gr.Column(scale=1):
229
- audio_output = gr.Audio(
230
- label="πŸ”Š AI Response",
231
- type="numpy",
232
- autoplay=True
233
- )
234
 
 
235
  status_display = gr.Textbox(
236
label="📊 Status",
237
- lines=3,
238
  interactive=False
239
  )
240
 
241
- gr.HTML(f"""
242
- <div style="padding: 1rem; background: #f0f9ff; border-radius: 0.5rem;">
243
- <h3>πŸ”§ System Info</h3>
244
- <p><strong>Device:</strong> {DEVICE.upper()}</p>
245
- <p><strong>PyTorch:</strong> {torch.__version__}</p>
246
- <p><strong>Models:</strong> Parakeet + DialoGPT + TTS</p>
247
- <p><strong>Features:</strong> Emotion Recognition</p>
248
- </div>
249
- """)
250
-
251
- def process_audio(audio, history):
252
- return ai_system.process_conversation(audio, history)
253
-
254
- def clear_conversation():
255
- if DEVICE == "cuda":
256
- torch.cuda.empty_cache()
257
- gc.collect()
258
- return [], None, "Conversation cleared."
259
-
260
- # Event handlers
261
- submit_btn.click(
262
- fn=process_audio,
263
- inputs=[audio_input, chatbot],
264
- outputs=[chatbot, audio_output, status_display]
265
  )
266
 
267
- clear_btn.click(
268
- fn=clear_conversation,
269
- outputs=[chatbot, audio_output, status_display]
 
270
  )
271
 
272
- audio_input.change(
273
- fn=process_audio,
274
- inputs=[audio_input, chatbot],
275
- outputs=[chatbot, audio_output, status_display]
276
  )
277
 
278
  return demo
279
 
280
- # Launch application
281
  if __name__ == "__main__":
282
- print("🌟 Creating interface...")
283
- demo = create_interface()
284
 
285
- print("πŸš€ Launching application...")
286
- demo.launch(
287
- server_name="0.0.0.0",
288
- server_port=7860,
289
- share=True,
290
- show_error=True
291
- )
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import librosa
5
+ import soundfile as sf
6
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
7
+ from dia.model import Dia
8
+ import warnings
9
+ import json
10
+ import time
11
+ from datetime import datetime
12
+ import os
13
 
14
+ warnings.filterwarnings("ignore")
15
 
16
+ # Global models
17
+ ultravox_pipe = None
18
+ qwen_model = None
19
+ qwen_tokenizer = None
20
+ dia_model = None
21
+ conversation_history = []
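+ # (currently unused: per-call state is tracked by the ConversationManager instance defined below)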
22
 
23
+ class ConversationManager:
24
+ def __init__(self, max_exchanges=5):
25
+ self.history = []
26
+ self.max_exchanges = max_exchanges
27
+ self.current_emotion = "neutral"
28
 
29
+ def add_exchange(self, user_input, ai_response, emotion="neutral"):
30
+ self.history.append({
31
+ "timestamp": datetime.now().isoformat(),
32
+ "user": user_input,
33
+ "ai": ai_response,
34
+ "emotion": emotion
35
+ })
36
 
37
+ # Keep only last max_exchanges
38
+ if len(self.history) > self.max_exchanges:
39
+ self.history = self.history[-self.max_exchanges:]
40
+
41
+ def get_context(self):
42
+ context = ""
43
+ for exchange in self.history[-3:]: # Last 3 exchanges for context
44
+ context += f"User: {exchange['user']}\nAI: {exchange['ai']}\n"
45
+ return context
46
+
47
+ def clear(self):
48
+ self.history = []
49
+ self.current_emotion = "neutral"
50
+
51
+ def load_models():
52
+ """Load all models with optimized memory usage"""
53
+ global ultravox_pipe, qwen_model, qwen_tokenizer, dia_model
54
+
55
+ print("🚀 Loading Ultravox for ASR + Emotion Recognition...")
56
+ try:
57
+ ultravox_pipe = pipeline(
58
+ model='fixie-ai/ultravox-v0_4',
59
+ trust_remote_code=True,
60
  torch_dtype=torch.float16,
61
+ device_map="auto"
62
  )
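+ # Ultravox is a speech-and-text (audio-language) pipeline; the calls below hand it raw 16 kHz audio plus chat-style "turns" and read back generated text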
63
+ print("✅ Ultravox loaded successfully!")
64
+ except Exception as e:
65
+ print(f"❌ Error loading Ultravox: {e}")
66
+ return False
67
 
68
+ print("🧠 Loading Qwen2.5-1.5B for conversation...")
69
+ try:
70
+ qwen_tokenizer = AutoTokenizer.from_pretrained(
71
+ "Qwen/Qwen2.5-1.5B-Instruct",
72
+ trust_remote_code=True
73
+ )
74
+ qwen_model = AutoModelForCausalLM.from_pretrained(
75
+ "Qwen/Qwen2.5-1.5B-Instruct",
76
+ torch_dtype=torch.float16,
77
+ device_map="auto",
78
+ trust_remote_code=True
79
+ )
80
+ print("✅ Qwen loaded successfully!")
81
+ except Exception as e:
82
+ print(f"❌ Error loading Qwen: {e}")
83
+ return False
84
 
85
+ print("🎙️ Loading Enhanced Dia TTS...")
86
+ try:
87
+ dia_model = Dia.from_pretrained(
88
+ "nari-labs/Dia-1.6B",
89
+ compute_dtype="float16"
90
+ )
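+ # Dia generates dialogue-style speech; the TTS helper below prepends speaker tags like [S1] and parenthesized cues such as (excited)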
91
+ print("✅ Dia TTS loaded successfully!")
92
+ except Exception as e:
93
+ print(f"❌ Error loading Dia: {e}")
94
+ return False
95
 
96
+ return True
97
+
98
+ def detect_emotion_from_speech(audio_input):
99
+ """Extract emotion from speech using Ultravox understanding"""
100
+ try:
101
+ # Emotional keywords mapping
102
+ emotion_keywords = {
103
+ "happy": ["laugh", "excited", "joy", "great", "awesome", "wonderful"],
104
+ "sad": ["cry", "upset", "disappointed", "sorry", "terrible"],
105
+ "angry": ["mad", "furious", "annoyed", "frustrated"],
106
+ "surprised": ["wow", "amazing", "incredible", "unbelievable"],
107
+ "neutral": []
108
+ }
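+ # Note: the keyword map above is not consulted below; the emotion label comes from prompting Ultravox directly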
109
+
110
+ # Use Ultravox to understand speech context
111
+ turns = [
112
+ {"role": "system", "content": "Analyze the emotional tone of the user's speech. Respond with just the emotion: happy, sad, angry, surprised, or neutral."},
113
+ ]
114
+
115
+ result = ultravox_pipe({
116
+ 'audio': audio_input,
117
+ 'turns': turns,
118
+ 'sampling_rate': 16000
119
+ }, max_new_tokens=10)
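+ # max_new_tokens=10 keeps the reply to a short emotion label, which is then checked against valid_emotions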
120
+
121
+ detected_emotion = result[0]['generated_text'].lower().strip()
122
+
123
+ # Validate emotion
124
+ valid_emotions = ["happy", "sad", "angry", "surprised", "neutral"]
125
+ if detected_emotion not in valid_emotions:
126
+ detected_emotion = "neutral"
127
 
128
+ return detected_emotion
129
+ except Exception:
130
+ return "neutral"
131
+
132
+ def speech_to_text_with_emotion(audio_input):
133
+ """Convert speech to text and detect emotion"""
134
+ try:
135
+ if audio_input is None:
136
+ return "", "neutral"
137
+
138
+ # Convert audio format if needed
139
+ if isinstance(audio_input, tuple):
140
+ sample_rate, audio_data = audio_input
141
+ audio_data = audio_data.astype(np.float32)
+ # Gradio microphone audio is int16 PCM; scale to [-1, 1] floats before passing to the model
+ if audio_data.size and np.abs(audio_data).max() > 1.0:
+ audio_data = audio_data / 32768.0
142
+ if len(audio_data.shape) > 1:
143
+ audio_data = audio_data.mean(axis=1)
144
+ else:
145
+ audio_data = audio_input
146
+ sample_rate = 16000
147
+
148
+ # Resample to 16kHz if needed
149
+ if sample_rate != 16000:
150
+ audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
151
+
152
+ # Speech to text using Ultravox
153
+ turns = [
154
+ {"role": "system", "content": "Transcribe the user's speech accurately. Only provide the transcription."},
155
+ ]
156
+
157
+ result = ultravox_pipe({
158
+ 'audio': audio_data,
159
+ 'turns': turns,
160
+ 'sampling_rate': 16000
161
+ }, max_new_tokens=100)
162
+
163
+ transcription = result[0]['generated_text'].strip()
164
+
165
+ # Detect emotion
166
+ emotion = detect_emotion_from_speech(audio_data)
167
+
168
+ return transcription, emotion
169
+
170
+ except Exception as e:
171
+ print(f"Error in STT: {e}")
172
+ return "Sorry, I couldn't understand that.", "neutral"
173
+
174
+ def generate_contextual_response(user_input, emotion, conversation_manager):
175
+ """Generate contextual response using Qwen"""
176
+ try:
177
+ context = conversation_manager.get_context()
178
+
179
+ # Emotional system prompt
180
+ emotional_prompts = {
181
+ "happy": "Respond with enthusiasm and joy. Use exclamations and positive language.",
182
+ "sad": "Respond with empathy and comfort. Be gentle and understanding.",
183
+ "angry": "Respond calmly and try to de-escalate. Be patient and helpful.",
184
+ "surprised": "Share in the surprise and excitement. Be engaging and curious.",
185
+ "neutral": "Respond naturally and conversationally."
186
+ }
187
+
188
+ system_prompt = f"""You are Maya, a friendly and emotionally intelligent AI assistant.
189
+ {emotional_prompts.get(emotion, emotional_prompts['neutral'])}
190
+
191
+ Previous conversation context:
192
+ {context}
193
+
194
+ Current user emotion: {emotion}
195
+
196
+ Guidelines:
197
+ - Keep responses concise (1-2 sentences)
198
+ - Match the user's emotional tone
199
+ - Be natural and conversational
200
+ - Include emotional expressions when appropriate like (laughs), (sighs), etc.
201
+ """
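+ # The system prompt above folds in the last few exchanges plus an emotion-specific style instruction before the user's turn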
202
+
203
+ messages = [
204
+ {"role": "system", "content": system_prompt},
205
+ {"role": "user", "content": user_input}
206
+ ]
207
+
208
+ # Generate response
209
+ text = qwen_tokenizer.apply_chat_template(
210
+ messages,
211
+ tokenize=False,
212
+ add_generation_prompt=True
213
+ )
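+ # apply_chat_template formats the messages in Qwen's chat format; add_generation_prompt=True appends the assistant header so the model continues as Maya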
214
+
215
+ model_inputs = qwen_tokenizer([text], return_tensors="pt").to(qwen_model.device)
216
+
217
+ with torch.no_grad():
218
+ generated_ids = qwen_model.generate(
219
+ model_inputs.input_ids,
220
+ max_new_tokens=100,
221
+ do_sample=True,
222
+ temperature=0.7,
223
+ pad_token_id=qwen_tokenizer.eos_token_id
224
+ )
225
+
226
+ generated_ids = [
227
+ output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
228
+ ]
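+ # Slice off the prompt tokens so only the newly generated completion is decoded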
229
+
230
+ response = qwen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
231
+
232
+ return response.strip()
233
+
234
+ except Exception as e:
235
+ print(f"Error in response generation: {e}")
236
+ return "I'm sorry, I'm having trouble processing that right now."
237
+
238
+ def text_to_speech_emotional(text, emotion="neutral", speaker="S1"):
239
+ """Convert text to emotional speech using enhanced Dia"""
240
+ try:
241
+ # Clear GPU cache
242
+ if torch.cuda.is_available():
243
+ torch.cuda.empty_cache()
244
+
245
+ # Emotional markers for Dia
246
+ emotional_markers = {
247
+ "happy": "(excited) ",
248
+ "sad": "(sad) ",
249
+ "angry": "(frustrated) ",
250
+ "surprised": "(surprised) ",
251
+ "neutral": ""
252
+ }
253
+
254
+ # Add emotional context and natural pauses
255
+ enhanced_text = f"[{speaker}] {emotional_markers.get(emotion, '')}{text}"
256
+
257
+ # Add natural breathing pauses for longer text
258
+ if len(text) > 50:
259
+ enhanced_text = enhanced_text.replace(". ", ". (pause) ")
260
+ enhanced_text = enhanced_text.replace("! ", "! (pause) ")
261
+ enhanced_text = enhanced_text.replace("? ", "? (pause) ")
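+ # The "(pause)" markers inserted above are meant as non-verbal cues for Dia to render short breaks in longer replies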
262
+
263
+ print(f"Generating TTS for: {enhanced_text[:100]}...")
264
+
265
+ # Generate audio
266
+ with torch.no_grad():
267
+ audio_output = dia_model.generate(
268
+ enhanced_text,
269
+ use_torch_compile=False,
270
+ verbose=False
271
+ )
272
+
273
+ # Process audio output
274
+ if isinstance(audio_output, torch.Tensor):
275
+ audio_output = audio_output.cpu().numpy()
276
+
277
+ # Normalize audio
278
+ if len(audio_output) > 0:
279
+ max_val = np.max(np.abs(audio_output))
280
+ if max_val > 1.0:
281
+ audio_output = audio_output / max_val * 0.95
282
+
283
+ return (44100, audio_output)
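+ # Dia produces 44.1 kHz audio, so the Gradio Audio output is fed a (44100, waveform) tuple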
284
+
285
+ except Exception as e:
286
+ print(f"Error in TTS: {e}")
287
+ return None
288
+
289
+ # Initialize conversation manager
290
+ conv_manager = ConversationManager()
291
+
292
+ def start_call():
293
+ """Initialize call and return greeting"""
294
+ conv_manager.clear()
295
+ greeting_text = "Hello! I'm Maya, your AI assistant. How can I help you today?"
296
+ greeting_audio = text_to_speech_emotional(greeting_text, "happy")
297
 
298
+ return greeting_audio, greeting_text, "Call started! 📞"
299
+
300
+ def process_conversation(audio_input):
301
+ """Main conversation processing pipeline"""
302
+ if audio_input is None:
303
+ return None, "Please record some audio first.", "", "No audio input received."
304
 
305
+ try:
306
+ # Step 1: Speech to Text + Emotion Detection
307
+ user_text, emotion = speech_to_text_with_emotion(audio_input)
 
308
 
309
+ if not user_text or user_text.strip() == "":
310
+ return None, "I didn't catch that. Could you please repeat?", "", "No speech detected."
311
+
312
+ # Step 2: Generate contextual response
313
+ ai_response = generate_contextual_response(user_text, emotion, conv_manager)
314
+
315
+ # Step 3: Convert to speech
316
+ response_audio = text_to_speech_emotional(ai_response, emotion)
317
+
318
+ # Step 4: Update conversation history
319
+ conv_manager.add_exchange(user_text, ai_response, emotion)
320
 
321
+ status = f"✅ Processed | Emotion: {emotion} | Exchange: {len(conv_manager.history)}/5"
322
+
323
+ return response_audio, ai_response, user_text, status
324
+
325
+ except Exception as e:
326
+ error_msg = f"❌ Error processing conversation: {str(e)}"
327
+ return None, "I'm sorry, I encountered an error. Please try again.", "", error_msg
328
+
329
+ def get_conversation_history():
330
+ """Return formatted conversation history"""
331
+ if not conv_manager.history:
332
+ return "No conversation history yet."
333
+
334
+ history_text = "📋 **Conversation History:**\n\n"
335
+ for i, exchange in enumerate(conv_manager.history, 1):
336
+ timestamp = exchange['timestamp'][:19].replace('T', ' ')
337
+ history_text += f"👀 **You:** {exchange['user']}\n"
338
+ history_text += f"πŸ‘€ **You:** {exchange['user']}\n"
339
+ history_text += f"🤖 **Maya:** {exchange['ai']}\n\n"
340
+
341
+ return history_text
342
 
343
+ def end_call():
344
+ """End call and clear conversation"""
345
+ farewell_text = "Thank you for talking with me! Have a great day!"
346
+ farewell_audio = text_to_speech_emotional(farewell_text, "happy")
347
+ conv_manager.clear()
348
+
349
+ return farewell_audio, farewell_text, "Call ended. 📞❌"
350
 
351
+ # Create Gradio Interface
352
  def create_interface():
353
  with gr.Blocks(
354
+ title="Maya AI - Advanced Speech-to-Speech Assistant",
355
+ theme=gr.themes.Soft(),
356
+ css="""
357
+ .call-button { background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important; }
358
+ .record-button { background: linear-gradient(45deg, #45B7D1, #96CEB4) !important; }
359
+ .end-button { background: linear-gradient(45deg, #FFA07A, #FF6347) !important; }
360
+ """
361
  ) as demo:
362
 
363
  gr.HTML("""
364
+ <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 20px;">
365
+ <h1 style="color: white; margin: 0; font-size: 2.5em;">🎙️ Maya AI</h1>
366
+ <p style="color: white; margin: 10px 0; font-size: 1.2em;">Advanced Speech-to-Speech Conversational AI</p>
367
+ <p style="color: #E8E8E8; margin: 0;">Natural • Emotional • Contextual</p>
368
+ </div>
369
  """)
370
 
371
  with gr.Row():
372
+ with gr.Column(scale=1):
373
+ # Call Controls
374
+ gr.HTML("<h3>📞 Call Controls</h3>")
375
+ start_btn = gr.Button("📞 Start Call", elem_classes="call-button", size="lg")
376
+ end_btn = gr.Button("📞❌ End Call", elem_classes="end-button", size="lg")
 
377
 
378
+ # Audio Input
379
+ gr.HTML("<h3>🎀 Voice Input</h3>")
380
  audio_input = gr.Audio(
381
+ label="Record Your Message",
382
  sources=["microphone"],
383
  type="numpy",
384
+ elem_classes="record-button"
385
  )
386
 
387
+ process_btn = gr.Button("🎯 Process Message", variant="primary", size="lg")
388
 
389
+ # Status
390
  status_display = gr.Textbox(
391
label="📊 Status",
392
+ interactive=False,
393
+ lines=2
394
+ )
395
+
396
+ with gr.Column(scale=2):
397
+ # AI Response Audio
398
+ gr.HTML("<h3>🔊 Maya's Response</h3>")
399
+ response_audio = gr.Audio(
400
+ label="Maya's Voice Response",
401
+ type="numpy",
402
  interactive=False
403
  )
404
 
405
+ # Text Displays
406
+ with gr.Row():
407
+ with gr.Column():
408
+ user_text_display = gr.Textbox(
409
+ label="👀 What You Said",
410
+ interactive=False,
411
+ lines=3
412
+ )
413
+
414
+ with gr.Column():
415
+ ai_text_display = gr.Textbox(
416
+ label="🤖 Maya's Response",
417
+ interactive=False,
418
+ lines=3
419
+ )
420
+
421
+ # Conversation History
422
+ with gr.Row():
423
+ with gr.Column():
424
+ gr.HTML("<h3>📋 Conversation History</h3>")
425
+ history_btn = gr.Button("📋 Show History", variant="secondary")
426
+ history_display = gr.Markdown(
427
+ value="No conversation history yet.",
428
+ label="Conversation Log"
429
+ )
430
+
431
+ # Event Handlers
432
+ start_btn.click(
433
+ fn=start_call,
434
+ outputs=[response_audio, ai_text_display, status_display]
435
  )
436
 
437
+ process_btn.click(
438
+ fn=process_conversation,
439
+ inputs=[audio_input],
440
+ outputs=[response_audio, ai_text_display, user_text_display, status_display]
441
  )
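+ # Output order matches the 4-tuple returned by process_conversation: (audio, ai_text, user_text, status)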
442
 
443
+ end_btn.click(
444
+ fn=end_call,
445
+ outputs=[response_audio, ai_text_display, status_display]
 
446
  )
447
+
448
+ history_btn.click(
449
+ fn=get_conversation_history,
450
+ outputs=[history_display]
451
+ )
452
+
453
+ # Usage Instructions
454
+ gr.HTML("""
455
+ <div style="margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 10px; border-left: 5px solid #007bff;">
456
+ <h3>💡 How to Use Maya AI:</h3>
457
+ <ol>
458
+ <li><strong>Start Call:</strong> Click "📞 Start Call" to begin your conversation</li>
459
+ <li><strong>Record:</strong> Use the microphone to record your message</li>
460
+ <li><strong>Process:</strong> Click "🎯 Process Message" to get Maya's response</li>
461
+ <li><strong>Listen:</strong> Maya will respond with natural, emotional speech</li>
462
+ <li><strong>Continue:</strong> Keep the conversation going (up to 5 exchanges)</li>
463
+ <li><strong>End:</strong> Click "📞❌ End Call" when finished</li>
464
+ </ol>
465
+
466
+ <h4>🎭 Emotional Features:</h4>
467
+ <p>Maya automatically detects your emotions and responds accordingly with natural expressions, breathing pauses, and contextual understanding!</p>
468
+ </div>
469
+ """)
470
 
471
  return demo
472
 
 
473
  if __name__ == "__main__":
474
+ print("🚀 Initializing Maya AI System...")
 
475
 
476
+ if load_models():
477
+ print("✅ All models loaded successfully!")
478
+ print("🌟 Launching Maya AI Interface...")
479
+
480
+ demo = create_interface()
481
+ demo.launch(
482
+ server_name="0.0.0.0",
483
+ server_port=7860,
484
+ share=True,
485
+ show_error=True
486
+ )
487
+ else:
488
+ print("❌ Failed to load models. Please check your setup.")