Devakumar868 committed
Commit a8603f3 · verified · 1 Parent(s): 2a75609

Update app.py

Files changed (1):
  1. app.py +96 -401
app.py CHANGED
@@ -13,7 +13,7 @@ from collections import deque
  import psutil
  import gc

- # Import models
+ # Models and pipelines
  from dia.model import Dia
  from transformers import pipeline
  import webrtcvad
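
Note on the hunk above: the `pipeline` import is what the `EmotionRecognizer` in the next hunk wraps. A minimal sketch of that usage, assuming the classifier is built with the `audio-classification` task (the task string itself is not visible in this diff):

    from transformers import pipeline
    import numpy as np

    # Assumed task string; the model name is the one shown in the next hunk.
    clf = pipeline(
        "audio-classification",
        model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
        device=-1,  # CPU is enough for a smoke test
    )
    audio = np.zeros(16000, dtype=np.float32)   # 1 s of silence at 16 kHz
    preds = clf(audio)                          # list of {"label": ..., "score": ...}, best first
    print(preds[0]["label"] if preds else "neutral")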
@@ -38,56 +38,41 @@ class EmotionRecognizer:
  model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
  device=0 if torch.cuda.is_available() else -1
  )
-
  def detect_emotion(self, audio: np.ndarray, sample_rate: int = 16000) -> str:
  try:
  result = self.emotion_pipeline({"array": audio, "sampling_rate": sample_rate})
  return result[0]["label"] if result else "neutral"
- except Exception as e:
- print(f"Emotion detection error: {e}")
+ except Exception:
  return "neutral"

  class VADProcessor:
  def __init__(self, aggressiveness: int = 2):
  self.vad = webrtcvad.Vad(aggressiveness)
  self.sample_rate = 16000
- self.frame_duration = 30 # ms
+ self.frame_duration = 30
  self.frame_size = int(self.sample_rate * self.frame_duration / 1000)
-
+
  def is_speech(self, audio: np.ndarray) -> bool:
- try:
- # Convert to 16-bit PCM
- audio_int16 = (audio * 32767).astype(np.int16)
-
- # Process in frames
- frames = []
- for i in range(0, len(audio_int16) - self.frame_size, self.frame_size):
- frame = audio_int16[i:i + self.frame_size].tobytes()
- frames.append(self.vad.is_speech(frame, self.sample_rate))
-
- # Return True if majority of frames contain speech
- return sum(frames) > len(frames) * 0.3
- except Exception:
- return True # Default to treating as speech
+ audio_int16 = (audio * 32767).astype(np.int16)
+ frames = []
+ for i in range(0, len(audio_int16) - self.frame_size, self.frame_size):
+ frame = audio_int16[i : i + self.frame_size].tobytes()
+ frames.append(self.vad.is_speech(frame, self.sample_rate))
+ return sum(frames) > len(frames) * 0.3

  class ConversationManager:
  def __init__(self, max_exchanges: int = 50):
  self.conversations: Dict[str, deque] = {}
  self.max_exchanges = max_exchanges
  self.lock = threading.RLock()
-
  def add_turn(self, session_id: str, turn: ConversationTurn):
  with self.lock:
  if session_id not in self.conversations:
  self.conversations[session_id] = deque(maxlen=self.max_exchanges)
  self.conversations[session_id].append(turn)
-
  def get_context(self, session_id: str, last_n: int = 5) -> List[ConversationTurn]:
  with self.lock:
- if session_id not in self.conversations:
- return []
- return list(self.conversations[session_id])[-last_n:]
-
+ return list(self.conversations.get(session_id, []))[-last_n:]
  def clear_session(self, session_id: str):
  with self.lock:
  if session_id in self.conversations:
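
Note on the hunk above: the rewritten `is_speech` drops the old try/except, so inputs must already satisfy webrtcvad's contract: 16-bit mono PCM at 8, 16, 32 or 48 kHz, delivered in 10, 20 or 30 ms frames. A self-contained sketch of that frame handling with the same 30% speech threshold:

    import numpy as np
    import webrtcvad

    vad = webrtcvad.Vad(2)                      # aggressiveness 0-3, 2 as in the diff
    sample_rate = 16000
    frame_ms = 30                               # webrtcvad only accepts 10, 20 or 30 ms frames
    frame_size = sample_rate * frame_ms // 1000

    audio = np.zeros(sample_rate, dtype=np.float32)   # 1 s of silence
    pcm = (audio * 32767).astype(np.int16)            # 16-bit PCM as required
    flags = [
        vad.is_speech(pcm[i:i + frame_size].tobytes(), sample_rate)
        for i in range(0, len(pcm) - frame_size, frame_size)
    ]
    print(sum(flags) > len(flags) * 0.3)        # False for silence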
@@ -97,25 +82,16 @@ class SupernaturalAI:
  def __init__(self):
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
  self.models_loaded = False
- self.processing_queue = queue.Queue()
  self.conversation_manager = ConversationManager()
+ self.processing_times = deque(maxlen=100)
  self.emotion_recognizer = None
  self.vad_processor = VADProcessor()
-
- # Models
  self.ultravox_model = None
  self.dia_model = None
-
- # Performance tracking
- self.active_sessions = set()
- self.processing_times = deque(maxlen=100)
-
- print("Initializing Supernatural AI...")
  self._initialize_models()
-
+
  def _initialize_models(self):
  try:
- print("Loading Ultravox model...")
  self.ultravox_model = pipeline(
  'automatic-speech-recognition',
  model='fixie-ai/ultravox-v0_2',
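
Note on the hunk above: `processing_times` moves to a `deque(maxlen=100)`, the same bounded-buffer pattern already used for per-session history, so old entries are discarded automatically. A minimal illustration:

    from collections import deque

    times = deque(maxlen=3)
    for t in (0.41, 0.39, 0.44, 0.52):
        times.append(t)             # appending beyond maxlen drops the oldest entry
    print(list(times))              # [0.39, 0.44, 0.52]
    print(sum(times) / len(times))  # rolling average over the retained window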
@@ -123,386 +99,105 @@ class SupernaturalAI:
  device=0 if torch.cuda.is_available() else -1,
  torch_dtype=torch.float16
  )
-
- print("Loading Dia TTS model...")
  self.dia_model = Dia.from_pretrained(
- "nari-labs/Dia-1.6B",
- compute_dtype="float16"
+ "nari-labs/Dia-1.6B", compute_dtype="float16"
  )
-
- print("Loading emotion recognition...")
  self.emotion_recognizer = EmotionRecognizer()
-
  self.models_loaded = True
- print("✅ All models loaded successfully!")
-
- # Memory cleanup
  if torch.cuda.is_available():
  torch.cuda.empty_cache()
-
  except Exception as e:
- print(f" Error loading models: {e}")
+ print(f"Model load error: {e}")
  self.models_loaded = False
-
- def _get_memory_usage(self) -> Dict[str, float]:
- """Get current memory usage statistics"""
- memory = psutil.virtual_memory()
- gpu_memory = {}
-
- if torch.cuda.is_available():
- for i in range(torch.cuda.device_count()):
- gpu_memory[f"GPU_{i}"] = {
- "allocated": torch.cuda.memory_allocated(i) / 1024**3,
- "cached": torch.cuda.memory_reserved(i) / 1024**3
- }
-
- return {
- "RAM": memory.percent,
- "GPU": gpu_memory
- }
-
- def _generate_contextual_prompt(self,
- user_text: str,
- emotion: str,
- context: List[ConversationTurn]) -> str:
- """Generate contextual prompt with emotion and conversation history"""
-
- # Build context from previous turns
- context_text = ""
- if context:
- for turn in context[-3:]: # Last 3 exchanges
- context_text += f"[S1] {turn.user_text} [S2] {turn.ai_response_text} "
-
- # Emotion-aware response generation
- emotion_modifiers = {
- "happy": "(cheerful)",
- "sad": "(sympathetic)",
- "angry": "(calming)",
- "fear": "(reassuring)",
- "surprise": "(excited)",
- "neutral": ""
- }
-
- modifier = emotion_modifiers.get(emotion.lower(), "")
-
- # Create supernatural AI personality
- prompt = f"{context_text}[S1] {user_text} [S2] {modifier} As a supernatural AI with deep emotional understanding, I sense your {emotion} energy. "
-
- return prompt
-
- def process_audio_input(self,
- audio_data: Tuple[int, np.ndarray],
- session_id: str) -> Tuple[Optional[Tuple[int, np.ndarray]], str, str]:
- """Main processing pipeline for audio input"""
-
- if not self.models_loaded:
- return None, "❌ Models not loaded", "Please wait for initialization"
-
- if audio_data is None:
- return None, "❌ No audio received", "Please record some audio"
-
- start_time = time.time()
-
+
+ def process_audio_input(self, audio_data: Tuple[int, np.ndarray], session_id: str):
+ if not self.models_loaded or audio_data is None:
+ return None, "Models not ready", "Please wait"
+ start = time.time()
+ sample_rate, audio = audio_data
+ if len(audio.shape) > 1:
+ audio = np.mean(audio, axis=1)
+ audio = audio.astype(np.float32)
+ if np.max(np.abs(audio)) > 0:
+ audio = audio / np.max(np.abs(audio)) * 0.95
+ if not self.vad_processor.is_speech(audio):
+ return None, "No speech detected", "Speak clearly"
+
+ if sample_rate != 16000:
+ audio = librosa.resample(audio, sample_rate, 16000)
+ sample_rate = 16000
+
  try:
- sample_rate, audio = audio_data
-
- # Ensure audio is mono and proper format
- if len(audio.shape) > 1:
- audio = np.mean(audio, axis=1)
-
- # Normalize audio
- audio = audio.astype(np.float32)
- if np.max(np.abs(audio)) > 0:
- audio = audio / np.max(np.abs(audio)) * 0.95
-
- # Voice Activity Detection
- if not self.vad_processor.is_speech(audio):
- return None, "🔇 No speech detected", "Please speak clearly"
-
- # Resample if needed
- if sample_rate != 16000:
- audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
- sample_rate = 16000
-
- # Speech Recognition with Ultravox
- try:
- speech_result = self.ultravox_model({
- 'array': audio,
- 'sampling_rate': sample_rate
- })
- user_text = speech_result.get('text', '').strip()
-
- if not user_text:
- return None, "❌ Could not understand speech", "Please speak more clearly"
-
- except Exception as e:
- print(f"ASR Error: {e}")
- return None, f"❌ Speech recognition failed: {str(e)}", "Please try again"
-
- # Emotion Recognition
- emotion = self.emotion_recognizer.detect_emotion(audio, sample_rate)
-
- # Get conversation context
- context = self.conversation_manager.get_context(session_id)
-
- # Generate contextual response
- prompt = self._generate_contextual_prompt(user_text, emotion, context)
-
- # Generate speech with Dia TTS
- try:
- with torch.no_grad():
- audio_output = self.dia_model.generate(
- prompt,
- use_torch_compile=False, # Better stability
- verbose=False
- )
-
- # Ensure audio output is proper format
- if isinstance(audio_output, torch.Tensor):
- audio_output = audio_output.cpu().numpy()
-
- # Normalize output
- if len(audio_output) > 0:
- max_val = np.max(np.abs(audio_output))
- if max_val > 1.0:
- audio_output = audio_output / max_val * 0.95
-
- except Exception as e:
- print(f"TTS Error: {e}")
- return None, f"❌ Speech generation failed: {str(e)}", "Please try again"
-
- # Extract AI response text (remove speaker tags and modifiers)
- ai_response = prompt.split('[S2]')[-1].strip()
- ai_response = ai_response.replace('(cheerful)', '').replace('(sympathetic)', '')
- ai_response = ai_response.replace('(calming)', '').replace('(reassuring)', '')
- ai_response = ai_response.replace('(excited)', '').strip()
-
- # Store conversation turn
- turn = ConversationTurn(
- user_audio=audio,
- user_text=user_text,
- ai_response_text=ai_response,
- ai_response_audio=audio_output,
- timestamp=time.time(),
- emotion=emotion,
- speaker_id=session_id
- )
-
- self.conversation_manager.add_turn(session_id, turn)
-
- # Track performance
- processing_time = time.time() - start_time
- self.processing_times.append(processing_time)
-
- # Memory cleanup
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
- gc.collect()
-
- status = f"✅ Processed in {processing_time:.2f}s | Emotion: {emotion} | Users: {len(self.active_sessions)}"
-
- return (44100, audio_output), status, f"**You said:** {user_text}\n\n**AI Response:** {ai_response}"
-
+ result = self.ultravox_model({'array': audio, 'sampling_rate': sample_rate})
+ user_text = result.get('text', '').strip()
+ if not user_text:
+ return None, "Could not understand", "Try again"
  except Exception as e:
- print(f"Processing error: {e}")
- return None, f"❌ Processing failed: {str(e)}", "Please try again"
-
- def get_conversation_history(self, session_id: str) -> str:
- """Get formatted conversation history"""
- context = self.conversation_manager.get_context(session_id, last_n=10)
- if not context:
- return "No conversation history yet."
-
- history = "## Conversation History\n\n"
- for i, turn in enumerate(context, 1):
- history += f"**Turn {i}:**\n"
- history += f"- **You:** {turn.user_text}\n"
- history += f"- **AI:** {turn.ai_response_text}\n"
- history += f"- **Emotion Detected:** {turn.emotion}\n\n"
-
- return history
-
- def clear_conversation(self, session_id: str) -> str:
- """Clear conversation history for session"""
- self.conversation_manager.clear_session(session_id)
- return "Conversation history cleared."
-
- def get_system_status(self) -> str:
- """Get system status information"""
- memory = self._get_memory_usage()
- avg_processing = np.mean(self.processing_times) if self.processing_times else 0
-
- status = f"""## System Status
-
- **Performance:**
- - Average Processing Time: {avg_processing:.2f}s
- - Active Sessions: {len(self.active_sessions)}
- - Total Conversations: {len(self.conversation_manager.conversations)}

- **Memory Usage:**
- - RAM: {memory['RAM']:.1f}%
- - GPU Memory: {memory.get('GPU', {})}

- **Models Status:**
- - Models Loaded: {"✅" if self.models_loaded else "❌"}
- - Device: {self.device}
- """
- return status

- # Initialize the AI system
- print("Starting Supernatural AI system...")
- ai_system = SupernaturalAI()

- # Gradio Interface
- def process_audio_interface(audio, session_id):
- """Interface function for Gradio"""
- if not session_id:
- session_id = f"user_{int(time.time())}"
-
- ai_system.active_sessions.add(session_id)
- result = ai_system.process_audio_input(audio, session_id)
- return result + (session_id,)

- def get_history_interface(session_id):
- """Get conversation history interface"""
- if not session_id:
- return "No session ID provided"
- return ai_system.get_conversation_history(session_id)

- def clear_history_interface(session_id):
- """Clear history interface"""
- if not session_id:
- return "No session ID provided"
- return ai_system.clear_conversation(session_id)

- # Create Gradio interface
- with gr.Blocks(title="Supernatural Conversational AI", theme=gr.themes.Soft()) as demo:
- gr.HTML("""
- <div style="text-align: center; padding: 20px;">
- <h1>🧙‍♂️ Supernatural Conversational AI</h1>
- <p style="font-size: 18px; color: #666;">
- Advanced Speech-to-Speech AI with Emotional Intelligence
- </p>
- <p style="color: #888;">
- Powered by Ultravox + Dia TTS | Optimized for 4x L4 GPUs
- </p>
- </div>
- """)
-
- with gr.Row():
- with gr.Column(scale=2):
- # Audio input/output
- audio_input = gr.Audio(
- label="🎤 Speak to the AI",
- sources=["microphone"],
- type="numpy",
- streaming=False
- )
-
- audio_output = gr.Audio(
- label="🔊 AI Response",
- type="numpy",
- autoplay=True
- )
-
- # Session management
- session_id = gr.Textbox(
- label="Session ID",
- placeholder="Auto-generated if empty",
- value="",
- interactive=True
- )
-
- # Process button
- process_btn = gr.Button("🎯 Process Audio", variant="primary", size="lg")
-
- with gr.Column(scale=1):
- # Status and conversation
- status_display = gr.Textbox(
- label="📊 Status",
- interactive=False,
- lines=3
- )
-
- conversation_display = gr.Markdown(
- label="💬 Conversation",
- value="Start speaking to begin..."
- )
-
- # History management
- with gr.Row():
- history_btn = gr.Button("📜 Show History", size="sm")
- clear_btn = gr.Button("🗑️ Clear History", size="sm")
- status_btn = gr.Button("⚡ System Status", size="sm")
-
- # History and status display
- history_display = gr.Markdown(
- label="📚 Conversation History",
- value="No history yet."
- )
-
- # Event handlers
- process_btn.click(
- fn=process_audio_interface,
- inputs=[audio_input, session_id],
- outputs=[audio_output, status_display, conversation_display, session_id]
- )
-
- history_btn.click(
- fn=get_history_interface,
- inputs=[session_id],
- outputs=[history_display]
- )
-
- clear_btn.click(
- fn=clear_history_interface,
- inputs=[session_id],
- outputs=[history_display]
- )
-
- status_btn.click(
- fn=lambda: ai_system.get_system_status(),
- outputs=[history_display]
- )
-
- # Auto-process on audio input
- audio_input.change(
- fn=process_audio_interface,
- inputs=[audio_input, session_id],
- outputs=[audio_output, status_display, conversation_display, session_id]
- )
-
- # Usage instructions
- gr.HTML("""
- <div style="margin-top: 20px; padding: 15px; background: #f0f8ff; border-radius: 8px;">
- <h3>💡 Usage Instructions:</h3>
- <ul>
- <li><strong>Record Audio:</strong> Click the microphone and speak naturally</li>
- <li><strong>Emotional AI:</strong> The AI detects and responds to your emotions</li>
- <li><strong>Conversation Memory:</strong> Up to 50 exchanges are remembered</li>
- <li><strong>Session Management:</strong> Use Session ID to maintain separate conversations</li>
- <li><strong>Performance:</strong> Optimized for sub-500ms latency</li>
- </ul>
-
- <p><strong>Supported Features:</strong> Emotion recognition, voice activity detection,
- contextual responses, conversation history, concurrent users (15-20), memory management</p>
- </div>
- """)

- # Configure for optimal performance
- demo.queue(
- concurrency_count=20, # Support 20 concurrent users
- max_size=100,
- api_open=False
- )

- if __name__ == "__main__":
- demo.launch(
- server_name="0.0.0.0",
- server_port=7860,
- share=False,
- show_error=True,
- quiet=False,
- enable_queue=True,
- max_threads=40
- )
+ return None, f"ASR error: {e}", "Retry"

+ emotion = self.emotion_recognizer.detect_emotion(audio, sample_rate)
+ context = self.conversation_manager.get_context(session_id)
+ prompt = self._build_prompt(user_text, emotion, context)

+ try:
+ with torch.no_grad():
+ audio_out = self.dia_model.generate(prompt, use_torch_compile=False)
+ audio_out = audio_out.cpu().numpy() if isinstance(audio_out, torch.Tensor) else audio_out
+ except Exception as e:
+ return None, f"TTS error: {e}", "Retry"

+ ai_text = prompt.split('[S2]')[-1].strip()
+ turn = ConversationTurn(audio, user_text, ai_text, audio_out, time.time(), emotion, session_id)
+ self.conversation_manager.add_turn(session_id, turn)

+ elapsed = time.time() - start
+ self.processing_times.append(elapsed)
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ gc.collect()
+
+ status = f"Processed in {elapsed:.2f}s | Emotion: {emotion}"
+ return (44100, audio_out), status, f"You: {user_text}\n\nAI: {ai_text}"
+
+ def _build_prompt(self, text, emotion, context):
+ ctx = "".join(f"[U]{t.user_text}[A]{t.ai_response_text} " for t in context[-3:])
+ mods = {"happy":"(cheerful)","sad":"(sympathetic)","angry":"(calming)",
+ "fear":"(reassuring)","surprise":"(excited)","neutral":""}
+ return f"{ctx}[U]{text}[A]{mods.get(emotion,'')} As a supernatural AI, I sense your {emotion} energy. "
+
+ def get_history(self, session_id: str) -> str:
+ ctx = self.conversation_manager.get_context(session_id, last_n=10)
+ if not ctx:
+ return "No history."
+ out = ""
+ for i, t in enumerate(ctx,1):
+ out += f"Turn {i} — You: {t.user_text} | AI: {t.ai_response_text} | Emotion: {t.emotion}\n\n"
+ return out
+
+ def clear_history(self, session_id: str) -> str:
+ self.conversation_manager.clear_session(session_id)
+ return "History cleared."

+ # Instantiate and launch Gradio app
+ ai = SupernaturalAI()

+ with gr.Blocks() as demo:
+ audio_in = gr.Audio(source="microphone", type="numpy", label="Speak")
+ audio_out = gr.Audio(label="AI Response")
+ session = gr.Textbox(label="Session ID", interactive=True)
+ status = gr.Textbox(label="Status")
+ chat = gr.Markdown("## Conversation")

+ btn = gr.Button("Send")
+ btn.click(fn=lambda a, s: ai.process_audio_input(a, s),
+ inputs=[audio_in, session],
+ outputs=[audio_out, status, chat, session])
+
+ hist_btn = gr.Button("History")
+ hist_btn.click(fn=lambda s: ai.get_history(s), inputs=session, outputs=chat)

+ clr_btn = gr.Button("Clear")
+ clr_btn.click(fn=lambda s: ai.clear_history(s), inputs=session, outputs=chat)

+ demo.queue(concurrency_count=20, max_size=100)
+ demo.launch(server_name="0.0.0.0", server_port=7860, enable_queue=True)
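Note on the new `process_audio_input` above: the resampling call is now positional, `librosa.resample(audio, sample_rate, 16000)`, while the removed line passed `orig_sr=`/`target_sr=`. On librosa 0.10 and later those parameters are keyword-only, so the positional form raises a TypeError; the keyword form works on both old and new releases. A minimal sketch:

    import numpy as np
    import librosa

    audio = np.random.randn(44100).astype(np.float32)                   # 1 s at 44.1 kHz
    resampled = librosa.resample(audio, orig_sr=44100, target_sr=16000)
    print(resampled.shape)                                              # roughly (16000,)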
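
Note: the new ASR step relies on the pipeline returning a dict with a "text" key, which is why `result.get('text', '').strip()` is safe. A sketch of that input/output shape using a small stand-in checkpoint (openai/whisper-tiny.en is an assumption here, not the Ultravox model from the diff, which may additionally need trust_remote_code=True to load):

    from transformers import pipeline
    import numpy as np

    asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
    audio = np.zeros(16000, dtype=np.float32)                  # 1 s of silence at 16 kHz
    result = asr({"array": audio, "sampling_rate": 16000})     # same dict form as in the diff
    print(result.get("text", "").strip())                      # empty-ish for silence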
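
Note: `_build_prompt` now emits `[U]`/`[A]` tags, but the later `prompt.split('[S2]')[-1]` still looks for the `[S2]` tag that the removed `_generate_contextual_prompt` used, so the split never matches and `ai_text` ends up being the whole prompt. A hypothetical sketch that keeps the tags and the split consistent ([S1]/[S2] dialogue tags, as the removed Dia prompt used):

    def build_prompt(text, emotion, context, mods):
        ctx = "".join(f"[S1] {t.user_text} [S2] {t.ai_response_text} " for t in context[-3:])
        return f"{ctx}[S1] {text} [S2] {mods.get(emotion, '')} I sense your {emotion} energy. "

    mods = {"happy": "(cheerful)", "neutral": ""}
    prompt = build_prompt("Hello there", "happy", [], mods)
    ai_text = prompt.split("[S2]")[-1].strip()   # now isolates just the reply text
    print(ai_text)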
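
Note: the new code constructs `ConversationTurn(audio, user_text, ai_text, audio_out, time.time(), emotion, session_id)` positionally, whereas the removed code used keyword arguments. That only works if the fields are declared in exactly this order; the dataclass definition sits above the first hunk and is not shown, so the sketch below is an assumption to check against the real declaration:

    from dataclasses import dataclass
    import numpy as np

    @dataclass
    class ConversationTurn:
        user_audio: np.ndarray
        user_text: str
        ai_response_text: str
        ai_response_audio: np.ndarray
        timestamp: float
        emotion: str
        speaker_id: str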
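
Note: `btn.click(...)` lists four outputs (`audio_out, status, chat, session`) but `process_audio_input` returns three values, and the removed `process_audio_interface` wrapper that auto-generated a session id and appended it is gone, so Gradio will complain about the return-value/output mismatch at runtime. A hypothetical wrapper restoring that behaviour:

    import time

    def process_with_session(audio, session_id):
        sid = session_id or f"user_{int(time.time())}"     # auto-generate, like the removed wrapper
        out_audio, out_status, out_chat = ai.process_audio_input(audio, sid)
        return out_audio, out_status, out_chat, sid

    # btn.click(fn=process_with_session, inputs=[audio_in, session],
    #           outputs=[audio_out, status, chat, session])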
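
Note: with `type="numpy"`, a `gr.Audio` component exchanges values as a `(sample_rate, np.ndarray)` tuple, which is why the handler returns `(44100, audio_out)`, the same sample rate the removed code used for the generated waveform. A toy stand-in value:

    import numpy as np

    sr = 44100
    t = np.linspace(0, 1, sr, endpoint=False)
    wave = (0.3 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)   # 1 s, 440 Hz tone
    gradio_value = (sr, wave)   # what a numpy-typed gr.Audio component receives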
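
Note: the closing lines mix Gradio API generations. `gr.Audio(source=...)`, `demo.queue(concurrency_count=...)` and `demo.launch(enable_queue=True)` are Gradio 3.x spellings, while the removed code's `sources=["microphone"]` is the 4.x form, where `concurrency_count` became `default_concurrency_limit` and `enable_queue` was dropped from `launch()`. Which spelling is correct depends on the Gradio version pinned for this Space; a sketch of the 4.x equivalent, under that assumption:

    import gradio as gr

    with gr.Blocks() as demo:
        audio_in = gr.Audio(sources=["microphone"], type="numpy", label="Speak")
        audio_out = gr.Audio(label="AI Response")

    demo.queue(default_concurrency_limit=20, max_size=100)
    demo.launch(server_name="0.0.0.0", server_port=7860)   # queueing is enabled by queue() itself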