Spaces:
Building
Building
Update websocket_handler.py
Browse files- websocket_handler.py +125 -121
websocket_handler.py
CHANGED
@@ -12,13 +12,13 @@ from enum import Enum
|
|
12 |
import numpy as np
|
13 |
import traceback
|
14 |
|
15 |
-
from realtime_session_manager import send_tts_welcome_message
|
16 |
from session import Session, session_store
|
17 |
from config_provider import ConfigProvider
|
18 |
from chat_handler import handle_new_message, handle_parameter_followup
|
19 |
from stt_factory import STTFactory
|
20 |
from tts_factory import TTSFactory
|
21 |
from logger import log_info, log_error, log_debug, log_warning
|
|
|
22 |
|
23 |
# ========================= CONSTANTS =========================
|
24 |
# Default values - will be overridden by config
|
@@ -180,39 +180,21 @@ class RealtimeSession:
|
|
180 |
"""Initialize STT provider"""
|
181 |
try:
|
182 |
self.stt_manager = STTFactory.create_provider()
|
183 |
-
if self.stt_manager
|
184 |
config = ConfigProvider.get().global_config.stt_provider.settings
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
sample_rate=16000,
|
194 |
-
encoding="WEBM_OPUS",
|
195 |
-
model=config.get("model", "latest_long"),
|
196 |
-
use_enhanced=config.get("use_enhanced", True),
|
197 |
-
# Voice Activity Detection
|
198 |
-
vad_enabled=True,
|
199 |
-
speech_timeout_ms=config.get("speech_timeout_ms", 2000),
|
200 |
-
# Noise reduction
|
201 |
-
noise_reduction_enabled=True,
|
202 |
-
noise_reduction_level=config.get("noise_reduction_level", 2)
|
203 |
-
)
|
204 |
-
|
205 |
-
await self.stt_manager.start_streaming(stt_config)
|
206 |
log_info("STT manager initialized", session_id=self.session.session_id)
|
207 |
return True
|
208 |
-
else:
|
209 |
-
log_warning("STT provider does not support realtime", session_id=self.session.session_id)
|
210 |
-
return False
|
211 |
except Exception as e:
|
212 |
log_error(f"Failed to initialize STT", error=str(e), session_id=self.session.session_id)
|
213 |
-
|
214 |
-
self.stt_manager = None
|
215 |
-
return False
|
216 |
|
217 |
async def change_state(self, new_state: ConversationState):
|
218 |
"""Change conversation state"""
|
@@ -248,12 +230,15 @@ class RealtimeSession:
|
|
248 |
# ========================= MAIN HANDLER =========================
|
249 |
async def websocket_endpoint(websocket: WebSocket, session_id: str):
|
250 |
"""Main WebSocket endpoint for real-time conversation"""
|
|
|
|
|
251 |
await websocket.accept()
|
252 |
-
log_info(f"WebSocket
|
253 |
|
254 |
# Get session
|
255 |
session = session_store.get_session(session_id)
|
256 |
if not session:
|
|
|
257 |
await websocket.send_json({
|
258 |
"type": "error",
|
259 |
"message": "Session not found"
|
@@ -261,6 +246,8 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
|
|
261 |
await websocket.close()
|
262 |
return
|
263 |
|
|
|
|
|
264 |
# Mark as realtime session
|
265 |
session.is_realtime = True
|
266 |
session_store.update_session(session)
|
@@ -269,54 +256,59 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
|
|
269 |
realtime_session = RealtimeSession(session)
|
270 |
|
271 |
# Initialize STT
|
|
|
272 |
stt_initialized = await realtime_session.initialize_stt()
|
273 |
if not stt_initialized:
|
|
|
274 |
await websocket.send_json({
|
275 |
"type": "error",
|
276 |
"message": "STT initialization failed"
|
277 |
})
|
|
|
|
|
278 |
|
279 |
-
#
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
welcome_text = msg.get('content', '')
|
289 |
-
break
|
290 |
-
|
291 |
-
if welcome_text:
|
292 |
-
await send_tts_welcome_message(
|
293 |
-
websocket,
|
294 |
-
session_id,
|
295 |
-
tts_provider,
|
296 |
-
welcome_text
|
297 |
-
)
|
298 |
|
299 |
try:
|
300 |
while True:
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
|
308 |
-
|
309 |
-
|
310 |
|
311 |
-
|
312 |
-
|
313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
314 |
|
315 |
-
except WebSocketDisconnect:
|
316 |
-
log_info(f"WebSocket disconnected", session_id=session_id)
|
317 |
except Exception as e:
|
318 |
log_error(
|
319 |
-
f"WebSocket error",
|
320 |
error=str(e),
|
321 |
traceback=traceback.format_exc(),
|
322 |
session_id=session_id
|
@@ -326,6 +318,7 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
|
|
326 |
"message": str(e)
|
327 |
})
|
328 |
finally:
|
|
|
329 |
await realtime_session.cleanup()
|
330 |
|
331 |
|
@@ -335,18 +328,9 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
|
|
335 |
try:
|
336 |
audio_data = message.get("data")
|
337 |
if not audio_data:
|
|
|
338 |
return
|
339 |
-
|
340 |
-
# STT manager kontrolü
|
341 |
-
if not session.stt_manager:
|
342 |
-
log_warning("No STT manager available, ignoring audio chunk", session_id=session.session.session_id)
|
343 |
-
await websocket.send_json({
|
344 |
-
"type": "error",
|
345 |
-
"message": "Speech recognition not available",
|
346 |
-
"error_type": "stt_unavailable"
|
347 |
-
})
|
348 |
-
return
|
349 |
-
|
350 |
# Check for barge-in during TTS/audio playback
|
351 |
if session.state in [ConversationState.PLAYING_AUDIO, ConversationState.PROCESSING_TTS]:
|
352 |
await session.handle_barge_in()
|
@@ -354,7 +338,7 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
|
|
354 |
"type": "control",
|
355 |
"action": "stop_playback"
|
356 |
})
|
357 |
-
log_info(f"Barge-in detected", session_id=session.session.session_id, state=session.state.value)
|
358 |
|
359 |
# Change state to listening if idle
|
360 |
if session.state == ConversationState.IDLE:
|
@@ -387,11 +371,12 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
|
|
387 |
|
388 |
if result.is_final:
|
389 |
session.current_transcription = result.text
|
|
|
390 |
|
391 |
# Process if silence detected and we have transcription
|
392 |
if silence_duration > session.silence_threshold_ms and session.current_transcription:
|
393 |
log_info(
|
394 |
-
f"User stopped speaking",
|
395 |
session_id=session.session.session_id,
|
396 |
silence_ms=silence_duration,
|
397 |
text=session.current_transcription
|
@@ -400,7 +385,7 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
|
|
400 |
|
401 |
except Exception as e:
|
402 |
log_error(
|
403 |
-
f"Audio chunk handling error",
|
404 |
error=str(e),
|
405 |
traceback=traceback.format_exc(),
|
406 |
session_id=session.session.session_id
|
@@ -416,12 +401,12 @@ async def handle_control_message(websocket: WebSocket, session: RealtimeSession,
|
|
416 |
action = message.get("action")
|
417 |
config = message.get("config", {})
|
418 |
|
419 |
-
log_debug(f"Control message", action=action, session_id=session.session.session_id)
|
420 |
|
421 |
if action == "start_session":
|
422 |
# Session configuration
|
423 |
await websocket.send_json({
|
424 |
-
"type": "
|
425 |
"session_id": session.session.session_id,
|
426 |
"config": {
|
427 |
"silence_threshold_ms": session.silence_threshold_ms,
|
@@ -430,38 +415,38 @@ async def handle_control_message(websocket: WebSocket, session: RealtimeSession,
|
|
430 |
}
|
431 |
})
|
432 |
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
|
466 |
|
467 |
# ========================= PROCESSING FUNCTIONS =========================
|
@@ -470,11 +455,12 @@ async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|
|
470 |
try:
|
471 |
user_text = session.current_transcription
|
472 |
if not user_text:
|
|
|
473 |
await session.reset_for_new_utterance()
|
474 |
await session.change_state(ConversationState.IDLE)
|
475 |
return
|
476 |
|
477 |
-
log_info(f"Processing user input", text=user_text, session_id=session.session.session_id)
|
478 |
|
479 |
# State: STT Processing
|
480 |
await session.change_state(ConversationState.PROCESSING_STT)
|
@@ -504,11 +490,15 @@ async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|
|
504 |
session.session.add_message("user", user_text)
|
505 |
|
506 |
# Get LLM response based on session state
|
|
|
|
|
507 |
if session.session.state == "collect_params":
|
508 |
response_text = await handle_parameter_followup(session.session, user_text)
|
509 |
else:
|
510 |
response_text = await handle_new_message(session.session, user_text)
|
511 |
|
|
|
|
|
512 |
# Add response to history
|
513 |
session.session.add_message("assistant", response_text)
|
514 |
|
@@ -529,14 +519,14 @@ async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|
|
529 |
})
|
530 |
|
531 |
# Generate TTS with barge-in support
|
532 |
-
tts_task = session.barge_in_handler.start_tts_task(
|
533 |
generate_and_stream_tts(websocket, session, tts_provider, response_text)
|
534 |
)
|
535 |
|
536 |
try:
|
537 |
await tts_task
|
538 |
except asyncio.CancelledError:
|
539 |
-
log_info("TTS cancelled due to barge-in", session_id=session.session.session_id)
|
540 |
else:
|
541 |
# No TTS, go back to idle
|
542 |
await session.change_state(ConversationState.IDLE)
|
@@ -551,7 +541,7 @@ async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|
|
551 |
|
552 |
except Exception as e:
|
553 |
log_error(
|
554 |
-
f"Error processing user input",
|
555 |
error=str(e),
|
556 |
traceback=traceback.format_exc(),
|
557 |
session_id=session.session.session_id
|
@@ -572,8 +562,11 @@ async def generate_and_stream_tts(
|
|
572 |
):
|
573 |
"""Generate and stream TTS audio with cancellation support"""
|
574 |
try:
|
|
|
|
|
575 |
# Generate audio
|
576 |
audio_data = await tts_provider.synthesize(text)
|
|
|
577 |
|
578 |
# Change state to playing
|
579 |
await session.change_state(ConversationState.PLAYING_AUDIO)
|
@@ -585,29 +578,38 @@ async def generate_and_stream_tts(
|
|
585 |
|
586 |
# Convert entire audio to base64 for transmission
|
587 |
import base64
|
|
|
588 |
audio_base64 = base64.b64encode(audio_data).decode('utf-8')
|
|
|
|
|
|
|
|
|
589 |
|
590 |
# Stream audio in chunks
|
591 |
chunk_size = 16384 # Larger chunk size for base64
|
592 |
total_length = len(audio_base64)
|
593 |
total_chunks = (total_length + chunk_size - 1) // chunk_size
|
594 |
|
595 |
-
log_info(f"Streaming TTS audio: {len(audio_data)} bytes as {total_length} base64 chars in {total_chunks} chunks")
|
596 |
|
597 |
for i in range(0, total_length, chunk_size):
|
598 |
# Check for cancellation
|
599 |
if asyncio.current_task().cancelled():
|
|
|
600 |
break
|
601 |
|
602 |
chunk = audio_base64[i:i + chunk_size]
|
603 |
chunk_index = i // chunk_size
|
|
|
|
|
|
|
604 |
|
605 |
await websocket.send_json({
|
606 |
"type": "tts_audio",
|
607 |
"data": chunk,
|
608 |
"chunk_index": chunk_index,
|
609 |
"total_chunks": total_chunks,
|
610 |
-
"is_last":
|
611 |
"mime_type": "audio/mpeg"
|
612 |
})
|
613 |
|
@@ -615,19 +617,21 @@ async def generate_and_stream_tts(
|
|
615 |
await asyncio.sleep(0.01)
|
616 |
|
617 |
log_info(
|
618 |
-
f"TTS streaming completed",
|
619 |
session_id=session.session.session_id,
|
620 |
text_length=len(text),
|
621 |
-
audio_size=len(audio_data)
|
|
|
622 |
)
|
623 |
|
624 |
except asyncio.CancelledError:
|
625 |
-
log_info("TTS streaming cancelled", session_id=session.session.session_id)
|
626 |
raise
|
627 |
except Exception as e:
|
628 |
log_error(
|
629 |
-
f"TTS generation error",
|
630 |
error=str(e),
|
|
|
631 |
session_id=session.session.session_id
|
632 |
)
|
633 |
await websocket.send_json({
|
|
|
12 |
import numpy as np
|
13 |
import traceback
|
14 |
|
|
|
15 |
from session import Session, session_store
|
16 |
from config_provider import ConfigProvider
|
17 |
from chat_handler import handle_new_message, handle_parameter_followup
|
18 |
from stt_factory import STTFactory
|
19 |
from tts_factory import TTSFactory
|
20 |
from logger import log_info, log_error, log_debug, log_warning
|
21 |
+
from realtime_session_manager import send_tts_welcome_message
|
22 |
|
23 |
# ========================= CONSTANTS =========================
|
24 |
# Default values - will be overridden by config
|
|
|
180 |
"""Initialize STT provider"""
|
181 |
try:
|
182 |
self.stt_manager = STTFactory.create_provider()
|
183 |
+
if self.stt_manager:
|
184 |
config = ConfigProvider.get().global_config.stt_provider.settings
|
185 |
+
await self.stt_manager.start_streaming({
|
186 |
+
"language": config.get("language", "tr-TR"),
|
187 |
+
"interim_results": config.get("interim_results", True),
|
188 |
+
"single_utterance": False,
|
189 |
+
"enable_punctuation": config.get("enable_punctuation", True),
|
190 |
+
"sample_rate": 16000,
|
191 |
+
"encoding": "WEBM_OPUS"
|
192 |
+
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
log_info("STT manager initialized", session_id=self.session.session_id)
|
194 |
return True
|
|
|
|
|
|
|
195 |
except Exception as e:
|
196 |
log_error(f"Failed to initialize STT", error=str(e), session_id=self.session.session_id)
|
197 |
+
return False
|
|
|
|
|
198 |
|
199 |
async def change_state(self, new_state: ConversationState):
|
200 |
"""Change conversation state"""
|
|
|
230 |
# ========================= MAIN HANDLER =========================
|
231 |
async def websocket_endpoint(websocket: WebSocket, session_id: str):
|
232 |
"""Main WebSocket endpoint for real-time conversation"""
|
233 |
+
log_info(f"🔌 WebSocket connection attempt", session_id=session_id)
|
234 |
+
|
235 |
await websocket.accept()
|
236 |
+
log_info(f"✅ WebSocket accepted", session_id=session_id)
|
237 |
|
238 |
# Get session
|
239 |
session = session_store.get_session(session_id)
|
240 |
if not session:
|
241 |
+
log_error(f"❌ Session not found", session_id=session_id)
|
242 |
await websocket.send_json({
|
243 |
"type": "error",
|
244 |
"message": "Session not found"
|
|
|
246 |
await websocket.close()
|
247 |
return
|
248 |
|
249 |
+
log_info(f"✅ Session found", session_id=session_id, project=session.project_name)
|
250 |
+
|
251 |
# Mark as realtime session
|
252 |
session.is_realtime = True
|
253 |
session_store.update_session(session)
|
|
|
256 |
realtime_session = RealtimeSession(session)
|
257 |
|
258 |
# Initialize STT
|
259 |
+
log_info(f"🎤 Initializing STT...", session_id=session_id)
|
260 |
stt_initialized = await realtime_session.initialize_stt()
|
261 |
if not stt_initialized:
|
262 |
+
log_error(f"❌ STT initialization failed", session_id=session_id)
|
263 |
await websocket.send_json({
|
264 |
"type": "error",
|
265 |
"message": "STT initialization failed"
|
266 |
})
|
267 |
+
else:
|
268 |
+
log_info(f"✅ STT initialized", session_id=session_id)
|
269 |
|
270 |
+
# Send session started confirmation
|
271 |
+
await websocket.send_json({
|
272 |
+
"type": "session_started",
|
273 |
+
"session_id": session_id,
|
274 |
+
"stt_initialized": stt_initialized
|
275 |
+
})
|
276 |
+
|
277 |
+
# Don't send welcome TTS here - it's already sent by the frontend
|
278 |
+
log_info(f"💬 Ready for conversation", session_id=session_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
279 |
|
280 |
try:
|
281 |
while True:
|
282 |
+
try:
|
283 |
+
# Receive message with timeout
|
284 |
+
message = await asyncio.wait_for(
|
285 |
+
websocket.receive_json(),
|
286 |
+
timeout=60.0 # 60 second timeout
|
287 |
+
)
|
288 |
|
289 |
+
message_type = message.get("type")
|
290 |
+
log_debug(f"📨 Received message type: {message_type}", session_id=session_id)
|
291 |
|
292 |
+
if message_type == "audio_chunk":
|
293 |
+
await handle_audio_chunk(websocket, realtime_session, message)
|
294 |
+
|
295 |
+
elif message_type == "control":
|
296 |
+
await handle_control_message(websocket, realtime_session, message)
|
297 |
+
|
298 |
+
elif message_type == "ping":
|
299 |
+
# Keep-alive ping
|
300 |
+
await websocket.send_json({"type": "pong"})
|
301 |
+
log_debug(f"🏓 Ping-pong", session_id=session_id)
|
302 |
+
|
303 |
+
except asyncio.TimeoutError:
|
304 |
+
log_warning(f"⏱️ WebSocket timeout - sending ping", session_id=session_id)
|
305 |
+
await websocket.send_json({"type": "ping"})
|
306 |
|
307 |
+
except WebSocketDisconnect as e:
|
308 |
+
log_info(f"🔌 WebSocket disconnected", session_id=session_id, code=e.code, reason=e.reason)
|
309 |
except Exception as e:
|
310 |
log_error(
|
311 |
+
f"❌ WebSocket error",
|
312 |
error=str(e),
|
313 |
traceback=traceback.format_exc(),
|
314 |
session_id=session_id
|
|
|
318 |
"message": str(e)
|
319 |
})
|
320 |
finally:
|
321 |
+
log_info(f"🧹 Cleaning up WebSocket connection", session_id=session_id)
|
322 |
await realtime_session.cleanup()
|
323 |
|
324 |
|
|
|
328 |
try:
|
329 |
audio_data = message.get("data")
|
330 |
if not audio_data:
|
331 |
+
log_warning(f"⚠️ Empty audio chunk received", session_id=session.session.session_id)
|
332 |
return
|
333 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
334 |
# Check for barge-in during TTS/audio playback
|
335 |
if session.state in [ConversationState.PLAYING_AUDIO, ConversationState.PROCESSING_TTS]:
|
336 |
await session.handle_barge_in()
|
|
|
338 |
"type": "control",
|
339 |
"action": "stop_playback"
|
340 |
})
|
341 |
+
log_info(f"🛑 Barge-in detected", session_id=session.session.session_id, state=session.state.value)
|
342 |
|
343 |
# Change state to listening if idle
|
344 |
if session.state == ConversationState.IDLE:
|
|
|
371 |
|
372 |
if result.is_final:
|
373 |
session.current_transcription = result.text
|
374 |
+
log_info(f"📝 Final transcription: {result.text}", session_id=session.session.session_id)
|
375 |
|
376 |
# Process if silence detected and we have transcription
|
377 |
if silence_duration > session.silence_threshold_ms and session.current_transcription:
|
378 |
log_info(
|
379 |
+
f"🔇 User stopped speaking",
|
380 |
session_id=session.session.session_id,
|
381 |
silence_ms=silence_duration,
|
382 |
text=session.current_transcription
|
|
|
385 |
|
386 |
except Exception as e:
|
387 |
log_error(
|
388 |
+
f"❌ Audio chunk handling error",
|
389 |
error=str(e),
|
390 |
traceback=traceback.format_exc(),
|
391 |
session_id=session.session.session_id
|
|
|
401 |
action = message.get("action")
|
402 |
config = message.get("config", {})
|
403 |
|
404 |
+
log_debug(f"🎮 Control message", action=action, session_id=session.session.session_id)
|
405 |
|
406 |
if action == "start_session":
|
407 |
# Session configuration
|
408 |
await websocket.send_json({
|
409 |
+
"type": "session_config",
|
410 |
"session_id": session.session.session_id,
|
411 |
"config": {
|
412 |
"silence_threshold_ms": session.silence_threshold_ms,
|
|
|
415 |
}
|
416 |
})
|
417 |
|
418 |
+
elif action == "end_session" or action == "stop_session":
|
419 |
+
# Clean up and close
|
420 |
+
await session.cleanup()
|
421 |
+
await websocket.close()
|
422 |
+
|
423 |
+
elif action == "interrupt":
|
424 |
+
# Handle explicit interrupt
|
425 |
+
await session.handle_barge_in()
|
426 |
+
await websocket.send_json({
|
427 |
+
"type": "control",
|
428 |
+
"action": "interrupt_acknowledged"
|
429 |
+
})
|
430 |
+
|
431 |
+
elif action == "reset":
|
432 |
+
# Reset conversation state
|
433 |
+
await session.reset_for_new_utterance()
|
434 |
+
await session.change_state(ConversationState.IDLE)
|
435 |
+
await websocket.send_json({
|
436 |
+
"type": "state_change",
|
437 |
+
"from": session.state.value,
|
438 |
+
"to": "idle"
|
439 |
+
})
|
440 |
+
|
441 |
+
elif action == "audio_ended":
|
442 |
+
# Audio playback ended on client
|
443 |
+
if session.state == ConversationState.PLAYING_AUDIO:
|
444 |
+
await session.change_state(ConversationState.IDLE)
|
445 |
+
await websocket.send_json({
|
446 |
+
"type": "state_change",
|
447 |
+
"from": "playing_audio",
|
448 |
+
"to": "idle"
|
449 |
+
})
|
450 |
|
451 |
|
452 |
# ========================= PROCESSING FUNCTIONS =========================
|
|
|
455 |
try:
|
456 |
user_text = session.current_transcription
|
457 |
if not user_text:
|
458 |
+
log_warning(f"⚠️ Empty transcription, resetting", session_id=session.session.session_id)
|
459 |
await session.reset_for_new_utterance()
|
460 |
await session.change_state(ConversationState.IDLE)
|
461 |
return
|
462 |
|
463 |
+
log_info(f"🎯 Processing user input", text=user_text, session_id=session.session.session_id)
|
464 |
|
465 |
# State: STT Processing
|
466 |
await session.change_state(ConversationState.PROCESSING_STT)
|
|
|
490 |
session.session.add_message("user", user_text)
|
491 |
|
492 |
# Get LLM response based on session state
|
493 |
+
log_info(f"🤖 Getting LLM response", session_state=session.session.state, session_id=session.session.session_id)
|
494 |
+
|
495 |
if session.session.state == "collect_params":
|
496 |
response_text = await handle_parameter_followup(session.session, user_text)
|
497 |
else:
|
498 |
response_text = await handle_new_message(session.session, user_text)
|
499 |
|
500 |
+
log_info(f"💬 LLM response: {response_text[:50]}...", session_id=session.session.session_id)
|
501 |
+
|
502 |
# Add response to history
|
503 |
session.session.add_message("assistant", response_text)
|
504 |
|
|
|
519 |
})
|
520 |
|
521 |
# Generate TTS with barge-in support
|
522 |
+
tts_task = await session.barge_in_handler.start_tts_task(
|
523 |
generate_and_stream_tts(websocket, session, tts_provider, response_text)
|
524 |
)
|
525 |
|
526 |
try:
|
527 |
await tts_task
|
528 |
except asyncio.CancelledError:
|
529 |
+
log_info("⚡ TTS cancelled due to barge-in", session_id=session.session.session_id)
|
530 |
else:
|
531 |
# No TTS, go back to idle
|
532 |
await session.change_state(ConversationState.IDLE)
|
|
|
541 |
|
542 |
except Exception as e:
|
543 |
log_error(
|
544 |
+
f"❌ Error processing user input",
|
545 |
error=str(e),
|
546 |
traceback=traceback.format_exc(),
|
547 |
session_id=session.session.session_id
|
|
|
562 |
):
|
563 |
"""Generate and stream TTS audio with cancellation support"""
|
564 |
try:
|
565 |
+
log_info(f"🎤 Starting TTS generation for text: '{text[:50]}...'", session_id=session.session.session_id)
|
566 |
+
|
567 |
# Generate audio
|
568 |
audio_data = await tts_provider.synthesize(text)
|
569 |
+
log_info(f"✅ TTS generated: {len(audio_data)} bytes, type: {type(audio_data)}", session_id=session.session.session_id)
|
570 |
|
571 |
# Change state to playing
|
572 |
await session.change_state(ConversationState.PLAYING_AUDIO)
|
|
|
578 |
|
579 |
# Convert entire audio to base64 for transmission
|
580 |
import base64
|
581 |
+
log_debug(f"📦 Converting audio to base64...")
|
582 |
audio_base64 = base64.b64encode(audio_data).decode('utf-8')
|
583 |
+
log_info(f"📊 Base64 conversion complete: {len(audio_base64)} chars from {len(audio_data)} bytes", session_id=session.session.session_id)
|
584 |
+
|
585 |
+
# Log first 100 chars of base64 to verify it's valid
|
586 |
+
log_debug(f"🔍 Base64 preview: {audio_base64[:100]}...")
|
587 |
|
588 |
# Stream audio in chunks
|
589 |
chunk_size = 16384 # Larger chunk size for base64
|
590 |
total_length = len(audio_base64)
|
591 |
total_chunks = (total_length + chunk_size - 1) // chunk_size
|
592 |
|
593 |
+
log_info(f"📤 Streaming TTS audio: {len(audio_data)} bytes as {total_length} base64 chars in {total_chunks} chunks", session_id=session.session.session_id)
|
594 |
|
595 |
for i in range(0, total_length, chunk_size):
|
596 |
# Check for cancellation
|
597 |
if asyncio.current_task().cancelled():
|
598 |
+
log_info(f"⚡ Streaming cancelled at chunk {i//chunk_size}", session_id=session.session.session_id)
|
599 |
break
|
600 |
|
601 |
chunk = audio_base64[i:i + chunk_size]
|
602 |
chunk_index = i // chunk_size
|
603 |
+
is_last = chunk_index == total_chunks - 1
|
604 |
+
|
605 |
+
log_debug(f"📨 Sending chunk {chunk_index}/{total_chunks}, size: {len(chunk)}, is_last: {is_last}")
|
606 |
|
607 |
await websocket.send_json({
|
608 |
"type": "tts_audio",
|
609 |
"data": chunk,
|
610 |
"chunk_index": chunk_index,
|
611 |
"total_chunks": total_chunks,
|
612 |
+
"is_last": is_last,
|
613 |
"mime_type": "audio/mpeg"
|
614 |
})
|
615 |
|
|
|
617 |
await asyncio.sleep(0.01)
|
618 |
|
619 |
log_info(
|
620 |
+
f"✅ TTS streaming completed successfully",
|
621 |
session_id=session.session.session_id,
|
622 |
text_length=len(text),
|
623 |
+
audio_size=len(audio_data),
|
624 |
+
chunks_sent=total_chunks
|
625 |
)
|
626 |
|
627 |
except asyncio.CancelledError:
|
628 |
+
log_info("🛑 TTS streaming cancelled", session_id=session.session.session_id)
|
629 |
raise
|
630 |
except Exception as e:
|
631 |
log_error(
|
632 |
+
f"❌ TTS generation error",
|
633 |
error=str(e),
|
634 |
+
traceback=traceback.format_exc(),
|
635 |
session_id=session.session.session_id
|
636 |
)
|
637 |
await websocket.send_json({
|