Spaces:
Building
Building
Update websocket_handler.py
Browse files- websocket_handler.py +115 -115
websocket_handler.py
CHANGED
@@ -369,7 +369,7 @@ class RealtimeSession:
|
|
369 |
log_info(f"✅ Reset for new utterance complete", session_id=self.session.session_id)
|
370 |
|
371 |
|
372 |
-
# =========================
|
373 |
async def handle_control_message(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
|
374 |
"""Handle control messages"""
|
375 |
action = message.get("action")
|
@@ -456,6 +456,120 @@ async def handle_control_message(websocket: WebSocket, session: RealtimeSession,
|
|
456 |
await session.stop_stt_streaming()
|
457 |
await session.restart_stt_if_needed()
|
458 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
459 |
# ========================= MAIN HANDLER =========================
|
460 |
async def websocket_endpoint(websocket: WebSocket, session_id: str):
|
461 |
"""Main WebSocket endpoint for real-time conversation"""
|
@@ -674,120 +788,6 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
|
|
674 |
except Exception as e:
|
675 |
log_debug(f"WebSocket already closed or error during close: {e}", session_id=session_id)
|
676 |
|
677 |
-
# ========================= MESSAGE HANDLERS =========================
|
678 |
-
async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]) -> None:
    """Validate one incoming audio chunk, buffer it, and stream it to STT.

    The chunk is dropped (with a log entry) unless the WebSocket is still
    active, the payload is non-empty, and the session is in the LISTENING
    state. If STT is not available or not streaming, an ``stt_not_ready``
    error is sent to the client instead. Otherwise the chunk is added to the
    session audio buffer, decoded from base64, passed to the silence detector
    and streamed to the STT manager. The first non-empty final transcription
    stops STT streaming, moves the session to PROCESSING_STT and hands over
    to ``process_user_input``.

    Args:
        websocket: Client connection used to send transcription,
            state-change and error messages.
        session: Realtime session whose state, buffers and STT manager are
            read and updated.
        message: Client message; its ``"data"`` entry is read as the
            base64-encoded audio chunk.

    Errors raised while streaming to STT or while handling the chunk are
    caught, logged and reported to the client as ``stt_error`` /
    ``audio_error`` messages; recognised STT timeouts are only logged.
    """
    try:
        # Do nothing once the WebSocket has been closed.
        if not session.is_websocket_active:
            return

        audio_data = message.get("data")
        if not audio_data:
            log_warning("⚠️ Empty audio chunk received", session_id=session.session.session_id)
            return

        # Ignore audio chunks entirely while STT/LLM/TTS processing or
        # audio playback is in progress.
        if session.state in (
            ConversationState.PLAYING_AUDIO,
            ConversationState.PROCESSING_TTS,
            ConversationState.PROCESSING_LLM,
            ConversationState.PROCESSING_STT,
        ):
            log_debug(f"🔇 Ignoring audio chunk during state: {session.state.value}", session_id=session.session.session_id)
            return

        # Audio is only processed in the LISTENING state.
        if session.state != ConversationState.LISTENING:
            log_warning(f"⚠️ Audio received in unexpected state: {session.state.value}", session_id=session.session.session_id)
            return

        # Report an error if there is no STT manager or it is not streaming.
        if not session.stt_manager or not session.is_streaming:
            log_warning("⚠️ STT not ready, attempting to restart", session_id=session.session.session_id)
            await websocket.send_json({
                "type": "error",
                "error_type": "stt_not_ready",
                "message": "STT is not ready. Waiting for initialization..."
            })
            return

        # The buffer receives the still-encoded payload.
        await session.audio_buffer.add_chunk(audio_data)

        # Decode for processing.
        decoded_audio = base64.b64decode(audio_data)

        # Feed the silence detector. Its return value is not used here; the
        # call is kept because it presumably updates the detector's state.
        session.silence_detector.update(decoded_audio)

        # Stream to STT.
        try:
            session.chunk_counter += 1

            if session.chunk_counter == 1:
                log_info("🎤 Started streaming audio to STT", session_id=session.session.session_id)
                # Check the container format on the first chunk only:
                # 1A 45 DF A3 is the EBML magic that opens a WebM stream.
                if len(decoded_audio) >= 4:
                    if decoded_audio[:4] == b'\x1a\x45\xdf\xa3':
                        log_info("✅ Valid WEBM header detected", session_id=session.session.session_id)
                    else:
                        log_warning(f"⚠️ Unknown audio format, first 4 bytes: {decoded_audio[:4].hex()}", session_id=session.session.session_id)
            elif session.chunk_counter % 100 == 0:
                log_info(f"📊 Sent {session.chunk_counter} chunks to STT so far...", session_id=session.session.session_id)

            # Send the chunk to STT and consume the results it yields.
            async for result in session.stt_manager.stream_audio(decoded_audio):
                # Only final results are processed; interim ones are skipped.
                if not result.is_final:
                    continue

                log_info(f"✅ FINAL TRANSCRIPTION: '{result.text}'", session_id=session.session.session_id)

                # Send ONLY the final transcription to the frontend.
                await websocket.send_json({
                    "type": "transcription",
                    "text": result.text,
                    "is_final": True,
                    "confidence": result.confidence
                })

                session.current_transcription = result.text

                # On a non-empty final transcription, stop STT and process it.
                if session.current_transcription:
                    # Stop STT first.
                    await session.stop_stt_streaming()

                    # Then switch state and tell the client.
                    await session.change_state(ConversationState.PROCESSING_STT)
                    await websocket.send_json({
                        "type": "state_change",
                        "from": "listening",
                        "to": "processing_stt"
                    })

                    # Process user input.
                    await process_user_input(websocket, session)
                    return

        except Exception as e:
            error_msg = str(e)
            # Detect Google STT timeout errors. "stream duration" also
            # matches "Exceeded maximum allowed stream duration".
            timeout_markers = ("Audio Timeout Error", "stream duration")
            if any(marker in error_msg for marker in timeout_markers):
                log_warning("⚠️ STT timeout detected, ignoring", session_id=session.session.session_id)
                # No need to restart STT on a timeout: it will be restarted
                # anyway once the user finishes speaking.
            else:
                log_error("❌ STT streaming error", error=error_msg, traceback=traceback.format_exc(), session_id=session.session.session_id)
                await websocket.send_json({
                    "type": "error",
                    "error_type": "stt_error",
                    "message": f"STT error: {str(e)}"
                })

    except Exception as e:
        log_error("❌ Error in handle_audio_chunk", error=str(e), traceback=traceback.format_exc(), session_id=session.session.session_id)
        await websocket.send_json({
            "type": "error",
            "error_type": "audio_error",
            "message": f"Audio processing error: {str(e)}"
        })
|
791 |
|
792 |
# ========================= PROCESSING FUNCTIONS =========================
|
793 |
async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|
|
|
369 |
log_info(f"✅ Reset for new utterance complete", session_id=self.session.session_id)
|
370 |
|
371 |
|
372 |
+
# ========================= MESSAGE HANDLERS =========================
|
373 |
async def handle_control_message(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
|
374 |
"""Handle control messages"""
|
375 |
action = message.get("action")
|
|
|
456 |
await session.stop_stt_streaming()
|
457 |
await session.restart_stt_if_needed()
|
458 |
|
459 |
+
async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]) -> None:
    """Validate one incoming audio chunk, buffer it, and stream it to STT.

    The chunk is dropped (with a log entry) unless the WebSocket is still
    active, the payload is non-empty, and the session is in the LISTENING
    state. If STT is not available or not streaming, an ``stt_not_ready``
    error is sent to the client instead. Otherwise the chunk is added to the
    session audio buffer, decoded from base64, passed to the silence detector
    and streamed to the STT manager. The first non-empty final transcription
    stops STT streaming, moves the session to PROCESSING_STT and hands over
    to ``process_user_input``.

    Args:
        websocket: Client connection used to send transcription,
            state-change and error messages.
        session: Realtime session whose state, buffers and STT manager are
            read and updated.
        message: Client message; its ``"data"`` entry is read as the
            base64-encoded audio chunk.

    Errors raised while streaming to STT or while handling the chunk are
    caught, logged and reported to the client as ``stt_error`` /
    ``audio_error`` messages; recognised STT timeouts are only logged.
    """
    try:
        # Do nothing once the WebSocket has been closed.
        if not session.is_websocket_active:
            return

        audio_data = message.get("data")
        if not audio_data:
            log_warning("⚠️ Empty audio chunk received", session_id=session.session.session_id)
            return

        # Ignore audio chunks entirely while STT/LLM/TTS processing or
        # audio playback is in progress.
        if session.state in (
            ConversationState.PLAYING_AUDIO,
            ConversationState.PROCESSING_TTS,
            ConversationState.PROCESSING_LLM,
            ConversationState.PROCESSING_STT,
        ):
            log_debug(f"🔇 Ignoring audio chunk during state: {session.state.value}", session_id=session.session.session_id)
            return

        # Audio is only processed in the LISTENING state.
        if session.state != ConversationState.LISTENING:
            log_warning(f"⚠️ Audio received in unexpected state: {session.state.value}", session_id=session.session.session_id)
            return

        # Report an error if there is no STT manager or it is not streaming.
        if not session.stt_manager or not session.is_streaming:
            log_warning("⚠️ STT not ready, attempting to restart", session_id=session.session.session_id)
            await websocket.send_json({
                "type": "error",
                "error_type": "stt_not_ready",
                "message": "STT is not ready. Waiting for initialization..."
            })
            return

        # The buffer receives the still-encoded payload.
        await session.audio_buffer.add_chunk(audio_data)

        # Decode for processing.
        decoded_audio = base64.b64decode(audio_data)

        # Feed the silence detector. Its return value is not used here; the
        # call is kept because it presumably updates the detector's state.
        session.silence_detector.update(decoded_audio)

        # Stream to STT.
        try:
            session.chunk_counter += 1

            if session.chunk_counter == 1:
                log_info("🎤 Started streaming audio to STT", session_id=session.session.session_id)
                # Check the container format on the first chunk only:
                # 1A 45 DF A3 is the EBML magic that opens a WebM stream.
                if len(decoded_audio) >= 4:
                    if decoded_audio[:4] == b'\x1a\x45\xdf\xa3':
                        log_info("✅ Valid WEBM header detected", session_id=session.session.session_id)
                    else:
                        log_warning(f"⚠️ Unknown audio format, first 4 bytes: {decoded_audio[:4].hex()}", session_id=session.session.session_id)
            elif session.chunk_counter % 100 == 0:
                log_info(f"📊 Sent {session.chunk_counter} chunks to STT so far...", session_id=session.session.session_id)

            # Send the chunk to STT and consume the results it yields.
            async for result in session.stt_manager.stream_audio(decoded_audio):
                # Only final results are processed; interim ones are skipped.
                if not result.is_final:
                    continue

                log_info(f"✅ FINAL TRANSCRIPTION: '{result.text}'", session_id=session.session.session_id)

                # Send ONLY the final transcription to the frontend.
                await websocket.send_json({
                    "type": "transcription",
                    "text": result.text,
                    "is_final": True,
                    "confidence": result.confidence
                })

                session.current_transcription = result.text

                # On a non-empty final transcription, stop STT and process it.
                if session.current_transcription:
                    # Stop STT first.
                    await session.stop_stt_streaming()

                    # Then switch state and tell the client.
                    await session.change_state(ConversationState.PROCESSING_STT)
                    await websocket.send_json({
                        "type": "state_change",
                        "from": "listening",
                        "to": "processing_stt"
                    })

                    # Process user input.
                    await process_user_input(websocket, session)
                    return

        except Exception as e:
            error_msg = str(e)
            # Detect Google STT timeout errors. "stream duration" also
            # matches "Exceeded maximum allowed stream duration".
            timeout_markers = ("Audio Timeout Error", "stream duration")
            if any(marker in error_msg for marker in timeout_markers):
                log_warning("⚠️ STT timeout detected, ignoring", session_id=session.session.session_id)
                # No need to restart STT on a timeout: it will be restarted
                # anyway once the user finishes speaking.
            else:
                log_error("❌ STT streaming error", error=error_msg, traceback=traceback.format_exc(), session_id=session.session.session_id)
                await websocket.send_json({
                    "type": "error",
                    "error_type": "stt_error",
                    "message": f"STT error: {str(e)}"
                })

    except Exception as e:
        log_error("❌ Error in handle_audio_chunk", error=str(e), traceback=traceback.format_exc(), session_id=session.session.session_id)
        await websocket.send_json({
            "type": "error",
            "error_type": "audio_error",
            "message": f"Audio processing error: {str(e)}"
        })
|
572 |
+
|
573 |
# ========================= MAIN HANDLER =========================
|
574 |
async def websocket_endpoint(websocket: WebSocket, session_id: str):
|
575 |
"""Main WebSocket endpoint for real-time conversation"""
|
|
|
788 |
except Exception as e:
|
789 |
log_debug(f"WebSocket already closed or error during close: {e}", session_id=session_id)
|
790 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
791 |
|
792 |
# ========================= PROCESSING FUNCTIONS =========================
|
793 |
async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|