Spaces:
Running
Running
Update websocket_handler.py
Browse files- websocket_handler.py +101 -181
websocket_handler.py
CHANGED
@@ -405,18 +405,6 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
|
|
405 |
# Initialize conversation
|
406 |
realtime_session = RealtimeSession(session)
|
407 |
|
408 |
-
# Initialize STT
|
409 |
-
log_info(f"🎤 Initializing STT...", session_id=session_id)
|
410 |
-
stt_initialized = await realtime_session.initialize_stt()
|
411 |
-
if not stt_initialized:
|
412 |
-
log_error(f"❌ STT initialization failed", session_id=session_id)
|
413 |
-
await websocket.send_json({
|
414 |
-
"type": "error",
|
415 |
-
"message": "STT initialization failed"
|
416 |
-
})
|
417 |
-
else:
|
418 |
-
log_info(f"✅ STT initialized", session_id=session_id)
|
419 |
-
|
420 |
# Send session started confirmation
|
421 |
await websocket.send_json({
|
422 |
"type": "session_started",
|
@@ -495,14 +483,47 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
|
|
495 |
log_info(f"✅ Welcome TTS sent", session_id=session_id)
|
496 |
except Exception as e:
|
497 |
log_error(f"❌ Failed to send welcome TTS", error=str(e), traceback=traceback.format_exc(), session_id=session_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
498 |
else:
|
499 |
log_warning(f"⚠️ No TTS provider available", session_id=session_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
500 |
|
501 |
break
|
502 |
else:
|
503 |
log_warning(f"⚠️ No assistant message found in history", session_id=session_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
504 |
else:
|
505 |
log_warning(f"⚠️ No messages in session history", session_id=session_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
506 |
|
507 |
log_info(f"💬 Ready for conversation", session_id=session_id)
|
508 |
|
@@ -584,30 +605,25 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
|
|
584 |
log_warning(f"⚠️ Empty audio chunk received", session_id=session.session.session_id)
|
585 |
return
|
586 |
|
587 |
-
#
|
588 |
if session.state in [ConversationState.PLAYING_AUDIO, ConversationState.PROCESSING_TTS,
|
589 |
ConversationState.PROCESSING_LLM, ConversationState.PROCESSING_STT]:
|
590 |
log_debug(f"🔇 Ignoring audio chunk during state: {session.state.value}", session_id=session.session.session_id)
|
591 |
-
# Audio buffer'ı da temizle ki eski chunk'lar birikmesin
|
592 |
-
await session.audio_buffer.clear()
|
593 |
return
|
594 |
|
595 |
-
# Change state to listening if idle
|
596 |
-
if session.state == ConversationState.IDLE:
|
597 |
-
# IDLE'dan LISTENING'e geçerken buffer'ı temizle
|
598 |
-
await session.audio_buffer.clear()
|
599 |
-
await session.change_state(ConversationState.LISTENING)
|
600 |
-
await websocket.send_json({
|
601 |
-
"type": "state_change",
|
602 |
-
"from": "idle",
|
603 |
-
"to": "listening"
|
604 |
-
})
|
605 |
-
# IDLE'dan LISTENING'e geçerken STT'yi başlat
|
606 |
-
if not session.is_streaming:
|
607 |
-
await session.restart_stt_if_needed()
|
608 |
-
|
609 |
# LISTENING state'inde değilse audio işleme
|
610 |
if session.state != ConversationState.LISTENING:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
611 |
return
|
612 |
|
613 |
# Add to buffer
|
@@ -619,92 +635,69 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
|
|
619 |
# Check silence
|
620 |
silence_duration = session.silence_detector.update(decoded_audio)
|
621 |
|
622 |
-
# Stream to STT
|
623 |
-
|
624 |
-
#
|
625 |
-
|
626 |
-
|
627 |
-
|
628 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
629 |
await websocket.send_json({
|
630 |
-
"type": "
|
631 |
-
"
|
632 |
-
"
|
|
|
633 |
})
|
634 |
-
|
635 |
-
|
636 |
-
|
637 |
-
|
638 |
-
|
639 |
-
|
640 |
-
|
641 |
-
log_info(f"🎤 Started streaming audio to STT", session_id=session.session.session_id)
|
642 |
-
# İlk chunk'ta format kontrolü yap
|
643 |
-
if len(decoded_audio) >= 4:
|
644 |
-
if decoded_audio[:4] == b'\x1a\x45\xdf\xa3':
|
645 |
-
log_info(f"✅ Valid WEBM header detected", session_id=session.session.session_id)
|
646 |
-
else:
|
647 |
-
log_warning(f"⚠️ Unknown audio format, first 4 bytes: {decoded_audio[:4].hex()}", session_id=session.session.session_id)
|
648 |
-
# Format hatalıysa buffer'ı temizle ve chunk counter'ı resetle
|
649 |
-
await session.audio_buffer.clear()
|
650 |
-
session.chunk_counter = 0
|
651 |
-
await session.stop_stt_streaming()
|
652 |
-
await session.restart_stt_if_needed()
|
653 |
-
return
|
654 |
-
elif session.chunk_counter % 100 == 0:
|
655 |
-
log_info(f"📊 Sent {session.chunk_counter} chunks to STT so far...", session_id=session.session.session_id)
|
656 |
-
|
657 |
-
# STT'ye gönder ve sonuçları bekle
|
658 |
-
async for result in session.stt_manager.stream_audio(decoded_audio):
|
659 |
-
# SADECE FINAL RESULT'LARI İŞLE
|
660 |
-
if result.is_final:
|
661 |
-
log_info(f"✅ FINAL TRANSCRIPTION: '{result.text}'", session_id=session.session.session_id)
|
662 |
|
663 |
-
#
|
|
|
664 |
await websocket.send_json({
|
665 |
-
"type": "
|
666 |
-
"
|
667 |
-
"
|
668 |
-
"confidence": result.confidence
|
669 |
})
|
670 |
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
-
if session.current_transcription:
|
675 |
-
# Önce STT'yi durdur
|
676 |
-
await session.stop_stt_streaming()
|
677 |
-
|
678 |
-
# State'i değiştir
|
679 |
-
await session.change_state(ConversationState.PROCESSING_STT)
|
680 |
-
await websocket.send_json({
|
681 |
-
"type": "state_change",
|
682 |
-
"from": "listening",
|
683 |
-
"to": "processing_stt"
|
684 |
-
})
|
685 |
-
|
686 |
-
# Process user input
|
687 |
-
await process_user_input(websocket, session)
|
688 |
-
return
|
689 |
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
|
704 |
-
"type": "error",
|
705 |
-
"error_type": "stt_error",
|
706 |
-
"message": f"STT error: {str(e)}"
|
707 |
-
})
|
708 |
|
709 |
except Exception as e:
|
710 |
log_error(f"❌ Error in handle_audio_chunk", error=str(e), traceback=traceback.format_exc(), session_id=session.session.session_id)
|
@@ -713,79 +706,6 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
|
|
713 |
"error_type": "audio_error",
|
714 |
"message": f"Audio processing error: {str(e)}"
|
715 |
})
|
716 |
-
|
717 |
-
async def handle_control_message(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
|
718 |
-
"""Handle control messages"""
|
719 |
-
action = message.get("action")
|
720 |
-
config = message.get("config", {})
|
721 |
-
|
722 |
-
log_debug(f"🎮 Control message", action=action, session_id=session.session.session_id)
|
723 |
-
|
724 |
-
if action == "start_session":
|
725 |
-
# Session configuration
|
726 |
-
await websocket.send_json({
|
727 |
-
"type": "session_config",
|
728 |
-
"session_id": session.session.session_id,
|
729 |
-
"config": {
|
730 |
-
"silence_threshold_ms": session.silence_threshold_ms,
|
731 |
-
"audio_chunk_size": session.audio_chunk_size,
|
732 |
-
"supports_barge_in": False # Barge-in devre dışı
|
733 |
-
}
|
734 |
-
})
|
735 |
-
|
736 |
-
elif action == "end_session" or action == "stop_session":
|
737 |
-
# Clean up and close
|
738 |
-
await session.cleanup()
|
739 |
-
await websocket.close()
|
740 |
-
|
741 |
-
elif action == "interrupt":
|
742 |
-
# Barge-in devre dışı - ignore
|
743 |
-
log_warning(f"⚠️ Interrupt request ignored (barge-in disabled)", session_id=session.session.session_id)
|
744 |
-
|
745 |
-
elif action == "reset":
|
746 |
-
# Reset conversation state
|
747 |
-
await session.reset_for_new_utterance()
|
748 |
-
await session.stop_stt_streaming()
|
749 |
-
await session.change_state(ConversationState.IDLE)
|
750 |
-
await websocket.send_json({
|
751 |
-
"type": "state_change",
|
752 |
-
"from": session.state.value,
|
753 |
-
"to": "idle"
|
754 |
-
})
|
755 |
-
|
756 |
-
elif action == "audio_ended":
|
757 |
-
# Audio playback ended on client
|
758 |
-
if session.state == ConversationState.PLAYING_AUDIO:
|
759 |
-
log_info(f"🎵 Client reported audio ended", session_id=session.session.session_id)
|
760 |
-
await session.change_state(ConversationState.LISTENING)
|
761 |
-
await websocket.send_json({
|
762 |
-
"type": "state_change",
|
763 |
-
"from": "playing_audio",
|
764 |
-
"to": "listening"
|
765 |
-
})
|
766 |
-
# STT'yi yeniden başlat
|
767 |
-
success = await session.restart_stt_if_needed()
|
768 |
-
|
769 |
-
# STT hazır olduğunda sinyal gönder
|
770 |
-
if success and session.is_streaming:
|
771 |
-
log_info(f"✅ Sending STT ready signal", session_id=session.session.session_id)
|
772 |
-
await websocket.send_json({
|
773 |
-
"type": "stt_ready",
|
774 |
-
"message": "STT is ready to receive audio"
|
775 |
-
})
|
776 |
-
else:
|
777 |
-
log_error(f"❌ STT not ready after restart", session_id=session.session.session_id)
|
778 |
-
await websocket.send_json({
|
779 |
-
"type": "error",
|
780 |
-
"error_type": "stt_init_failed",
|
781 |
-
"message": "Failed to initialize STT after audio playback"
|
782 |
-
})
|
783 |
-
|
784 |
-
elif action == "restart_stt":
|
785 |
-
# Manual STT restart request
|
786 |
-
log_info(f"🔄 Manual STT restart requested", session_id=session.session.session_id)
|
787 |
-
await session.stop_stt_streaming()
|
788 |
-
await session.restart_stt_if_needed()
|
789 |
|
790 |
# ========================= PROCESSING FUNCTIONS =========================
|
791 |
async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|
|
|
405 |
# Initialize conversation
|
406 |
realtime_session = RealtimeSession(session)
|
407 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
408 |
# Send session started confirmation
|
409 |
await websocket.send_json({
|
410 |
"type": "session_started",
|
|
|
483 |
log_info(f"✅ Welcome TTS sent", session_id=session_id)
|
484 |
except Exception as e:
|
485 |
log_error(f"❌ Failed to send welcome TTS", error=str(e), traceback=traceback.format_exc(), session_id=session_id)
|
486 |
+
# TTS hatası durumunda direkt listening moduna geç
|
487 |
+
await realtime_session.change_state(ConversationState.LISTENING)
|
488 |
+
await websocket.send_json({
|
489 |
+
"type": "state_change",
|
490 |
+
"from": "playing_audio",
|
491 |
+
"to": "listening"
|
492 |
+
})
|
493 |
+
# Ve STT'yi başlat
|
494 |
+
await realtime_session.initialize_stt()
|
495 |
else:
|
496 |
log_warning(f"⚠️ No TTS provider available", session_id=session_id)
|
497 |
+
# TTS yoksa direkt listening moduna geç ve STT başlat
|
498 |
+
await realtime_session.change_state(ConversationState.LISTENING)
|
499 |
+
await websocket.send_json({
|
500 |
+
"type": "state_change",
|
501 |
+
"from": "idle",
|
502 |
+
"to": "listening"
|
503 |
+
})
|
504 |
+
await realtime_session.initialize_stt()
|
505 |
|
506 |
break
|
507 |
else:
|
508 |
log_warning(f"⚠️ No assistant message found in history", session_id=session_id)
|
509 |
+
# Welcome mesajı yoksa direkt listening moduna geç
|
510 |
+
await realtime_session.change_state(ConversationState.LISTENING)
|
511 |
+
await websocket.send_json({
|
512 |
+
"type": "state_change",
|
513 |
+
"from": "idle",
|
514 |
+
"to": "listening"
|
515 |
+
})
|
516 |
+
await realtime_session.initialize_stt()
|
517 |
else:
|
518 |
log_warning(f"⚠️ No messages in session history", session_id=session_id)
|
519 |
+
# History yoksa direkt listening moduna geç
|
520 |
+
await realtime_session.change_state(ConversationState.LISTENING)
|
521 |
+
await websocket.send_json({
|
522 |
+
"type": "state_change",
|
523 |
+
"from": "idle",
|
524 |
+
"to": "listening"
|
525 |
+
})
|
526 |
+
await realtime_session.initialize_stt()
|
527 |
|
528 |
log_info(f"💬 Ready for conversation", session_id=session_id)
|
529 |
|
|
|
605 |
log_warning(f"⚠️ Empty audio chunk received", session_id=session.session.session_id)
|
606 |
return
|
607 |
|
608 |
+
# TTS/LLM işlenirken audio chunk'ları tamamen yoksay
|
609 |
if session.state in [ConversationState.PLAYING_AUDIO, ConversationState.PROCESSING_TTS,
|
610 |
ConversationState.PROCESSING_LLM, ConversationState.PROCESSING_STT]:
|
611 |
log_debug(f"🔇 Ignoring audio chunk during state: {session.state.value}", session_id=session.session.session_id)
|
|
|
|
|
612 |
return
|
613 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
614 |
# LISTENING state'inde değilse audio işleme
|
615 |
if session.state != ConversationState.LISTENING:
|
616 |
+
log_warning(f"⚠️ Audio received in unexpected state: {session.state.value}", session_id=session.session.session_id)
|
617 |
+
return
|
618 |
+
|
619 |
+
# STT yoksa veya streaming değilse hata döndür
|
620 |
+
if not session.stt_manager or not session.is_streaming:
|
621 |
+
log_warning(f"⚠️ STT not ready, attempting to restart", session_id=session.session.session_id)
|
622 |
+
await websocket.send_json({
|
623 |
+
"type": "error",
|
624 |
+
"error_type": "stt_not_ready",
|
625 |
+
"message": "STT is not ready. Waiting for initialization..."
|
626 |
+
})
|
627 |
return
|
628 |
|
629 |
# Add to buffer
|
|
|
635 |
# Check silence
|
636 |
silence_duration = session.silence_detector.update(decoded_audio)
|
637 |
|
638 |
+
# Stream to STT
|
639 |
+
try:
|
640 |
+
# Chunk counter artır
|
641 |
+
session.chunk_counter += 1
|
642 |
+
|
643 |
+
if session.chunk_counter == 1:
|
644 |
+
log_info(f"🎤 Started streaming audio to STT", session_id=session.session.session_id)
|
645 |
+
# İlk chunk'ta format kontrolü yap
|
646 |
+
if len(decoded_audio) >= 4:
|
647 |
+
if decoded_audio[:4] == b'\x1a\x45\xdf\xa3':
|
648 |
+
log_info(f"✅ Valid WEBM header detected", session_id=session.session.session_id)
|
649 |
+
else:
|
650 |
+
log_warning(f"⚠️ Unknown audio format, first 4 bytes: {decoded_audio[:4].hex()}", session_id=session.session.session_id)
|
651 |
+
elif session.chunk_counter % 100 == 0:
|
652 |
+
log_info(f"📊 Sent {session.chunk_counter} chunks to STT so far...", session_id=session.session.session_id)
|
653 |
+
|
654 |
+
# STT'ye gönder ve sonuçları bekle
|
655 |
+
async for result in session.stt_manager.stream_audio(decoded_audio):
|
656 |
+
# SADECE FINAL RESULT'LARI İŞLE
|
657 |
+
if result.is_final:
|
658 |
+
log_info(f"✅ FINAL TRANSCRIPTION: '{result.text}'", session_id=session.session.session_id)
|
659 |
+
|
660 |
+
# Send ONLY final transcription to frontend
|
661 |
await websocket.send_json({
|
662 |
+
"type": "transcription",
|
663 |
+
"text": result.text,
|
664 |
+
"is_final": True,
|
665 |
+
"confidence": result.confidence
|
666 |
})
|
667 |
+
|
668 |
+
session.current_transcription = result.text
|
669 |
+
|
670 |
+
# Final transcription geldiğinde STT'yi durdur ve işle
|
671 |
+
if session.current_transcription:
|
672 |
+
# Önce STT'yi durdur
|
673 |
+
await session.stop_stt_streaming()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
674 |
|
675 |
+
# State'i değiştir
|
676 |
+
await session.change_state(ConversationState.PROCESSING_STT)
|
677 |
await websocket.send_json({
|
678 |
+
"type": "state_change",
|
679 |
+
"from": "listening",
|
680 |
+
"to": "processing_stt"
|
|
|
681 |
})
|
682 |
|
683 |
+
# Process user input
|
684 |
+
await process_user_input(websocket, session)
|
685 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
686 |
|
687 |
+
except Exception as e:
|
688 |
+
error_msg = str(e)
|
689 |
+
# Google STT timeout hatası kontrolü
|
690 |
+
if "Audio Timeout Error" in error_msg or "stream duration" in error_msg or "Exceeded maximum allowed stream duration" in error_msg:
|
691 |
+
log_warning(f"⚠️ STT timeout detected, ignoring", session_id=session.session.session_id)
|
692 |
+
# Timeout durumunda STT'yi yeniden başlatmaya gerek yok,
|
693 |
+
# çünkü kullanıcı konuşmayı bitirdiğinde zaten yeniden başlatılacak
|
694 |
+
else:
|
695 |
+
log_error(f"❌ STT streaming error", error=error_msg, traceback=traceback.format_exc(), session_id=session.session.session_id)
|
696 |
+
await websocket.send_json({
|
697 |
+
"type": "error",
|
698 |
+
"error_type": "stt_error",
|
699 |
+
"message": f"STT error: {str(e)}"
|
700 |
+
})
|
|
|
|
|
|
|
|
|
701 |
|
702 |
except Exception as e:
|
703 |
log_error(f"❌ Error in handle_audio_chunk", error=str(e), traceback=traceback.format_exc(), session_id=session.session.session_id)
|
|
|
706 |
"error_type": "audio_error",
|
707 |
"message": f"Audio processing error: {str(e)}"
|
708 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
709 |
|
710 |
# ========================= PROCESSING FUNCTIONS =========================
|
711 |
async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|