ciyidogan committed on
Commit
4e9384b
·
verified ·
1 Parent(s): 8a16b4e

Update websocket_handler.py

Browse files
Files changed (1) hide show
  1. websocket_handler.py +101 -181
websocket_handler.py CHANGED
@@ -405,18 +405,6 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
405
  # Initialize conversation
406
  realtime_session = RealtimeSession(session)
407
 
408
- # Initialize STT
409
- log_info(f"🎤 Initializing STT...", session_id=session_id)
410
- stt_initialized = await realtime_session.initialize_stt()
411
- if not stt_initialized:
412
- log_error(f"❌ STT initialization failed", session_id=session_id)
413
- await websocket.send_json({
414
- "type": "error",
415
- "message": "STT initialization failed"
416
- })
417
- else:
418
- log_info(f"✅ STT initialized", session_id=session_id)
419
-
420
  # Send session started confirmation
421
  await websocket.send_json({
422
  "type": "session_started",
@@ -495,14 +483,47 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
495
  log_info(f"✅ Welcome TTS sent", session_id=session_id)
496
  except Exception as e:
497
  log_error(f"❌ Failed to send welcome TTS", error=str(e), traceback=traceback.format_exc(), session_id=session_id)
 
 
 
 
 
 
 
 
 
498
  else:
499
  log_warning(f"⚠️ No TTS provider available", session_id=session_id)
 
 
 
 
 
 
 
 
500
 
501
  break
502
  else:
503
  log_warning(f"⚠️ No assistant message found in history", session_id=session_id)
 
 
 
 
 
 
 
 
504
  else:
505
  log_warning(f"⚠️ No messages in session history", session_id=session_id)
 
 
 
 
 
 
 
 
506
 
507
  log_info(f"💬 Ready for conversation", session_id=session_id)
508
 
@@ -584,30 +605,25 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
584
  log_warning(f"⚠️ Empty audio chunk received", session_id=session.session.session_id)
585
  return
586
 
587
- # Barge-in devre dışı - TTS/audio playback sırasında audio chunk'ları işleme
588
  if session.state in [ConversationState.PLAYING_AUDIO, ConversationState.PROCESSING_TTS,
589
  ConversationState.PROCESSING_LLM, ConversationState.PROCESSING_STT]:
590
  log_debug(f"🔇 Ignoring audio chunk during state: {session.state.value}", session_id=session.session.session_id)
591
- # Audio buffer'ı da temizle ki eski chunk'lar birikmesin
592
- await session.audio_buffer.clear()
593
  return
594
 
595
- # Change state to listening if idle
596
- if session.state == ConversationState.IDLE:
597
- # IDLE'dan LISTENING'e geçerken buffer'ı temizle
598
- await session.audio_buffer.clear()
599
- await session.change_state(ConversationState.LISTENING)
600
- await websocket.send_json({
601
- "type": "state_change",
602
- "from": "idle",
603
- "to": "listening"
604
- })
605
- # IDLE'dan LISTENING'e geçerken STT'yi başlat
606
- if not session.is_streaming:
607
- await session.restart_stt_if_needed()
608
-
609
  # LISTENING state'inde değilse audio işleme
610
  if session.state != ConversationState.LISTENING:
 
 
 
 
 
 
 
 
 
 
 
611
  return
612
 
613
  # Add to buffer
@@ -619,92 +635,69 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
619
  # Check silence
620
  silence_duration = session.silence_detector.update(decoded_audio)
621
 
622
- # Stream to STT if available and in LISTENING state
623
- if session.stt_manager and session.state == ConversationState.LISTENING:
624
- # Ensure streaming is active
625
- if not session.is_streaming:
626
- log_warning(f"⚠️ STT not streaming, attempting to restart", session_id=session.session.session_id)
627
- restart_success = await session.restart_stt_if_needed()
628
- if not restart_success:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
629
  await websocket.send_json({
630
- "type": "error",
631
- "error_type": "stt_error",
632
- "message": "STT streaming not available"
 
633
  })
634
- return
635
-
636
- try:
637
- # Chunk counter artır
638
- session.chunk_counter += 1
639
-
640
- if session.chunk_counter == 1:
641
- log_info(f"🎤 Started streaming audio to STT", session_id=session.session.session_id)
642
- # İlk chunk'ta format kontrolü yap
643
- if len(decoded_audio) >= 4:
644
- if decoded_audio[:4] == b'\x1a\x45\xdf\xa3':
645
- log_info(f"✅ Valid WEBM header detected", session_id=session.session.session_id)
646
- else:
647
- log_warning(f"⚠️ Unknown audio format, first 4 bytes: {decoded_audio[:4].hex()}", session_id=session.session.session_id)
648
- # Format hatalıysa buffer'ı temizle ve chunk counter'ı resetle
649
- await session.audio_buffer.clear()
650
- session.chunk_counter = 0
651
- await session.stop_stt_streaming()
652
- await session.restart_stt_if_needed()
653
- return
654
- elif session.chunk_counter % 100 == 0:
655
- log_info(f"📊 Sent {session.chunk_counter} chunks to STT so far...", session_id=session.session.session_id)
656
-
657
- # STT'ye gönder ve sonuçları bekle
658
- async for result in session.stt_manager.stream_audio(decoded_audio):
659
- # SADECE FINAL RESULT'LARI İŞLE
660
- if result.is_final:
661
- log_info(f"✅ FINAL TRANSCRIPTION: '{result.text}'", session_id=session.session.session_id)
662
 
663
- # Send ONLY final transcription to frontend
 
664
  await websocket.send_json({
665
- "type": "transcription",
666
- "text": result.text,
667
- "is_final": True,
668
- "confidence": result.confidence
669
  })
670
 
671
- session.current_transcription = result.text
672
-
673
- # Final transcription geldiğinde STT'yi durdur ve işle
674
- if session.current_transcription:
675
- # Önce STT'yi durdur
676
- await session.stop_stt_streaming()
677
-
678
- # State'i değiştir
679
- await session.change_state(ConversationState.PROCESSING_STT)
680
- await websocket.send_json({
681
- "type": "state_change",
682
- "from": "listening",
683
- "to": "processing_stt"
684
- })
685
-
686
- # Process user input
687
- await process_user_input(websocket, session)
688
- return
689
 
690
- except Exception as e:
691
- error_msg = str(e)
692
- # Google STT timeout hatası kontrolü
693
- if "Audio Timeout Error" in error_msg or "stream duration" in error_msg or "Exceeded maximum allowed stream duration" in error_msg:
694
- log_warning(f"⚠️ STT timeout detected, restarting stream", session_id=session.session.session_id)
695
- session.is_streaming = False
696
- session.chunk_counter = 0
697
- # Buffer'ı temizle
698
- await session.audio_buffer.clear()
699
- # Timeout durumunda yeniden başlat
700
- await session.restart_stt_if_needed()
701
- else:
702
- log_error(f"STT streaming error", error=error_msg, traceback=traceback.format_exc(), session_id=session.session.session_id)
703
- await websocket.send_json({
704
- "type": "error",
705
- "error_type": "stt_error",
706
- "message": f"STT error: {str(e)}"
707
- })
708
 
709
  except Exception as e:
710
  log_error(f"❌ Error in handle_audio_chunk", error=str(e), traceback=traceback.format_exc(), session_id=session.session.session_id)
@@ -713,79 +706,6 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
713
  "error_type": "audio_error",
714
  "message": f"Audio processing error: {str(e)}"
715
  })
716
-
717
- async def handle_control_message(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
718
- """Handle control messages"""
719
- action = message.get("action")
720
- config = message.get("config", {})
721
-
722
- log_debug(f"🎮 Control message", action=action, session_id=session.session.session_id)
723
-
724
- if action == "start_session":
725
- # Session configuration
726
- await websocket.send_json({
727
- "type": "session_config",
728
- "session_id": session.session.session_id,
729
- "config": {
730
- "silence_threshold_ms": session.silence_threshold_ms,
731
- "audio_chunk_size": session.audio_chunk_size,
732
- "supports_barge_in": False # Barge-in devre dışı
733
- }
734
- })
735
-
736
- elif action == "end_session" or action == "stop_session":
737
- # Clean up and close
738
- await session.cleanup()
739
- await websocket.close()
740
-
741
- elif action == "interrupt":
742
- # Barge-in devre dışı - ignore
743
- log_warning(f"⚠️ Interrupt request ignored (barge-in disabled)", session_id=session.session.session_id)
744
-
745
- elif action == "reset":
746
- # Reset conversation state
747
- await session.reset_for_new_utterance()
748
- await session.stop_stt_streaming()
749
- await session.change_state(ConversationState.IDLE)
750
- await websocket.send_json({
751
- "type": "state_change",
752
- "from": session.state.value,
753
- "to": "idle"
754
- })
755
-
756
- elif action == "audio_ended":
757
- # Audio playback ended on client
758
- if session.state == ConversationState.PLAYING_AUDIO:
759
- log_info(f"🎵 Client reported audio ended", session_id=session.session.session_id)
760
- await session.change_state(ConversationState.LISTENING)
761
- await websocket.send_json({
762
- "type": "state_change",
763
- "from": "playing_audio",
764
- "to": "listening"
765
- })
766
- # STT'yi yeniden başlat
767
- success = await session.restart_stt_if_needed()
768
-
769
- # STT hazır olduğunda sinyal gönder
770
- if success and session.is_streaming:
771
- log_info(f"✅ Sending STT ready signal", session_id=session.session.session_id)
772
- await websocket.send_json({
773
- "type": "stt_ready",
774
- "message": "STT is ready to receive audio"
775
- })
776
- else:
777
- log_error(f"❌ STT not ready after restart", session_id=session.session.session_id)
778
- await websocket.send_json({
779
- "type": "error",
780
- "error_type": "stt_init_failed",
781
- "message": "Failed to initialize STT after audio playback"
782
- })
783
-
784
- elif action == "restart_stt":
785
- # Manual STT restart request
786
- log_info(f"🔄 Manual STT restart requested", session_id=session.session.session_id)
787
- await session.stop_stt_streaming()
788
- await session.restart_stt_if_needed()
789
 
790
  # ========================= PROCESSING FUNCTIONS =========================
791
  async def process_user_input(websocket: WebSocket, session: RealtimeSession):
 
405
  # Initialize conversation
406
  realtime_session = RealtimeSession(session)
407
 
 
 
 
 
 
 
 
 
 
 
 
 
408
  # Send session started confirmation
409
  await websocket.send_json({
410
  "type": "session_started",
 
483
  log_info(f"✅ Welcome TTS sent", session_id=session_id)
484
  except Exception as e:
485
  log_error(f"❌ Failed to send welcome TTS", error=str(e), traceback=traceback.format_exc(), session_id=session_id)
486
+ # TTS hatası durumunda direkt listening moduna geç
487
+ await realtime_session.change_state(ConversationState.LISTENING)
488
+ await websocket.send_json({
489
+ "type": "state_change",
490
+ "from": "playing_audio",
491
+ "to": "listening"
492
+ })
493
+ # Ve STT'yi başlat
494
+ await realtime_session.initialize_stt()
495
  else:
496
  log_warning(f"⚠️ No TTS provider available", session_id=session_id)
497
+ # TTS yoksa direkt listening moduna geç ve STT başlat
498
+ await realtime_session.change_state(ConversationState.LISTENING)
499
+ await websocket.send_json({
500
+ "type": "state_change",
501
+ "from": "idle",
502
+ "to": "listening"
503
+ })
504
+ await realtime_session.initialize_stt()
505
 
506
  break
507
  else:
508
  log_warning(f"⚠️ No assistant message found in history", session_id=session_id)
509
+ # Welcome mesajı yoksa direkt listening moduna geç
510
+ await realtime_session.change_state(ConversationState.LISTENING)
511
+ await websocket.send_json({
512
+ "type": "state_change",
513
+ "from": "idle",
514
+ "to": "listening"
515
+ })
516
+ await realtime_session.initialize_stt()
517
  else:
518
  log_warning(f"⚠️ No messages in session history", session_id=session_id)
519
+ # History yoksa direkt listening moduna geç
520
+ await realtime_session.change_state(ConversationState.LISTENING)
521
+ await websocket.send_json({
522
+ "type": "state_change",
523
+ "from": "idle",
524
+ "to": "listening"
525
+ })
526
+ await realtime_session.initialize_stt()
527
 
528
  log_info(f"💬 Ready for conversation", session_id=session_id)
529
 
 
605
  log_warning(f"⚠️ Empty audio chunk received", session_id=session.session.session_id)
606
  return
607
 
608
+ # TTS/LLM işlenirken audio chunk'ları tamamen yoksay
609
  if session.state in [ConversationState.PLAYING_AUDIO, ConversationState.PROCESSING_TTS,
610
  ConversationState.PROCESSING_LLM, ConversationState.PROCESSING_STT]:
611
  log_debug(f"🔇 Ignoring audio chunk during state: {session.state.value}", session_id=session.session.session_id)
 
 
612
  return
613
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
614
  # LISTENING state'inde değilse audio işleme
615
  if session.state != ConversationState.LISTENING:
616
+ log_warning(f"⚠️ Audio received in unexpected state: {session.state.value}", session_id=session.session.session_id)
617
+ return
618
+
619
+ # STT yoksa veya streaming değilse hata döndür
620
+ if not session.stt_manager or not session.is_streaming:
621
+ log_warning(f"��️ STT not ready, attempting to restart", session_id=session.session.session_id)
622
+ await websocket.send_json({
623
+ "type": "error",
624
+ "error_type": "stt_not_ready",
625
+ "message": "STT is not ready. Waiting for initialization..."
626
+ })
627
  return
628
 
629
  # Add to buffer
 
635
  # Check silence
636
  silence_duration = session.silence_detector.update(decoded_audio)
637
 
638
+ # Stream to STT
639
+ try:
640
+ # Chunk counter artır
641
+ session.chunk_counter += 1
642
+
643
+ if session.chunk_counter == 1:
644
+ log_info(f"🎤 Started streaming audio to STT", session_id=session.session.session_id)
645
+ # İlk chunk'ta format kontrolü yap
646
+ if len(decoded_audio) >= 4:
647
+ if decoded_audio[:4] == b'\x1a\x45\xdf\xa3':
648
+ log_info(f"✅ Valid WEBM header detected", session_id=session.session.session_id)
649
+ else:
650
+ log_warning(f"⚠️ Unknown audio format, first 4 bytes: {decoded_audio[:4].hex()}", session_id=session.session.session_id)
651
+ elif session.chunk_counter % 100 == 0:
652
+ log_info(f"📊 Sent {session.chunk_counter} chunks to STT so far...", session_id=session.session.session_id)
653
+
654
+ # STT'ye gönder ve sonuçları bekle
655
+ async for result in session.stt_manager.stream_audio(decoded_audio):
656
+ # SADECE FINAL RESULT'LARI İŞLE
657
+ if result.is_final:
658
+ log_info(f"✅ FINAL TRANSCRIPTION: '{result.text}'", session_id=session.session.session_id)
659
+
660
+ # Send ONLY final transcription to frontend
661
  await websocket.send_json({
662
+ "type": "transcription",
663
+ "text": result.text,
664
+ "is_final": True,
665
+ "confidence": result.confidence
666
  })
667
+
668
+ session.current_transcription = result.text
669
+
670
+ # Final transcription geldiğinde STT'yi durdur ve işle
671
+ if session.current_transcription:
672
+ # Önce STT'yi durdur
673
+ await session.stop_stt_streaming()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
674
 
675
+ # State'i değiştir
676
+ await session.change_state(ConversationState.PROCESSING_STT)
677
  await websocket.send_json({
678
+ "type": "state_change",
679
+ "from": "listening",
680
+ "to": "processing_stt"
 
681
  })
682
 
683
+ # Process user input
684
+ await process_user_input(websocket, session)
685
+ return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
686
 
687
+ except Exception as e:
688
+ error_msg = str(e)
689
+ # Google STT timeout hatası kontrolü
690
+ if "Audio Timeout Error" in error_msg or "stream duration" in error_msg or "Exceeded maximum allowed stream duration" in error_msg:
691
+ log_warning(f"⚠️ STT timeout detected, ignoring", session_id=session.session.session_id)
692
+ # Timeout durumunda STT'yi yeniden başlatmaya gerek yok,
693
+ # çünkü kullanıcı konuşmayı bitirdiğinde zaten yeniden başlatılacak
694
+ else:
695
+ log_error(f"❌ STT streaming error", error=error_msg, traceback=traceback.format_exc(), session_id=session.session.session_id)
696
+ await websocket.send_json({
697
+ "type": "error",
698
+ "error_type": "stt_error",
699
+ "message": f"STT error: {str(e)}"
700
+ })
 
 
 
 
701
 
702
  except Exception as e:
703
  log_error(f"❌ Error in handle_audio_chunk", error=str(e), traceback=traceback.format_exc(), session_id=session.session.session_id)
 
706
  "error_type": "audio_error",
707
  "message": f"Audio processing error: {str(e)}"
708
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
 
710
  # ========================= PROCESSING FUNCTIONS =========================
711
  async def process_user_input(websocket: WebSocket, session: RealtimeSession):