Spaces:
Building
Building
Update websocket_handler.py
Browse files- websocket_handler.py +134 -119
websocket_handler.py
CHANGED
@@ -153,7 +153,7 @@ class RealtimeSession:
|
|
153 |
def __init__(self, session: Session):
|
154 |
self.session = session
|
155 |
self.state = ConversationState.IDLE
|
156 |
-
self.is_websocket_active = True
|
157 |
|
158 |
# Get settings from config
|
159 |
config = ConfigProvider.get().global_config.stt_provider.settings
|
@@ -178,9 +178,15 @@ class RealtimeSession:
|
|
178 |
self.audio_chunk_size = config.get("audio_chunk_size", DEFAULT_AUDIO_CHUNK_SIZE)
|
179 |
self.silence_threshold_ms = silence_threshold
|
180 |
|
|
|
|
|
|
|
181 |
async def initialize_stt(self):
|
182 |
"""Initialize STT provider"""
|
183 |
try:
|
|
|
|
|
|
|
184 |
self.stt_manager = STTFactory.create_provider()
|
185 |
if not self.stt_manager:
|
186 |
log_error("❌ STT manager is None - STTFactory.create_provider() returned None", session_id=self.session.session_id)
|
@@ -192,7 +198,7 @@ class RealtimeSession:
|
|
192 |
config = ConfigProvider.get().global_config.stt_provider.settings
|
193 |
|
194 |
# Get language from session locale
|
195 |
-
session_locale = getattr(self.session, 'locale', 'tr')
|
196 |
|
197 |
# Import LocaleManager to get proper locale tag
|
198 |
from locale_manager import LocaleManager
|
@@ -225,31 +231,44 @@ class RealtimeSession:
|
|
225 |
log_error(f"❌ Failed to initialize STT", error=str(e), traceback=traceback.format_exc(), session_id=self.session.session_id)
|
226 |
self.stt_manager = None
|
227 |
self.is_streaming = False
|
|
|
228 |
return False
|
229 |
|
230 |
async def restart_stt_if_needed(self):
|
231 |
-
"""Restart STT if it's not active
|
232 |
try:
|
|
|
233 |
if not self.is_streaming and self.is_websocket_active and self.state == ConversationState.LISTENING:
|
234 |
-
log_info(f"🔄 Restarting STT stream
|
|
|
|
|
|
|
235 |
|
236 |
-
#
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
self.chunk_counter = 0
|
245 |
-
return True
|
246 |
-
else:
|
247 |
-
log_error(f"❌ Failed to restart STT stream", session_id=self.session.session_id)
|
248 |
-
return False
|
249 |
return True
|
250 |
except Exception as e:
|
251 |
log_error(f"❌ Error restarting STT", error=str(e), session_id=self.session.session_id)
|
252 |
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
253 |
|
254 |
async def change_state(self, new_state: ConversationState):
|
255 |
"""Change conversation state"""
|
@@ -263,23 +282,25 @@ class RealtimeSession:
|
|
263 |
|
264 |
async def handle_barge_in(self):
|
265 |
"""Handle user interruption"""
|
266 |
-
|
267 |
-
|
|
|
268 |
|
269 |
async def reset_for_new_utterance(self):
|
270 |
"""Reset for new user utterance"""
|
271 |
await self.audio_buffer.clear()
|
272 |
self.silence_detector.reset()
|
273 |
self.current_transcription = ""
|
|
|
274 |
if hasattr(self, 'speech_started'):
|
275 |
-
delattr(self, 'speech_started')
|
|
|
276 |
|
277 |
async def cleanup(self):
|
278 |
"""Clean up resources"""
|
279 |
try:
|
280 |
-
self.is_websocket_active = False
|
281 |
-
|
282 |
-
await self.stt_manager.stop_streaming()
|
283 |
log_info(f"Cleaned up realtime session", session_id=self.session.session_id)
|
284 |
except Exception as e:
|
285 |
log_warning(f"Cleanup error", error=str(e), session_id=self.session.session_id)
|
@@ -480,7 +501,7 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
|
|
480 |
|
481 |
# ========================= MESSAGE HANDLERS =========================
|
482 |
async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
|
483 |
-
"""Handle incoming audio chunk with
|
484 |
try:
|
485 |
# WebSocket kapandıysa işlem yapma
|
486 |
if not session.is_websocket_active:
|
@@ -491,14 +512,11 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
|
|
491 |
log_warning(f"⚠️ Empty audio chunk received", session_id=session.session.session_id)
|
492 |
return
|
493 |
|
494 |
-
#
|
495 |
-
if session.state in [ConversationState.PLAYING_AUDIO, ConversationState.PROCESSING_TTS
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
"action": "stop_playback"
|
500 |
-
})
|
501 |
-
log_info(f"🛑 Barge-in detected", session_id=session.session.session_id, state=session.state.value)
|
502 |
|
503 |
# Change state to listening if idle
|
504 |
if session.state == ConversationState.IDLE:
|
@@ -508,7 +526,14 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
|
|
508 |
"from": "idle",
|
509 |
"to": "listening"
|
510 |
})
|
|
|
|
|
|
|
511 |
|
|
|
|
|
|
|
|
|
512 |
# Add to buffer
|
513 |
await session.audio_buffer.add_chunk(audio_data)
|
514 |
|
@@ -518,14 +543,13 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
|
|
518 |
# Check silence
|
519 |
silence_duration = session.silence_detector.update(decoded_audio)
|
520 |
|
521 |
-
# Stream to STT if available
|
522 |
if session.stt_manager and session.state == ConversationState.LISTENING:
|
523 |
# Ensure streaming is active
|
524 |
if not session.is_streaming:
|
525 |
-
log_warning(f"⚠️ STT
|
526 |
-
|
527 |
-
|
528 |
-
if not stt_initialized:
|
529 |
await websocket.send_json({
|
530 |
"type": "error",
|
531 |
"error_type": "stt_error",
|
@@ -534,9 +558,7 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
|
|
534 |
return
|
535 |
|
536 |
try:
|
537 |
-
# Chunk counter
|
538 |
-
if not hasattr(session, 'chunk_counter'):
|
539 |
-
session.chunk_counter = 0
|
540 |
session.chunk_counter += 1
|
541 |
|
542 |
if session.chunk_counter == 1:
|
@@ -560,9 +582,12 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
|
|
560 |
|
561 |
session.current_transcription = result.text
|
562 |
|
563 |
-
# Final transcription geldiğinde
|
564 |
if session.current_transcription:
|
565 |
-
#
|
|
|
|
|
|
|
566 |
await session.change_state(ConversationState.PROCESSING_STT)
|
567 |
await websocket.send_json({
|
568 |
"type": "state_change",
|
@@ -576,31 +601,31 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
|
|
576 |
# Reset for new utterance
|
577 |
await session.reset_for_new_utterance()
|
578 |
return
|
579 |
-
|
580 |
-
# Interim result'ları artık göndermiyoruz ve loglama yapmıyoruz
|
581 |
|
582 |
except Exception as e:
|
583 |
-
|
584 |
-
|
585 |
-
|
586 |
-
"
|
587 |
-
|
588 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
589 |
|
590 |
except Exception as e:
|
591 |
-
|
592 |
-
|
593 |
-
|
594 |
-
|
595 |
-
|
596 |
-
|
597 |
-
else:
|
598 |
-
log_error(f"❌ STT streaming error", error=error_msg, traceback=traceback.format_exc(), session_id=session.session.session_id)
|
599 |
-
await websocket.send_json({
|
600 |
-
"type": "error",
|
601 |
-
"error_type": "stt_error",
|
602 |
-
"message": f"STT error: {str(e)}"
|
603 |
-
})
|
604 |
|
605 |
async def handle_control_message(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
|
606 |
"""Handle control messages"""
|
@@ -617,7 +642,7 @@ async def handle_control_message(websocket: WebSocket, session: RealtimeSession,
|
|
617 |
"config": {
|
618 |
"silence_threshold_ms": session.silence_threshold_ms,
|
619 |
"audio_chunk_size": session.audio_chunk_size,
|
620 |
-
"supports_barge_in":
|
621 |
}
|
622 |
})
|
623 |
|
@@ -627,16 +652,13 @@ async def handle_control_message(websocket: WebSocket, session: RealtimeSession,
|
|
627 |
await websocket.close()
|
628 |
|
629 |
elif action == "interrupt":
|
630 |
-
#
|
631 |
-
|
632 |
-
await websocket.send_json({
|
633 |
-
"type": "control",
|
634 |
-
"action": "interrupt_acknowledged"
|
635 |
-
})
|
636 |
|
637 |
elif action == "reset":
|
638 |
# Reset conversation state
|
639 |
await session.reset_for_new_utterance()
|
|
|
640 |
await session.change_state(ConversationState.IDLE)
|
641 |
await websocket.send_json({
|
642 |
"type": "state_change",
|
@@ -647,6 +669,7 @@ async def handle_control_message(websocket: WebSocket, session: RealtimeSession,
|
|
647 |
elif action == "audio_ended":
|
648 |
# Audio playback ended on client
|
649 |
if session.state == ConversationState.PLAYING_AUDIO:
|
|
|
650 |
await session.change_state(ConversationState.LISTENING)
|
651 |
await websocket.send_json({
|
652 |
"type": "state_change",
|
@@ -655,21 +678,19 @@ async def handle_control_message(websocket: WebSocket, session: RealtimeSession,
|
|
655 |
})
|
656 |
# STT'yi yeniden başlat
|
657 |
await session.restart_stt_if_needed()
|
658 |
-
|
|
|
|
|
|
|
|
|
|
|
659 |
|
660 |
# ========================= PROCESSING FUNCTIONS =========================
|
661 |
async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|
662 |
"""Process complete user input"""
|
663 |
try:
|
664 |
-
# LLM işlemesi
|
665 |
-
|
666 |
-
log_info(f"⏸️ Pausing STT during LLM processing", session_id=session.session.session_id)
|
667 |
-
try:
|
668 |
-
await session.stt_manager.stop_streaming()
|
669 |
-
session.is_streaming = False
|
670 |
-
except Exception as e:
|
671 |
-
log_warning(f"⚠️ Error stopping STT: {e}", session_id=session.session.session_id)
|
672 |
-
session.is_streaming = False
|
673 |
|
674 |
# WebSocket aktif mi kontrol et
|
675 |
if not session.is_websocket_active:
|
@@ -677,9 +698,10 @@ async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|
|
677 |
|
678 |
user_text = session.current_transcription
|
679 |
if not user_text:
|
680 |
-
log_warning(f"⚠️ Empty transcription,
|
681 |
-
# Boş transcription
|
682 |
await session.change_state(ConversationState.LISTENING)
|
|
|
683 |
return
|
684 |
|
685 |
log_info(f"🎯 Processing user input", text=user_text, session_id=session.session.session_id)
|
@@ -740,18 +762,25 @@ async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|
|
740 |
|
741 |
log_info(f"🎵 Starting TTS generation for response", session_id=session.session.session_id)
|
742 |
|
743 |
-
# Generate TTS
|
744 |
-
|
745 |
-
|
746 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
747 |
|
748 |
-
try:
|
749 |
-
await tts_task
|
750 |
-
except asyncio.CancelledError:
|
751 |
-
log_info("⚡ TTS cancelled due to barge-in", session_id=session.session.session_id)
|
752 |
else:
|
753 |
log_info(f"⚠️ No TTS provider or WebSocket inactive", session_id=session.session.session_id)
|
754 |
-
# No TTS, go back to listening
|
755 |
await session.change_state(ConversationState.LISTENING)
|
756 |
if session.is_websocket_active:
|
757 |
await websocket.send_json({
|
@@ -759,6 +788,7 @@ async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|
|
759 |
"from": "processing_llm",
|
760 |
"to": "listening"
|
761 |
})
|
|
|
762 |
|
763 |
except Exception as e:
|
764 |
log_error(
|
@@ -773,8 +803,9 @@ async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|
|
773 |
"message": f"Processing error: {str(e)}"
|
774 |
})
|
775 |
await session.reset_for_new_utterance()
|
776 |
-
# Hata durumunda
|
777 |
await session.change_state(ConversationState.LISTENING)
|
|
|
778 |
|
779 |
async def generate_and_stream_tts(
|
780 |
websocket: WebSocket,
|
@@ -782,17 +813,10 @@ async def generate_and_stream_tts(
|
|
782 |
tts_provider,
|
783 |
text: str
|
784 |
):
|
785 |
-
"""Generate and stream TTS audio with
|
786 |
try:
|
787 |
-
# TTS başlamadan önce STT'
|
788 |
-
|
789 |
-
log_info(f"⏸️ Pausing STT stream during TTS", session_id=session.session.session_id)
|
790 |
-
try:
|
791 |
-
await session.stt_manager.stop_streaming()
|
792 |
-
session.is_streaming = False
|
793 |
-
except Exception as e:
|
794 |
-
log_warning(f"⚠️ Error stopping STT before TTS: {e}", session_id=session.session.session_id)
|
795 |
-
session.is_streaming = False
|
796 |
|
797 |
log_info(f"🎤 Starting TTS generation for text: '{text[:50]}...'", session_id=session.session.session_id)
|
798 |
|
@@ -841,11 +865,6 @@ async def generate_and_stream_tts(
|
|
841 |
log_info(f"📤 Streaming TTS audio: {len(audio_data)} bytes as {total_length} base64 chars in {total_chunks} chunks", session_id=session.session.session_id)
|
842 |
|
843 |
for i in range(0, total_length, chunk_size):
|
844 |
-
# Check for cancellation
|
845 |
-
if asyncio.current_task().cancelled():
|
846 |
-
log_info(f"⚡ Streaming cancelled at chunk {i//chunk_size}", session_id=session.session.session_id)
|
847 |
-
break
|
848 |
-
|
849 |
# WebSocket aktif mi kontrol et
|
850 |
if not session.is_websocket_active:
|
851 |
log_warning(f"⚠️ WebSocket inactive during streaming, stopping", session_id=session.session.session_id)
|
@@ -877,15 +896,9 @@ async def generate_and_stream_tts(
|
|
877 |
audio_size=len(audio_data),
|
878 |
chunks_sent=total_chunks
|
879 |
)
|
880 |
-
|
881 |
-
# TTS bitiminde STT'yi yeniden başlat
|
882 |
-
if session.state == ConversationState.LISTENING:
|
883 |
-
log_info(f"🔄 Restarting STT after TTS completion", session_id=session.session.session_id)
|
884 |
-
await session.restart_stt_if_needed()
|
885 |
|
886 |
-
|
887 |
-
|
888 |
-
raise
|
889 |
except Exception as e:
|
890 |
error_msg = str(e)
|
891 |
log_error(
|
@@ -910,11 +923,13 @@ async def generate_and_stream_tts(
|
|
910 |
"message": f"TTS error: {error_msg}"
|
911 |
})
|
912 |
|
913 |
-
# TTS hatası durumunda
|
914 |
-
await session.change_state(ConversationState.
|
915 |
if session.is_websocket_active:
|
916 |
await websocket.send_json({
|
917 |
"type": "state_change",
|
918 |
"from": "processing_tts",
|
919 |
-
"to": "
|
920 |
-
})
|
|
|
|
|
|
153 |
def __init__(self, session: Session):
|
154 |
self.session = session
|
155 |
self.state = ConversationState.IDLE
|
156 |
+
self.is_websocket_active = True
|
157 |
|
158 |
# Get settings from config
|
159 |
config = ConfigProvider.get().global_config.stt_provider.settings
|
|
|
178 |
self.audio_chunk_size = config.get("audio_chunk_size", DEFAULT_AUDIO_CHUNK_SIZE)
|
179 |
self.silence_threshold_ms = silence_threshold
|
180 |
|
181 |
+
# Chunk counter için attribute
|
182 |
+
self.chunk_counter = 0
|
183 |
+
|
184 |
async def initialize_stt(self):
|
185 |
"""Initialize STT provider"""
|
186 |
try:
|
187 |
+
# Her başlatmada chunk counter'ı sıfırla
|
188 |
+
self.chunk_counter = 0
|
189 |
+
|
190 |
self.stt_manager = STTFactory.create_provider()
|
191 |
if not self.stt_manager:
|
192 |
log_error("❌ STT manager is None - STTFactory.create_provider() returned None", session_id=self.session.session_id)
|
|
|
198 |
config = ConfigProvider.get().global_config.stt_provider.settings
|
199 |
|
200 |
# Get language from session locale
|
201 |
+
session_locale = getattr(self.session, 'locale', 'tr')
|
202 |
|
203 |
# Import LocaleManager to get proper locale tag
|
204 |
from locale_manager import LocaleManager
|
|
|
231 |
log_error(f"❌ Failed to initialize STT", error=str(e), traceback=traceback.format_exc(), session_id=self.session.session_id)
|
232 |
self.stt_manager = None
|
233 |
self.is_streaming = False
|
234 |
+
self.chunk_counter = 0
|
235 |
return False
|
236 |
|
237 |
async def restart_stt_if_needed(self):
|
238 |
+
"""Restart STT if it's not active"""
|
239 |
try:
|
240 |
+
# Sadece LISTENING state'inde ve WebSocket aktifse restart yap
|
241 |
if not self.is_streaming and self.is_websocket_active and self.state == ConversationState.LISTENING:
|
242 |
+
log_info(f"🔄 Restarting STT stream...", session_id=self.session.session_id)
|
243 |
+
|
244 |
+
# Önce mevcut stream'i temizle
|
245 |
+
await self.stop_stt_streaming()
|
246 |
|
247 |
+
# Sonra yeniden başlat
|
248 |
+
stt_initialized = await self.initialize_stt()
|
249 |
+
if stt_initialized:
|
250 |
+
log_info(f"✅ STT stream restarted successfully", session_id=self.session.session_id)
|
251 |
+
return True
|
252 |
+
else:
|
253 |
+
log_error(f"❌ Failed to restart STT stream", session_id=self.session.session_id)
|
254 |
+
return False
|
|
|
|
|
|
|
|
|
|
|
255 |
return True
|
256 |
except Exception as e:
|
257 |
log_error(f"❌ Error restarting STT", error=str(e), session_id=self.session.session_id)
|
258 |
return False
|
259 |
+
|
260 |
+
async def stop_stt_streaming(self):
|
261 |
+
"""Stop STT streaming completely"""
|
262 |
+
try:
|
263 |
+
if self.stt_manager and self.is_streaming:
|
264 |
+
log_info(f"🛑 Stopping STT stream", session_id=self.session.session_id)
|
265 |
+
await self.stt_manager.stop_streaming()
|
266 |
+
self.is_streaming = False
|
267 |
+
self.chunk_counter = 0
|
268 |
+
log_info(f"✅ STT stream stopped", session_id=self.session.session_id)
|
269 |
+
except Exception as e:
|
270 |
+
log_warning(f"⚠️ Error stopping STT stream: {e}", session_id=self.session.session_id)
|
271 |
+
self.is_streaming = False
|
272 |
|
273 |
async def change_state(self, new_state: ConversationState):
|
274 |
"""Change conversation state"""
|
|
|
282 |
|
283 |
async def handle_barge_in(self):
|
284 |
"""Handle user interruption"""
|
285 |
+
# Barge-in devre dışı - bu metod artık çağrılmamalı
|
286 |
+
log_warning(f"⚠️ Barge-in called but disabled", session_id=self.session.session_id)
|
287 |
+
return
|
288 |
|
289 |
async def reset_for_new_utterance(self):
|
290 |
"""Reset for new user utterance"""
|
291 |
await self.audio_buffer.clear()
|
292 |
self.silence_detector.reset()
|
293 |
self.current_transcription = ""
|
294 |
+
self.chunk_counter = 0 # Chunk counter'ı reset et
|
295 |
if hasattr(self, 'speech_started'):
|
296 |
+
delattr(self, 'speech_started')
|
297 |
+
log_info(f"🔄 Reset for new utterance complete", session_id=self.session.session_id)
|
298 |
|
299 |
async def cleanup(self):
|
300 |
"""Clean up resources"""
|
301 |
try:
|
302 |
+
self.is_websocket_active = False
|
303 |
+
await self.stop_stt_streaming() # STT'yi düzgün durdur
|
|
|
304 |
log_info(f"Cleaned up realtime session", session_id=self.session.session_id)
|
305 |
except Exception as e:
|
306 |
log_warning(f"Cleanup error", error=str(e), session_id=self.session.session_id)
|
|
|
501 |
|
502 |
# ========================= MESSAGE HANDLERS =========================
|
503 |
async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
|
504 |
+
"""Handle incoming audio chunk with sequential processing"""
|
505 |
try:
|
506 |
# WebSocket kapandıysa işlem yapma
|
507 |
if not session.is_websocket_active:
|
|
|
512 |
log_warning(f"⚠️ Empty audio chunk received", session_id=session.session.session_id)
|
513 |
return
|
514 |
|
515 |
+
# Barge-in devre dışı - TTS/audio playback sırasında audio chunk'ları işleme
|
516 |
+
if session.state in [ConversationState.PLAYING_AUDIO, ConversationState.PROCESSING_TTS,
|
517 |
+
ConversationState.PROCESSING_LLM, ConversationState.PROCESSING_STT]:
|
518 |
+
log_debug(f"🔇 Ignoring audio chunk during state: {session.state.value}", session_id=session.session.session_id)
|
519 |
+
return
|
|
|
|
|
|
|
520 |
|
521 |
# Change state to listening if idle
|
522 |
if session.state == ConversationState.IDLE:
|
|
|
526 |
"from": "idle",
|
527 |
"to": "listening"
|
528 |
})
|
529 |
+
# IDLE'dan LISTENING'e geçerken STT'yi başlat
|
530 |
+
if not session.is_streaming:
|
531 |
+
await session.restart_stt_if_needed()
|
532 |
|
533 |
+
# LISTENING state'inde değilse audio işleme
|
534 |
+
if session.state != ConversationState.LISTENING:
|
535 |
+
return
|
536 |
+
|
537 |
# Add to buffer
|
538 |
await session.audio_buffer.add_chunk(audio_data)
|
539 |
|
|
|
543 |
# Check silence
|
544 |
silence_duration = session.silence_detector.update(decoded_audio)
|
545 |
|
546 |
+
# Stream to STT if available and in LISTENING state
|
547 |
if session.stt_manager and session.state == ConversationState.LISTENING:
|
548 |
# Ensure streaming is active
|
549 |
if not session.is_streaming:
|
550 |
+
log_warning(f"⚠️ STT not streaming, attempting to restart", session_id=session.session.session_id)
|
551 |
+
restart_success = await session.restart_stt_if_needed()
|
552 |
+
if not restart_success:
|
|
|
553 |
await websocket.send_json({
|
554 |
"type": "error",
|
555 |
"error_type": "stt_error",
|
|
|
558 |
return
|
559 |
|
560 |
try:
|
561 |
+
# Chunk counter artır
|
|
|
|
|
562 |
session.chunk_counter += 1
|
563 |
|
564 |
if session.chunk_counter == 1:
|
|
|
582 |
|
583 |
session.current_transcription = result.text
|
584 |
|
585 |
+
# Final transcription geldiğinde STT'yi durdur ve işle
|
586 |
if session.current_transcription:
|
587 |
+
# Önce STT'yi durdur
|
588 |
+
await session.stop_stt_streaming()
|
589 |
+
|
590 |
+
# State'i değiştir
|
591 |
await session.change_state(ConversationState.PROCESSING_STT)
|
592 |
await websocket.send_json({
|
593 |
"type": "state_change",
|
|
|
601 |
# Reset for new utterance
|
602 |
await session.reset_for_new_utterance()
|
603 |
return
|
|
|
|
|
604 |
|
605 |
except Exception as e:
|
606 |
+
error_msg = str(e)
|
607 |
+
# Google STT timeout hatası kontrolü
|
608 |
+
if "Audio Timeout Error" in error_msg or "stream duration" in error_msg or "Exceeded maximum allowed stream duration" in error_msg:
|
609 |
+
log_warning(f"⚠️ STT timeout detected, restarting stream", session_id=session.session.session_id)
|
610 |
+
session.is_streaming = False
|
611 |
+
session.chunk_counter = 0
|
612 |
+
# Timeout durumunda yeniden başlat
|
613 |
+
await session.restart_stt_if_needed()
|
614 |
+
else:
|
615 |
+
log_error(f"❌ STT streaming error", error=error_msg, traceback=traceback.format_exc(), session_id=session.session.session_id)
|
616 |
+
await websocket.send_json({
|
617 |
+
"type": "error",
|
618 |
+
"error_type": "stt_error",
|
619 |
+
"message": f"STT error: {str(e)}"
|
620 |
+
})
|
621 |
|
622 |
except Exception as e:
|
623 |
+
log_error(f"❌ Error in handle_audio_chunk", error=str(e), traceback=traceback.format_exc(), session_id=session.session.session_id)
|
624 |
+
await websocket.send_json({
|
625 |
+
"type": "error",
|
626 |
+
"error_type": "audio_error",
|
627 |
+
"message": f"Audio processing error: {str(e)}"
|
628 |
+
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
629 |
|
630 |
async def handle_control_message(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
|
631 |
"""Handle control messages"""
|
|
|
642 |
"config": {
|
643 |
"silence_threshold_ms": session.silence_threshold_ms,
|
644 |
"audio_chunk_size": session.audio_chunk_size,
|
645 |
+
"supports_barge_in": False # Barge-in devre dışı
|
646 |
}
|
647 |
})
|
648 |
|
|
|
652 |
await websocket.close()
|
653 |
|
654 |
elif action == "interrupt":
|
655 |
+
# Barge-in devre dışı - ignore
|
656 |
+
log_warning(f"⚠️ Interrupt request ignored (barge-in disabled)", session_id=session.session.session_id)
|
|
|
|
|
|
|
|
|
657 |
|
658 |
elif action == "reset":
|
659 |
# Reset conversation state
|
660 |
await session.reset_for_new_utterance()
|
661 |
+
await session.stop_stt_streaming()
|
662 |
await session.change_state(ConversationState.IDLE)
|
663 |
await websocket.send_json({
|
664 |
"type": "state_change",
|
|
|
669 |
elif action == "audio_ended":
|
670 |
# Audio playback ended on client
|
671 |
if session.state == ConversationState.PLAYING_AUDIO:
|
672 |
+
log_info(f"🎵 Client reported audio ended", session_id=session.session.session_id)
|
673 |
await session.change_state(ConversationState.LISTENING)
|
674 |
await websocket.send_json({
|
675 |
"type": "state_change",
|
|
|
678 |
})
|
679 |
# STT'yi yeniden başlat
|
680 |
await session.restart_stt_if_needed()
|
681 |
+
|
682 |
+
elif action == "restart_stt":
|
683 |
+
# Manual STT restart request
|
684 |
+
log_info(f"🔄 Manual STT restart requested", session_id=session.session.session_id)
|
685 |
+
await session.stop_stt_streaming()
|
686 |
+
await session.restart_stt_if_needed()
|
687 |
|
688 |
# ========================= PROCESSING FUNCTIONS =========================
|
689 |
async def process_user_input(websocket: WebSocket, session: RealtimeSession):
|
690 |
"""Process complete user input"""
|
691 |
try:
|
692 |
+
# LLM işlemesi başlamadan önce STT'nin tamamen durduğundan emin ol
|
693 |
+
await session.stop_stt_streaming()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
694 |
|
695 |
# WebSocket aktif mi kontrol et
|
696 |
if not session.is_websocket_active:
|
|
|
698 |
|
699 |
user_text = session.current_transcription
|
700 |
if not user_text:
|
701 |
+
log_warning(f"⚠️ Empty transcription, returning to listening", session_id=session.session.session_id)
|
702 |
+
# Boş transcription durumunda listening'e dön ve STT'yi yeniden başlat
|
703 |
await session.change_state(ConversationState.LISTENING)
|
704 |
+
await session.restart_stt_if_needed()
|
705 |
return
|
706 |
|
707 |
log_info(f"🎯 Processing user input", text=user_text, session_id=session.session.session_id)
|
|
|
762 |
|
763 |
log_info(f"🎵 Starting TTS generation for response", session_id=session.session.session_id)
|
764 |
|
765 |
+
# Generate TTS (barge-in devre dışı)
|
766 |
+
await generate_and_stream_tts(websocket, session, tts_provider, response_text)
|
767 |
+
|
768 |
+
# TTS bittikten sonra LISTENING state'ine geç
|
769 |
+
await session.change_state(ConversationState.LISTENING)
|
770 |
+
if session.is_websocket_active:
|
771 |
+
await websocket.send_json({
|
772 |
+
"type": "state_change",
|
773 |
+
"from": "playing_audio",
|
774 |
+
"to": "listening"
|
775 |
+
})
|
776 |
+
|
777 |
+
# STT'yi yeniden başlat
|
778 |
+
log_info(f"🔄 Restarting STT after TTS completion", session_id=session.session.session_id)
|
779 |
+
await session.restart_stt_if_needed()
|
780 |
|
|
|
|
|
|
|
|
|
781 |
else:
|
782 |
log_info(f"⚠️ No TTS provider or WebSocket inactive", session_id=session.session.session_id)
|
783 |
+
# No TTS, go back to listening and restart STT
|
784 |
await session.change_state(ConversationState.LISTENING)
|
785 |
if session.is_websocket_active:
|
786 |
await websocket.send_json({
|
|
|
788 |
"from": "processing_llm",
|
789 |
"to": "listening"
|
790 |
})
|
791 |
+
await session.restart_stt_if_needed()
|
792 |
|
793 |
except Exception as e:
|
794 |
log_error(
|
|
|
803 |
"message": f"Processing error: {str(e)}"
|
804 |
})
|
805 |
await session.reset_for_new_utterance()
|
806 |
+
# Hata durumunda listening'e dön ve STT'yi yeniden başlat
|
807 |
await session.change_state(ConversationState.LISTENING)
|
808 |
+
await session.restart_stt_if_needed()
|
809 |
|
810 |
async def generate_and_stream_tts(
|
811 |
websocket: WebSocket,
|
|
|
813 |
tts_provider,
|
814 |
text: str
|
815 |
):
|
816 |
+
"""Generate and stream TTS audio with sequential processing"""
|
817 |
try:
|
818 |
+
# TTS başlamadan önce STT'nin tamamen durduğundan emin ol
|
819 |
+
await session.stop_stt_streaming()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
820 |
|
821 |
log_info(f"🎤 Starting TTS generation for text: '{text[:50]}...'", session_id=session.session.session_id)
|
822 |
|
|
|
865 |
log_info(f"📤 Streaming TTS audio: {len(audio_data)} bytes as {total_length} base64 chars in {total_chunks} chunks", session_id=session.session.session_id)
|
866 |
|
867 |
for i in range(0, total_length, chunk_size):
|
|
|
|
|
|
|
|
|
|
|
868 |
# WebSocket aktif mi kontrol et
|
869 |
if not session.is_websocket_active:
|
870 |
log_warning(f"⚠️ WebSocket inactive during streaming, stopping", session_id=session.session.session_id)
|
|
|
896 |
audio_size=len(audio_data),
|
897 |
chunks_sent=total_chunks
|
898 |
)
|
|
|
|
|
|
|
|
|
|
|
899 |
|
900 |
+
# TTS bitimi - state değişimi process_user_input'ta yapılacak
|
901 |
+
|
|
|
902 |
except Exception as e:
|
903 |
error_msg = str(e)
|
904 |
log_error(
|
|
|
923 |
"message": f"TTS error: {error_msg}"
|
924 |
})
|
925 |
|
926 |
+
# TTS hatası durumunda listening'e dön
|
927 |
+
await session.change_state(ConversationState.LISTENING)
|
928 |
if session.is_websocket_active:
|
929 |
await websocket.send_json({
|
930 |
"type": "state_change",
|
931 |
"from": "processing_tts",
|
932 |
+
"to": "listening"
|
933 |
+
})
|
934 |
+
# STT'yi yeniden başlat
|
935 |
+
await session.restart_stt_if_needed()
|