ciyidogan committed on
Commit
872bc77
·
verified ·
1 Parent(s): c59f476

Update websocket_handler.py

Browse files
Files changed (1) hide show
  1. websocket_handler.py +134 -119
websocket_handler.py CHANGED
@@ -153,7 +153,7 @@ class RealtimeSession:
153
  def __init__(self, session: Session):
154
  self.session = session
155
  self.state = ConversationState.IDLE
156
- self.is_websocket_active = True # Yeni flag
157
 
158
  # Get settings from config
159
  config = ConfigProvider.get().global_config.stt_provider.settings
@@ -178,9 +178,15 @@ class RealtimeSession:
178
  self.audio_chunk_size = config.get("audio_chunk_size", DEFAULT_AUDIO_CHUNK_SIZE)
179
  self.silence_threshold_ms = silence_threshold
180
 
 
 
 
181
  async def initialize_stt(self):
182
  """Initialize STT provider"""
183
  try:
 
 
 
184
  self.stt_manager = STTFactory.create_provider()
185
  if not self.stt_manager:
186
  log_error("❌ STT manager is None - STTFactory.create_provider() returned None", session_id=self.session.session_id)
@@ -192,7 +198,7 @@ class RealtimeSession:
192
  config = ConfigProvider.get().global_config.stt_provider.settings
193
 
194
  # Get language from session locale
195
- session_locale = getattr(self.session, 'locale', 'tr') # Default to 'tr' if not set
196
 
197
  # Import LocaleManager to get proper locale tag
198
  from locale_manager import LocaleManager
@@ -225,31 +231,44 @@ class RealtimeSession:
225
  log_error(f"❌ Failed to initialize STT", error=str(e), traceback=traceback.format_exc(), session_id=self.session.session_id)
226
  self.stt_manager = None
227
  self.is_streaming = False
 
228
  return False
229
 
230
  async def restart_stt_if_needed(self):
231
- """Restart STT if it's not active - sadece gerektiğinde"""
232
  try:
 
233
  if not self.is_streaming and self.is_websocket_active and self.state == ConversationState.LISTENING:
234
- log_info(f"🔄 Restarting STT stream after timeout...", session_id=self.session.session_id)
 
 
 
235
 
236
- # Mevcut STT manager'ı kullan
237
- if self.stt_manager:
238
- # Yeniden başlat
239
- stt_initialized = await self.initialize_stt()
240
- if stt_initialized:
241
- log_info(f"✅ STT stream restarted successfully", session_id=self.session.session_id)
242
- # Reset chunk counter
243
- if hasattr(self, 'chunk_counter'):
244
- self.chunk_counter = 0
245
- return True
246
- else:
247
- log_error(f"❌ Failed to restart STT stream", session_id=self.session.session_id)
248
- return False
249
  return True
250
  except Exception as e:
251
  log_error(f"❌ Error restarting STT", error=str(e), session_id=self.session.session_id)
252
  return False
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
  async def change_state(self, new_state: ConversationState):
255
  """Change conversation state"""
@@ -263,23 +282,25 @@ class RealtimeSession:
263
 
264
  async def handle_barge_in(self):
265
  """Handle user interruption"""
266
- await self.barge_in_handler.handle_interruption(self.state)
267
- await self.change_state(ConversationState.LISTENING)
 
268
 
269
  async def reset_for_new_utterance(self):
270
  """Reset for new user utterance"""
271
  await self.audio_buffer.clear()
272
  self.silence_detector.reset()
273
  self.current_transcription = ""
 
274
  if hasattr(self, 'speech_started'):
275
- delattr(self, 'speech_started') # Speech started flag'ini sıfırla
 
276
 
277
  async def cleanup(self):
278
  """Clean up resources"""
279
  try:
280
- self.is_websocket_active = False # WebSocket kapanıyor
281
- if self.stt_manager:
282
- await self.stt_manager.stop_streaming()
283
  log_info(f"Cleaned up realtime session", session_id=self.session.session_id)
284
  except Exception as e:
285
  log_warning(f"Cleanup error", error=str(e), session_id=self.session.session_id)
@@ -480,7 +501,7 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
480
 
481
  # ========================= MESSAGE HANDLERS =========================
482
  async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
483
- """Handle incoming audio chunk with barge-in support"""
484
  try:
485
  # WebSocket kapandıysa işlem yapma
486
  if not session.is_websocket_active:
@@ -491,14 +512,11 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
491
  log_warning(f"⚠️ Empty audio chunk received", session_id=session.session.session_id)
492
  return
493
 
494
- # Check for barge-in during TTS/audio playback
495
- if session.state in [ConversationState.PLAYING_AUDIO, ConversationState.PROCESSING_TTS]:
496
- await session.handle_barge_in()
497
- await websocket.send_json({
498
- "type": "control",
499
- "action": "stop_playback"
500
- })
501
- log_info(f"🛑 Barge-in detected", session_id=session.session.session_id, state=session.state.value)
502
 
503
  # Change state to listening if idle
504
  if session.state == ConversationState.IDLE:
@@ -508,7 +526,14 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
508
  "from": "idle",
509
  "to": "listening"
510
  })
 
 
 
511
 
 
 
 
 
512
  # Add to buffer
513
  await session.audio_buffer.add_chunk(audio_data)
514
 
@@ -518,14 +543,13 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
518
  # Check silence
519
  silence_duration = session.silence_detector.update(decoded_audio)
520
 
521
- # Stream to STT if available
522
  if session.stt_manager and session.state == ConversationState.LISTENING:
523
  # Ensure streaming is active
524
  if not session.is_streaming:
525
- log_warning(f"⚠️ STT manager exists but streaming not active", session_id=session.session.session_id)
526
- # Try to restart streaming
527
- stt_initialized = await session.initialize_stt()
528
- if not stt_initialized:
529
  await websocket.send_json({
530
  "type": "error",
531
  "error_type": "stt_error",
@@ -534,9 +558,7 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
534
  return
535
 
536
  try:
537
- # Chunk counter
538
- if not hasattr(session, 'chunk_counter'):
539
- session.chunk_counter = 0
540
  session.chunk_counter += 1
541
 
542
  if session.chunk_counter == 1:
@@ -560,9 +582,12 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
560
 
561
  session.current_transcription = result.text
562
 
563
- # Final transcription geldiğinde hemen işle
564
  if session.current_transcription:
565
- # State'i değiştir ve user input'u işle
 
 
 
566
  await session.change_state(ConversationState.PROCESSING_STT)
567
  await websocket.send_json({
568
  "type": "state_change",
@@ -576,31 +601,31 @@ async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, mes
576
  # Reset for new utterance
577
  await session.reset_for_new_utterance()
578
  return
579
-
580
- # Interim result'ları artık göndermiyoruz ve loglama yapmıyoruz
581
 
582
  except Exception as e:
583
- log_error(f"❌ STT streaming error", error=str(e), traceback=traceback.format_exc(), session_id=session.session.session_id)
584
- await websocket.send_json({
585
- "type": "error",
586
- "error_type": "stt_error",
587
- "message": f"STT error: {str(e)}"
588
- })
 
 
 
 
 
 
 
 
 
589
 
590
  except Exception as e:
591
- error_msg = str(e)
592
- # Google STT timeout hatası kontrolü
593
- if "Audio Timeout Error" in error_msg or "stream duration" in error_msg:
594
- log_warning(f"⚠️ STT timeout detected, marking stream as inactive", session_id=session.session.session_id)
595
- session.is_streaming = False
596
- # Timeout durumunda frontend'e hata gönderme, sessizce handle et
597
- else:
598
- log_error(f"❌ STT streaming error", error=error_msg, traceback=traceback.format_exc(), session_id=session.session.session_id)
599
- await websocket.send_json({
600
- "type": "error",
601
- "error_type": "stt_error",
602
- "message": f"STT error: {str(e)}"
603
- })
604
 
605
  async def handle_control_message(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
606
  """Handle control messages"""
@@ -617,7 +642,7 @@ async def handle_control_message(websocket: WebSocket, session: RealtimeSession,
617
  "config": {
618
  "silence_threshold_ms": session.silence_threshold_ms,
619
  "audio_chunk_size": session.audio_chunk_size,
620
- "supports_barge_in": True
621
  }
622
  })
623
 
@@ -627,16 +652,13 @@ async def handle_control_message(websocket: WebSocket, session: RealtimeSession,
627
  await websocket.close()
628
 
629
  elif action == "interrupt":
630
- # Handle explicit interrupt
631
- await session.handle_barge_in()
632
- await websocket.send_json({
633
- "type": "control",
634
- "action": "interrupt_acknowledged"
635
- })
636
 
637
  elif action == "reset":
638
  # Reset conversation state
639
  await session.reset_for_new_utterance()
 
640
  await session.change_state(ConversationState.IDLE)
641
  await websocket.send_json({
642
  "type": "state_change",
@@ -647,6 +669,7 @@ async def handle_control_message(websocket: WebSocket, session: RealtimeSession,
647
  elif action == "audio_ended":
648
  # Audio playback ended on client
649
  if session.state == ConversationState.PLAYING_AUDIO:
 
650
  await session.change_state(ConversationState.LISTENING)
651
  await websocket.send_json({
652
  "type": "state_change",
@@ -655,21 +678,19 @@ async def handle_control_message(websocket: WebSocket, session: RealtimeSession,
655
  })
656
  # STT'yi yeniden başlat
657
  await session.restart_stt_if_needed()
658
-
 
 
 
 
 
659
 
660
  # ========================= PROCESSING FUNCTIONS =========================
661
  async def process_user_input(websocket: WebSocket, session: RealtimeSession):
662
  """Process complete user input"""
663
  try:
664
- # LLM işlemesi sırasında STT'yi durdur
665
- if session.stt_manager and session.is_streaming:
666
- log_info(f"⏸️ Pausing STT during LLM processing", session_id=session.session.session_id)
667
- try:
668
- await session.stt_manager.stop_streaming()
669
- session.is_streaming = False
670
- except Exception as e:
671
- log_warning(f"⚠️ Error stopping STT: {e}", session_id=session.session.session_id)
672
- session.is_streaming = False
673
 
674
  # WebSocket aktif mi kontrol et
675
  if not session.is_websocket_active:
@@ -677,9 +698,10 @@ async def process_user_input(websocket: WebSocket, session: RealtimeSession):
677
 
678
  user_text = session.current_transcription
679
  if not user_text:
680
- log_warning(f"⚠️ Empty transcription, continuing listening", session_id=session.session.session_id)
681
- # Boş transcription'da bile listening'de kal
682
  await session.change_state(ConversationState.LISTENING)
 
683
  return
684
 
685
  log_info(f"🎯 Processing user input", text=user_text, session_id=session.session.session_id)
@@ -740,18 +762,25 @@ async def process_user_input(websocket: WebSocket, session: RealtimeSession):
740
 
741
  log_info(f"🎵 Starting TTS generation for response", session_id=session.session.session_id)
742
 
743
- # Generate TTS with barge-in support
744
- tts_task = await session.barge_in_handler.start_tts_task(
745
- generate_and_stream_tts(websocket, session, tts_provider, response_text)
746
- )
 
 
 
 
 
 
 
 
 
 
 
747
 
748
- try:
749
- await tts_task
750
- except asyncio.CancelledError:
751
- log_info("⚡ TTS cancelled due to barge-in", session_id=session.session.session_id)
752
  else:
753
  log_info(f"⚠️ No TTS provider or WebSocket inactive", session_id=session.session.session_id)
754
- # No TTS, go back to listening
755
  await session.change_state(ConversationState.LISTENING)
756
  if session.is_websocket_active:
757
  await websocket.send_json({
@@ -759,6 +788,7 @@ async def process_user_input(websocket: WebSocket, session: RealtimeSession):
759
  "from": "processing_llm",
760
  "to": "listening"
761
  })
 
762
 
763
  except Exception as e:
764
  log_error(
@@ -773,8 +803,9 @@ async def process_user_input(websocket: WebSocket, session: RealtimeSession):
773
  "message": f"Processing error: {str(e)}"
774
  })
775
  await session.reset_for_new_utterance()
776
- # Hata durumunda bile listening'e dön
777
  await session.change_state(ConversationState.LISTENING)
 
778
 
779
  async def generate_and_stream_tts(
780
  websocket: WebSocket,
@@ -782,17 +813,10 @@ async def generate_and_stream_tts(
782
  tts_provider,
783
  text: str
784
  ):
785
- """Generate and stream TTS audio with cancellation support"""
786
  try:
787
- # TTS başlamadan önce STT'yi durdur - timeout'u önle
788
- if session.stt_manager and session.is_streaming:
789
- log_info(f"⏸️ Pausing STT stream during TTS", session_id=session.session.session_id)
790
- try:
791
- await session.stt_manager.stop_streaming()
792
- session.is_streaming = False
793
- except Exception as e:
794
- log_warning(f"⚠️ Error stopping STT before TTS: {e}", session_id=session.session.session_id)
795
- session.is_streaming = False
796
 
797
  log_info(f"🎤 Starting TTS generation for text: '{text[:50]}...'", session_id=session.session.session_id)
798
 
@@ -841,11 +865,6 @@ async def generate_and_stream_tts(
841
  log_info(f"📤 Streaming TTS audio: {len(audio_data)} bytes as {total_length} base64 chars in {total_chunks} chunks", session_id=session.session.session_id)
842
 
843
  for i in range(0, total_length, chunk_size):
844
- # Check for cancellation
845
- if asyncio.current_task().cancelled():
846
- log_info(f"⚡ Streaming cancelled at chunk {i//chunk_size}", session_id=session.session.session_id)
847
- break
848
-
849
  # WebSocket aktif mi kontrol et
850
  if not session.is_websocket_active:
851
  log_warning(f"⚠️ WebSocket inactive during streaming, stopping", session_id=session.session.session_id)
@@ -877,15 +896,9 @@ async def generate_and_stream_tts(
877
  audio_size=len(audio_data),
878
  chunks_sent=total_chunks
879
  )
880
-
881
- # TTS bitiminde STT'yi yeniden başlat
882
- if session.state == ConversationState.LISTENING:
883
- log_info(f"🔄 Restarting STT after TTS completion", session_id=session.session.session_id)
884
- await session.restart_stt_if_needed()
885
 
886
- except asyncio.CancelledError:
887
- log_info("🛑 TTS streaming cancelled", session_id=session.session.session_id)
888
- raise
889
  except Exception as e:
890
  error_msg = str(e)
891
  log_error(
@@ -910,11 +923,13 @@ async def generate_and_stream_tts(
910
  "message": f"TTS error: {error_msg}"
911
  })
912
 
913
- # TTS hatası durumunda idle'a dön
914
- await session.change_state(ConversationState.IDLE)
915
  if session.is_websocket_active:
916
  await websocket.send_json({
917
  "type": "state_change",
918
  "from": "processing_tts",
919
- "to": "idle"
920
- })
 
 
 
153
  def __init__(self, session: Session):
154
  self.session = session
155
  self.state = ConversationState.IDLE
156
+ self.is_websocket_active = True
157
 
158
  # Get settings from config
159
  config = ConfigProvider.get().global_config.stt_provider.settings
 
178
  self.audio_chunk_size = config.get("audio_chunk_size", DEFAULT_AUDIO_CHUNK_SIZE)
179
  self.silence_threshold_ms = silence_threshold
180
 
181
+ # Chunk counter için attribute
182
+ self.chunk_counter = 0
183
+
184
  async def initialize_stt(self):
185
  """Initialize STT provider"""
186
  try:
187
+ # Her başlatmada chunk counter'ı sıfırla
188
+ self.chunk_counter = 0
189
+
190
  self.stt_manager = STTFactory.create_provider()
191
  if not self.stt_manager:
192
  log_error("❌ STT manager is None - STTFactory.create_provider() returned None", session_id=self.session.session_id)
 
198
  config = ConfigProvider.get().global_config.stt_provider.settings
199
 
200
  # Get language from session locale
201
+ session_locale = getattr(self.session, 'locale', 'tr')
202
 
203
  # Import LocaleManager to get proper locale tag
204
  from locale_manager import LocaleManager
 
231
  log_error(f"❌ Failed to initialize STT", error=str(e), traceback=traceback.format_exc(), session_id=self.session.session_id)
232
  self.stt_manager = None
233
  self.is_streaming = False
234
+ self.chunk_counter = 0
235
  return False
236
 
237
  async def restart_stt_if_needed(self):
238
+ """Restart STT if it's not active"""
239
  try:
240
+ # Sadece LISTENING state'inde ve WebSocket aktifse restart yap
241
  if not self.is_streaming and self.is_websocket_active and self.state == ConversationState.LISTENING:
242
+ log_info(f"🔄 Restarting STT stream...", session_id=self.session.session_id)
243
+
244
+ # Önce mevcut stream'i temizle
245
+ await self.stop_stt_streaming()
246
 
247
+ # Sonra yeniden başlat
248
+ stt_initialized = await self.initialize_stt()
249
+ if stt_initialized:
250
+ log_info(f"✅ STT stream restarted successfully", session_id=self.session.session_id)
251
+ return True
252
+ else:
253
+ log_error(f"❌ Failed to restart STT stream", session_id=self.session.session_id)
254
+ return False
 
 
 
 
 
255
  return True
256
  except Exception as e:
257
  log_error(f"❌ Error restarting STT", error=str(e), session_id=self.session.session_id)
258
  return False
259
+
260
+ async def stop_stt_streaming(self):
261
+ """Stop STT streaming completely"""
262
+ try:
263
+ if self.stt_manager and self.is_streaming:
264
+ log_info(f"🛑 Stopping STT stream", session_id=self.session.session_id)
265
+ await self.stt_manager.stop_streaming()
266
+ self.is_streaming = False
267
+ self.chunk_counter = 0
268
+ log_info(f"✅ STT stream stopped", session_id=self.session.session_id)
269
+ except Exception as e:
270
+ log_warning(f"⚠️ Error stopping STT stream: {e}", session_id=self.session.session_id)
271
+ self.is_streaming = False
272
 
273
  async def change_state(self, new_state: ConversationState):
274
  """Change conversation state"""
 
282
 
283
  async def handle_barge_in(self):
284
  """Handle user interruption"""
285
+ # Barge-in devre dışı - bu metod artık çağrılmamalı
286
+ log_warning(f"⚠️ Barge-in called but disabled", session_id=self.session.session_id)
287
+ return
288
 
289
  async def reset_for_new_utterance(self):
290
  """Reset for new user utterance"""
291
  await self.audio_buffer.clear()
292
  self.silence_detector.reset()
293
  self.current_transcription = ""
294
+ self.chunk_counter = 0 # Chunk counter'ı reset et
295
  if hasattr(self, 'speech_started'):
296
+ delattr(self, 'speech_started')
297
+ log_info(f"🔄 Reset for new utterance complete", session_id=self.session.session_id)
298
 
299
  async def cleanup(self):
300
  """Clean up resources"""
301
  try:
302
+ self.is_websocket_active = False
303
+ await self.stop_stt_streaming() # STT'yi düzgün durdur
 
304
  log_info(f"Cleaned up realtime session", session_id=self.session.session_id)
305
  except Exception as e:
306
  log_warning(f"Cleanup error", error=str(e), session_id=self.session.session_id)
 
501
 
502
  # ========================= MESSAGE HANDLERS =========================
503
  async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
504
+ """Handle incoming audio chunk with sequential processing"""
505
  try:
506
  # WebSocket kapandıysa işlem yapma
507
  if not session.is_websocket_active:
 
512
  log_warning(f"⚠️ Empty audio chunk received", session_id=session.session.session_id)
513
  return
514
 
515
+ # Barge-in devre dışı - TTS/audio playback sırasında audio chunk'ları işleme
516
+ if session.state in [ConversationState.PLAYING_AUDIO, ConversationState.PROCESSING_TTS,
517
+ ConversationState.PROCESSING_LLM, ConversationState.PROCESSING_STT]:
518
+ log_debug(f"🔇 Ignoring audio chunk during state: {session.state.value}", session_id=session.session.session_id)
519
+ return
 
 
 
520
 
521
  # Change state to listening if idle
522
  if session.state == ConversationState.IDLE:
 
526
  "from": "idle",
527
  "to": "listening"
528
  })
529
+ # IDLE'dan LISTENING'e geçerken STT'yi başlat
530
+ if not session.is_streaming:
531
+ await session.restart_stt_if_needed()
532
 
533
+ # LISTENING state'inde değilse audio işleme
534
+ if session.state != ConversationState.LISTENING:
535
+ return
536
+
537
  # Add to buffer
538
  await session.audio_buffer.add_chunk(audio_data)
539
 
 
543
  # Check silence
544
  silence_duration = session.silence_detector.update(decoded_audio)
545
 
546
+ # Stream to STT if available and in LISTENING state
547
  if session.stt_manager and session.state == ConversationState.LISTENING:
548
  # Ensure streaming is active
549
  if not session.is_streaming:
550
+ log_warning(f"⚠️ STT not streaming, attempting to restart", session_id=session.session.session_id)
551
+ restart_success = await session.restart_stt_if_needed()
552
+ if not restart_success:
 
553
  await websocket.send_json({
554
  "type": "error",
555
  "error_type": "stt_error",
 
558
  return
559
 
560
  try:
561
+ # Chunk counter artır
 
 
562
  session.chunk_counter += 1
563
 
564
  if session.chunk_counter == 1:
 
582
 
583
  session.current_transcription = result.text
584
 
585
+ # Final transcription geldiğinde STT'yi durdur ve işle
586
  if session.current_transcription:
587
+ # Önce STT'yi durdur
588
+ await session.stop_stt_streaming()
589
+
590
+ # State'i değiştir
591
  await session.change_state(ConversationState.PROCESSING_STT)
592
  await websocket.send_json({
593
  "type": "state_change",
 
601
  # Reset for new utterance
602
  await session.reset_for_new_utterance()
603
  return
 
 
604
 
605
  except Exception as e:
606
+ error_msg = str(e)
607
+ # Google STT timeout hatası kontrolü
608
+ if "Audio Timeout Error" in error_msg or "stream duration" in error_msg or "Exceeded maximum allowed stream duration" in error_msg:
609
+ log_warning(f"⚠️ STT timeout detected, restarting stream", session_id=session.session.session_id)
610
+ session.is_streaming = False
611
+ session.chunk_counter = 0
612
+ # Timeout durumunda yeniden başlat
613
+ await session.restart_stt_if_needed()
614
+ else:
615
+ log_error(f"❌ STT streaming error", error=error_msg, traceback=traceback.format_exc(), session_id=session.session.session_id)
616
+ await websocket.send_json({
617
+ "type": "error",
618
+ "error_type": "stt_error",
619
+ "message": f"STT error: {str(e)}"
620
+ })
621
 
622
  except Exception as e:
623
+ log_error(f"❌ Error in handle_audio_chunk", error=str(e), traceback=traceback.format_exc(), session_id=session.session.session_id)
624
+ await websocket.send_json({
625
+ "type": "error",
626
+ "error_type": "audio_error",
627
+ "message": f"Audio processing error: {str(e)}"
628
+ })
 
 
 
 
 
 
 
629
 
630
  async def handle_control_message(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
631
  """Handle control messages"""
 
642
  "config": {
643
  "silence_threshold_ms": session.silence_threshold_ms,
644
  "audio_chunk_size": session.audio_chunk_size,
645
+ "supports_barge_in": False # Barge-in devre dışı
646
  }
647
  })
648
 
 
652
  await websocket.close()
653
 
654
  elif action == "interrupt":
655
+ # Barge-in devre dışı - ignore
656
+ log_warning(f"⚠️ Interrupt request ignored (barge-in disabled)", session_id=session.session.session_id)
 
 
 
 
657
 
658
  elif action == "reset":
659
  # Reset conversation state
660
  await session.reset_for_new_utterance()
661
+ await session.stop_stt_streaming()
662
  await session.change_state(ConversationState.IDLE)
663
  await websocket.send_json({
664
  "type": "state_change",
 
669
  elif action == "audio_ended":
670
  # Audio playback ended on client
671
  if session.state == ConversationState.PLAYING_AUDIO:
672
+ log_info(f"🎵 Client reported audio ended", session_id=session.session.session_id)
673
  await session.change_state(ConversationState.LISTENING)
674
  await websocket.send_json({
675
  "type": "state_change",
 
678
  })
679
  # STT'yi yeniden başlat
680
  await session.restart_stt_if_needed()
681
+
682
+ elif action == "restart_stt":
683
+ # Manual STT restart request
684
+ log_info(f"🔄 Manual STT restart requested", session_id=session.session.session_id)
685
+ await session.stop_stt_streaming()
686
+ await session.restart_stt_if_needed()
687
 
688
  # ========================= PROCESSING FUNCTIONS =========================
689
  async def process_user_input(websocket: WebSocket, session: RealtimeSession):
690
  """Process complete user input"""
691
  try:
692
+ # LLM işlemesi başlamadan önce STT'nin tamamen durduğundan emin ol
693
+ await session.stop_stt_streaming()
 
 
 
 
 
 
 
694
 
695
  # WebSocket aktif mi kontrol et
696
  if not session.is_websocket_active:
 
698
 
699
  user_text = session.current_transcription
700
  if not user_text:
701
+ log_warning(f"⚠️ Empty transcription, returning to listening", session_id=session.session.session_id)
702
+ # Boş transcription durumunda listening'e dön ve STT'yi yeniden başlat
703
  await session.change_state(ConversationState.LISTENING)
704
+ await session.restart_stt_if_needed()
705
  return
706
 
707
  log_info(f"🎯 Processing user input", text=user_text, session_id=session.session.session_id)
 
762
 
763
  log_info(f"🎵 Starting TTS generation for response", session_id=session.session.session_id)
764
 
765
+ # Generate TTS (barge-in devre dışı)
766
+ await generate_and_stream_tts(websocket, session, tts_provider, response_text)
767
+
768
+ # TTS bittikten sonra LISTENING state'ine geç
769
+ await session.change_state(ConversationState.LISTENING)
770
+ if session.is_websocket_active:
771
+ await websocket.send_json({
772
+ "type": "state_change",
773
+ "from": "playing_audio",
774
+ "to": "listening"
775
+ })
776
+
777
+ # STT'yi yeniden başlat
778
+ log_info(f"🔄 Restarting STT after TTS completion", session_id=session.session.session_id)
779
+ await session.restart_stt_if_needed()
780
 
 
 
 
 
781
  else:
782
  log_info(f"⚠️ No TTS provider or WebSocket inactive", session_id=session.session.session_id)
783
+ # No TTS, go back to listening and restart STT
784
  await session.change_state(ConversationState.LISTENING)
785
  if session.is_websocket_active:
786
  await websocket.send_json({
 
788
  "from": "processing_llm",
789
  "to": "listening"
790
  })
791
+ await session.restart_stt_if_needed()
792
 
793
  except Exception as e:
794
  log_error(
 
803
  "message": f"Processing error: {str(e)}"
804
  })
805
  await session.reset_for_new_utterance()
806
+ # Hata durumunda listening'e dön ve STT'yi yeniden başlat
807
  await session.change_state(ConversationState.LISTENING)
808
+ await session.restart_stt_if_needed()
809
 
810
  async def generate_and_stream_tts(
811
  websocket: WebSocket,
 
813
  tts_provider,
814
  text: str
815
  ):
816
+ """Generate and stream TTS audio with sequential processing"""
817
  try:
818
+ # TTS başlamadan önce STT'nin tamamen durduğundan emin ol
819
+ await session.stop_stt_streaming()
 
 
 
 
 
 
 
820
 
821
  log_info(f"🎤 Starting TTS generation for text: '{text[:50]}...'", session_id=session.session.session_id)
822
 
 
865
  log_info(f"📤 Streaming TTS audio: {len(audio_data)} bytes as {total_length} base64 chars in {total_chunks} chunks", session_id=session.session.session_id)
866
 
867
  for i in range(0, total_length, chunk_size):
 
 
 
 
 
868
  # WebSocket aktif mi kontrol et
869
  if not session.is_websocket_active:
870
  log_warning(f"⚠️ WebSocket inactive during streaming, stopping", session_id=session.session.session_id)
 
896
  audio_size=len(audio_data),
897
  chunks_sent=total_chunks
898
  )
 
 
 
 
 
899
 
900
+ # TTS bitimi - state değişimi process_user_input'ta yapılacak
901
+
 
902
  except Exception as e:
903
  error_msg = str(e)
904
  log_error(
 
923
  "message": f"TTS error: {error_msg}"
924
  })
925
 
926
+ # TTS hatası durumunda listening'e dön
927
+ await session.change_state(ConversationState.LISTENING)
928
  if session.is_websocket_active:
929
  await websocket.send_json({
930
  "type": "state_change",
931
  "from": "processing_tts",
932
+ "to": "listening"
933
+ })
934
+ # STT'yi yeniden başlat
935
+ await session.restart_stt_if_needed()