ciyidogan committed on
Commit
7c6f660
·
verified ·
1 Parent(s): 9dd77ec

Delete websocket_handler.py

Browse files
Files changed (1) hide show
  1. websocket_handler.py +0 -1070
websocket_handler.py DELETED
@@ -1,1070 +0,0 @@
1
- """
2
- WebSocket Handler for Real-time STT/TTS with Barge-in Support
3
- """
4
- from fastapi import WebSocket, WebSocketDisconnect
5
- from typing import Dict, Any, Optional
6
- import json
7
- import asyncio
8
- import base64
9
- from datetime import datetime
10
- from collections import deque
11
- from enum import Enum
12
- import numpy as np
13
- import traceback
14
-
15
- from session import Session, session_store
16
- from config_provider import ConfigProvider
17
- from chat_handler import handle_new_message, handle_parameter_followup
18
- from stt_factory import STTFactory
19
- from tts_factory import TTSFactory
20
- from logger import log_info, log_error, log_debug, log_warning
21
-
22
- # ========================= CONSTANTS =========================
23
- # Default values - will be overridden by config
24
# Fallbacks used when the STT provider settings omit the corresponding key.
DEFAULT_SILENCE_THRESHOLD_MS = 2_000  # fallback for "speech_timeout_ms" (milliseconds)
DEFAULT_AUDIO_CHUNK_SIZE = 4_096  # fallback for "audio_chunk_size"
# Normalized RMS below this value is treated as silence.
# The original inline comment mentions 0.01 -- presumably an earlier value.
DEFAULT_ENERGY_THRESHOLD = 5e-4
DEFAULT_AUDIO_BUFFER_MAX_SIZE = 1_000  # maximum number of chunks kept in AudioBuffer
28
-
29
- # ========================= ENUMS =========================
30
class ConversationState(Enum):
    """Phases of a real-time conversation.

    The member values are the lowercase strings used in the ``state_change``
    messages exchanged with the client.
    """

    # Nothing in progress yet.
    IDLE = "idle"
    # Accepting microphone audio from the client.
    LISTENING = "listening"
    # A final transcription arrived and is being handed over.
    PROCESSING_STT = "processing_stt"
    # Waiting for the LLM response.
    PROCESSING_LLM = "processing_llm"
    # Synthesizing speech for the response.
    PROCESSING_TTS = "processing_tts"
    # The client is playing synthesized audio.
    PLAYING_AUDIO = "playing_audio"
37
-
38
- # ========================= CLASSES =========================
39
class AudioBuffer:
    """Bounded FIFO of decoded audio chunks, guarded by an asyncio lock.

    Backed by a ``deque`` with ``maxlen=max_size``: once full, appending a new
    chunk silently drops the oldest one. The lock serializes coroutines only.
    """

    def __init__(self, max_size: int = DEFAULT_AUDIO_BUFFER_MAX_SIZE):
        self.lock = asyncio.Lock()
        self.buffer = deque(maxlen=max_size)

    async def add_chunk(self, chunk_data: str):
        """Base64-decode ``chunk_data`` and append the raw bytes."""
        async with self.lock:
            raw_bytes = base64.b64decode(chunk_data)
            self.buffer.append(raw_bytes)

    async def get_all_audio(self) -> bytes:
        """Return every buffered chunk joined into one bytes object."""
        async with self.lock:
            joined = b''.join(self.buffer)
        return joined

    async def clear(self):
        """Drop all buffered chunks."""
        async with self.lock:
            self.buffer.clear()

    def size(self) -> int:
        """Return the number of chunks currently held (read without the lock)."""
        return len(self.buffer)
64
-
65
-
66
class SilenceDetector:
    """Measure how long the incoming audio has been continuously silent.

    A chunk counts as silent when its RMS, normalized by 32768, is below
    ``energy_threshold``. Chunks are interpreted as 16-bit PCM samples.
    """

    def __init__(self, threshold_ms: int = DEFAULT_SILENCE_THRESHOLD_MS, energy_threshold: float = DEFAULT_ENERGY_THRESHOLD):
        self.threshold_ms = threshold_ms
        self.energy_threshold = energy_threshold
        # Wall-clock time at which the current silent stretch began, or None.
        self.silence_start = None
        self.sample_rate = 16000

    def update(self, audio_chunk: bytes) -> int:
        """Feed one chunk and return the current silence duration in ms.

        Returns 0 (and forgets the silence start) as soon as a non-silent
        chunk is seen.
        """
        if not self.is_silence(audio_chunk):
            self.silence_start = None
            return 0

        if self.silence_start is None:
            self.silence_start = datetime.now()
        elapsed = datetime.now() - self.silence_start
        return int(elapsed.total_seconds() * 1000)

    def is_silence(self, audio_chunk: bytes) -> bool:
        """Return whether ``audio_chunk`` is below the energy threshold.

        Empty chunks are silent. Any error while analysing the chunk is
        logged and reported as "not silent".
        """
        try:
            if len(audio_chunk) == 0:
                return True

            # 16-bit samples need an even byte count; drop a trailing odd byte.
            if len(audio_chunk) % 2 != 0:
                audio_chunk = audio_chunk[:-1]

            samples = np.frombuffer(audio_chunk, dtype=np.int16)
            if len(samples) == 0:
                return True

            mean_square = np.mean(samples.astype(float) ** 2)
            normalized_rms = np.sqrt(mean_square) / 32768.0
            return normalized_rms < self.energy_threshold

        except Exception as e:
            log_warning(f"Silence detection error: {e}")
            return False

    def reset(self):
        """Forget any silent stretch currently being tracked."""
        self.silence_start = None
116
-
117
-
118
class BargeInHandler:
    """Track the active TTS task and cancel it when the user interrupts.

    Attributes:
        active_tts_task: Most recently started TTS task, or None.
        is_interrupting: True while ``handle_interruption`` is running.
        lock: asyncio lock serializing task start and interruption handling.
    """

    def __init__(self):
        self.active_tts_task: Optional[asyncio.Task] = None
        self.is_interrupting = False
        self.lock = asyncio.Lock()

    async def _cancel_active_task(self) -> bool:
        """Cancel the active TTS task, if it is still running, and wait for it.

        Must be called with ``self.lock`` held.

        Returns:
            True if a running task was cancelled, False if there was nothing
            to cancel.
        """
        task = self.active_tts_task
        if task is None or task.done():
            return False

        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
        except Exception as e:
            # The old task failed while shutting down. That must not prevent
            # the caller from continuing (e.g. starting the replacement task).
            log_warning(f"TTS task raised while being cancelled: {e}")
        return True

    async def start_tts_task(self, coro):
        """Schedule ``coro`` as the active TTS task, replacing any running one.

        Args:
            coro: Coroutine that performs the TTS work.

        Returns:
            The newly created ``asyncio.Task``.
        """
        async with self.lock:
            await self._cancel_active_task()
            self.active_tts_task = asyncio.create_task(coro)
            return self.active_tts_task

    async def handle_interruption(self, current_state: "ConversationState"):
        """Cancel the active TTS task in response to a barge-in.

        ``is_interrupting`` is set for the duration of the call plus a 0.5 s
        hold, and is always cleared afterwards, even if the wait itself is
        cancelled.

        Args:
            current_state: Conversation state at the time of the interruption.
                Accepted for interface compatibility; not used here.
        """
        async with self.lock:
            self.is_interrupting = True
            try:
                task = self.active_tts_task
                if task is not None and not task.done():
                    log_info("Barge-in: Cancelling active TTS")
                await self._cancel_active_task()

                # Keep the flag raised briefly before clearing it.
                await asyncio.sleep(0.5)
            finally:
                self.is_interrupting = False
157
-
158
-
159
class RealtimeSession:
    """State and STT lifecycle for one real-time conversation.

    Bundles the wrapped ``Session`` with the audio buffer, silence detector,
    barge-in handler and the currently active STT provider instance.
    """

    def __init__(self, session: Session):
        self.session = session
        self.state = ConversationState.IDLE
        self.is_websocket_active = True

        # STT provider settings; module-level defaults cover missing keys.
        config = ConfigProvider.get().global_config.stt_provider.settings

        silence_threshold = config.get("speech_timeout_ms", DEFAULT_SILENCE_THRESHOLD_MS)
        energy_threshold = config.get("energy_threshold", DEFAULT_ENERGY_THRESHOLD)
        buffer_max_size = config.get("audio_buffer_max_size", DEFAULT_AUDIO_BUFFER_MAX_SIZE)

        self.audio_buffer = AudioBuffer(max_size=buffer_max_size)
        self.silence_detector = SilenceDetector(
            threshold_ms=silence_threshold,
            energy_threshold=energy_threshold
        )
        self.barge_in_handler = BargeInHandler()
        self.stt_manager = None
        self.current_transcription = ""
        self.is_streaming = False
        self.lock = asyncio.Lock()

        # Kept so they can be reported to the client in "session_config".
        self.audio_chunk_size = config.get("audio_chunk_size", DEFAULT_AUDIO_CHUNK_SIZE)
        self.silence_threshold_ms = silence_threshold

        # Number of audio chunks forwarded to the current STT stream.
        self.chunk_counter = 0

        # STT session bookkeeping.
        self.stt_session_count = 0
        self.last_stt_stop_time = None

    def get_stt_language(self) -> str:
        """Return the STT language tag for the session locale (e.g. tr -> tr-TR).

        Falls back to locale ``'tr'`` when the session has no ``locale``
        attribute and to ``'tr-TR'`` when the locale data has no
        ``locale_tag``.
        """
        session_locale = getattr(self.session, 'locale', 'tr')

        # Imported lazily, as in the original code.
        from locale_manager import LocaleManager
        locale_data = LocaleManager.get_locale(session_locale)
        language_code = locale_data.get('locale_tag', 'tr-TR')

        log_info(f"🌍 Session locale: {session_locale}, STT language: {language_code}", session_id=self.session.session_id)
        return language_code

    async def create_stt_manager(self):
        """Create a fresh STT provider and store it in ``self.stt_manager``.

        Returns:
            The created provider, or None if the factory returned nothing.
            Callers are responsible for reporting the failure.
        """
        self.stt_manager = STTFactory.create_provider()
        if self.stt_manager:
            log_info(f"✅ STT manager created: {type(self.stt_manager).__name__}", session_id=self.session.session_id)
        return self.stt_manager

    async def initialize_stt(self):
        """Tear down any existing STT stream and start a new one.

        Returns:
            True when streaming was started, False on any failure (the
            failure is logged and the STT state is reset).
        """
        try:
            self.stt_session_count += 1
            log_info(f"🎤 Initializing STT session #{self.stt_session_count}", session_id=self.session.session_id)

            # Fully clean up whatever STT state exists first.
            await self.stop_stt_streaming()

            # Leave at least 0.5 s between the last stop and the new start.
            if self.last_stt_stop_time:
                elapsed = (datetime.now() - self.last_stt_stop_time).total_seconds()
                if elapsed < 0.5:
                    wait_time = 0.5 - elapsed
                    log_info(f"⏳ Waiting {wait_time:.2f}s for proper cleanup", session_id=self.session.session_id)
                    await asyncio.sleep(wait_time)

            # Start from a clean slate.
            self.chunk_counter = 0
            self.current_transcription = ""
            await self.audio_buffer.clear()
            self.silence_detector.reset()

            await self.create_stt_manager()
            if not self.stt_manager:
                log_error("❌ STT manager is None - STTFactory.create_provider() returned None", session_id=self.session.session_id)
                return False

            config = ConfigProvider.get().global_config.stt_provider.settings
            language_code = self.get_stt_language()

            # NOTE(review): restart_stt_if_needed() uses 'LINEAR16' here while
            # this path uses "WEBM_OPUS" -- confirm which encoding the client
            # actually sends.
            stt_config = {
                "language": language_code,
                "interim_results": config.get("interim_results", True),
                "single_utterance": False,  # keep listening continuously
                "enable_punctuation": config.get("enable_punctuation", True),
                "sample_rate": 16000,
                "encoding": "WEBM_OPUS"
            }

            log_info(f"🎤 Starting STT streaming with config: {stt_config}", session_id=self.session.session_id)

            await self.stt_manager.start_streaming(stt_config)
            self.is_streaming = True

            log_info("✅ STT streaming started successfully with clean state", session_id=self.session.session_id)
            return True

        except Exception as e:
            log_error("❌ Failed to initialize STT", error=str(e), traceback=traceback.format_exc(), session_id=self.session.session_id)
            # Clean up even when initialization failed half-way.
            await self.stop_stt_streaming()
            return False

    async def stop_stt_streaming(self):
        """Stop STT streaming and reset every STT-related field.

        Never raises: provider errors are logged as warnings, anything else
        is logged as an error and the fields are reset regardless.
        """
        try:
            log_info(f"🛑 Stopping STT session #{self.stt_session_count}", session_id=self.session.session_id)

            if self.stt_manager:
                if self.is_streaming:
                    try:
                        await self.stt_manager.stop_streaming()
                    except Exception as e:
                        log_warning(f"⚠️ Error during STT stop_streaming: {e}", session_id=self.session.session_id)

                # Drop the provider instance entirely.
                self.stt_manager = None

            self.is_streaming = False
            self.chunk_counter = 0

            await self.audio_buffer.clear()
            self.silence_detector.reset()

            # 'speech_started' is an optional attribute set elsewhere.
            if hasattr(self, 'speech_started'):
                delattr(self, 'speech_started')

            # Remembered so initialize_stt() can enforce its cool-down.
            self.last_stt_stop_time = datetime.now()

            log_info(f"✅ STT session #{self.stt_session_count} stopped and all data reset", session_id=self.session.session_id)

        except Exception as e:
            log_error("❌ Error in stop_stt_streaming", error=str(e), session_id=self.session.session_id)
            # Reset the fields even though something went wrong.
            self.stt_manager = None
            self.is_streaming = False
            self.chunk_counter = 0
            if self.audio_buffer:
                await self.audio_buffer.clear()
            if self.silence_detector:
                self.silence_detector.reset()
            self.last_stt_stop_time = datetime.now()

    async def restart_stt_if_needed(self):
        """Start STT streaming, creating a provider first if none exists.

        Returns:
            True when streaming was started, False otherwise.
        """
        try:
            if not self.stt_manager:
                await self.create_stt_manager()
                if not self.stt_manager:
                    log_error("❌ Failed to create STT manager", session_id=self.session.session_id)
                    return False

            stt_config = {
                'language': self.get_stt_language(),
                'interim_results': True,
                'single_utterance': False,  # keep listening continuously
                'enable_punctuation': True,
                'sample_rate': 16000,
                'encoding': 'LINEAR16'  # LINEAR16 instead of WEBM_OPUS
            }

            await self.stt_manager.start_streaming(stt_config)
            self.is_streaming = True

            log_info("✅ STT streaming started successfully with clean state", session_id=self.session.session_id)
            return True

        except Exception as e:
            log_error("❌ Failed to restart STT", error=str(e), traceback=traceback.format_exc(), session_id=self.session.session_id)
            self.is_streaming = False
            return False

    async def change_state(self, new_state: ConversationState):
        """Set the conversation state under the session lock and log it."""
        async with self.lock:
            old_state = self.state
            self.state = new_state
            log_debug(
                f"State change: {old_state.value} → {new_state.value}",
                session_id=self.session.session_id
            )

    async def handle_barge_in(self):
        """No-op: barge-in is disabled; a call is only logged as a warning."""
        log_warning("⚠️ Barge-in called but disabled", session_id=self.session.session_id)
        return

    async def reset_for_new_utterance(self):
        """Clear buffered audio, transcription and counters for a new utterance."""
        log_info("🔄 Resetting for new utterance", session_id=self.session.session_id)

        await self.audio_buffer.clear()
        self.silence_detector.reset()

        self.current_transcription = ""
        self.chunk_counter = 0

        if hasattr(self, 'speech_started'):
            delattr(self, 'speech_started')

        log_info("✅ Reset for new utterance complete", session_id=self.session.session_id)

    async def cleanup(self):
        """Mark the WebSocket inactive and stop STT; errors are only logged."""
        try:
            self.is_websocket_active = False
            await self.stop_stt_streaming()
            log_info("Cleaned up realtime session", session_id=self.session.session_id)
        except Exception as e:
            log_warning("Cleanup error", error=str(e), session_id=self.session.session_id)
389
-
390
- # ========================= MESSAGE HANDLERS =========================
391
async def handle_control_message(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
    """Dispatch a ``control`` message from the client by its ``action``.

    Handled actions: ``start_session``, ``end_session``/``stop_session``,
    ``interrupt`` (ignored, barge-in is disabled), ``reset``, ``audio_ended``
    and ``restart_stt``. Unknown actions are ignored.

    Args:
        websocket: Connection used to answer the client.
        session: Real-time session the message belongs to.
        message: Decoded JSON message; ``message["action"]`` selects the branch.
    """
    action = message.get("action")
    session_id = session.session.session_id

    log_debug("🎮 Control message", action=action, session_id=session_id)

    if action == "start_session":
        # Report the effective session configuration.
        await websocket.send_json({
            "type": "session_config",
            "session_id": session_id,
            "config": {
                "silence_threshold_ms": session.silence_threshold_ms,
                "audio_chunk_size": session.audio_chunk_size,
                "supports_barge_in": False  # barge-in is disabled
            }
        })

    elif action == "end_session" or action == "stop_session":
        await session.cleanup()
        await websocket.close()

    elif action == "interrupt":
        log_warning("⚠️ Interrupt request ignored (barge-in disabled)", session_id=session_id)

    elif action == "reset":
        # Capture the state *before* switching, so "from" reports the state
        # the session actually left and not the new IDLE state.
        previous_state = session.state
        await session.reset_for_new_utterance()
        await session.stop_stt_streaming()
        await session.change_state(ConversationState.IDLE)
        await websocket.send_json({
            "type": "state_change",
            "from": previous_state.value,
            "to": "idle"
        })

    elif action == "audio_ended":
        # The client finished playing the synthesized audio.
        log_info(f"🎵 Client reported audio ended, current state: {session.state.value}", session_id=session_id)

        if session.state != ConversationState.PLAYING_AUDIO:
            log_warning(f"⚠️ audio_ended received but state is not playing_audio: {session.state.value}", session_id=session_id)
            return

        await session.change_state(ConversationState.LISTENING)
        await websocket.send_json({
            "type": "state_change",
            "from": "playing_audio",
            "to": "listening"
        })

        log_info("🎤 Starting STT after audio playback ended", session_id=session_id)

        # Tear down a leftover STT instance before creating the new one.
        if session.stt_manager:
            await session.stop_stt_streaming()
            await asyncio.sleep(0.1)  # short pause

        success = await session.initialize_stt()

        # Tell the client whether it may start sending audio.
        if success and session.is_streaming:
            log_info("✅ Sending STT ready signal", session_id=session_id)
            await websocket.send_json({
                "type": "stt_ready",
                "message": "STT is ready to receive audio"
            })
        else:
            log_error("❌ STT initialization failed", session_id=session_id)
            await websocket.send_json({
                "type": "error",
                "error_type": "stt_init_failed",
                "message": "Failed to initialize STT after audio playback"
            })

    elif action == "restart_stt":
        log_info("🔄 Manual STT restart requested", session_id=session_id)
        await session.stop_stt_streaming()
        await session.restart_stt_if_needed()
476
-
477
async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
    """Forward one base64 audio chunk to STT and react to a final transcription.

    Chunks are dropped unless the session is in LISTENING with an active STT
    stream. When STT yields a final, non-empty result, the transcription is
    sent to the client, STT is stopped, the state moves to PROCESSING_STT and
    ``process_user_input`` is invoked.

    Args:
        websocket: Connection used to answer the client.
        session: Real-time session the chunk belongs to.
        message: Decoded JSON message; ``message["data"]`` is the base64 audio.
    """
    session_id = session.session.session_id
    try:
        # Nothing to do once the socket has been closed.
        if not session.is_websocket_active:
            return

        audio_data = message.get("data")
        if not audio_data:
            log_warning("⚠️ Empty audio chunk received", session_id=session_id)
            return

        # Audio is ignored completely while a turn is being processed/played.
        if session.state in [ConversationState.PLAYING_AUDIO, ConversationState.PROCESSING_TTS,
                             ConversationState.PROCESSING_LLM, ConversationState.PROCESSING_STT]:
            log_debug(f"🔇 Ignoring audio chunk during state: {session.state.value}", session_id=session_id)
            return

        if session.state != ConversationState.LISTENING:
            log_warning(f"⚠️ Audio received in unexpected state: {session.state.value}", session_id=session_id)
            return

        # Without a running STT stream only an error is reported; no restart
        # is attempted here despite the wording of the log message.
        if not session.stt_manager or not session.is_streaming:
            log_warning("⚠️ STT not ready, attempting to restart", session_id=session_id)
            await websocket.send_json({
                "type": "error",
                "error_type": "stt_not_ready",
                "message": "STT is not ready. Waiting for initialization..."
            })
            return

        await session.audio_buffer.add_chunk(audio_data)

        decoded_audio = base64.b64decode(audio_data)

        # Called for its side effect of tracking the silent stretch; the
        # returned duration is not used in this function.
        session.silence_detector.update(decoded_audio)

        try:
            session.chunk_counter += 1

            if session.chunk_counter == 1:
                log_info("🎤 Started streaming audio to STT", session_id=session_id)
                log_info(f"📤 First chunk - size: {len(decoded_audio)} bytes", session_id=session_id)
            elif session.chunk_counter % 100 == 0:
                log_info(f"📊 Sent {session.chunk_counter} chunks to STT so far...", session_id=session_id)

            async for result in session.stt_manager.stream_audio(decoded_audio):
                # Only final results are acted upon; interim ones are skipped.
                if not result.is_final:
                    continue

                log_info(f"✅ FINAL TRANSCRIPTION: '{result.text}'", session_id=session_id)

                await websocket.send_json({
                    "type": "transcription",
                    "text": result.text,
                    "is_final": True,
                    "confidence": result.confidence
                })

                session.current_transcription = result.text

                if session.current_transcription:
                    # Stop STT first, then hand the utterance over.
                    await session.stop_stt_streaming()
                    await session.change_state(ConversationState.PROCESSING_STT)

                    if session.is_websocket_active:
                        await websocket.send_json({
                            "type": "state_change",
                            "from": "listening",
                            "to": "processing_stt"
                        })

                    await process_user_input(websocket, session)
                    return

        except Exception as e:
            error_msg = str(e)
            # Provider stream timeouts are expected and deliberately ignored:
            # STT is restarted anyway once the current turn completes.
            if "Audio Timeout Error" in error_msg or "stream duration" in error_msg or "Exceeded maximum allowed stream duration" in error_msg:
                log_warning("⚠️ STT timeout detected, ignoring", session_id=session_id)
            else:
                log_error("❌ STT streaming error", error=error_msg, traceback=traceback.format_exc(), session_id=session_id)
                # Only report to the client while the socket is still usable.
                if session.is_websocket_active:
                    await websocket.send_json({
                        "type": "error",
                        "error_type": "stt_error",
                        "message": f"STT error: {str(e)}"
                    })

    except Exception as e:
        log_error("❌ Error in handle_audio_chunk", error=str(e), traceback=traceback.format_exc(), session_id=session_id)
        # Only report to the client while the socket is still usable.
        if session.is_websocket_active:
            await websocket.send_json({
                "type": "error",
                "error_type": "audio_error",
                "message": f"Audio processing error: {str(e)}"
            })
589
-
590
- # ========================= MAIN HANDLER =========================
591
async def _enter_listening(websocket: WebSocket, realtime_session: RealtimeSession, from_state: str):
    """Switch to LISTENING, notify the client and start STT."""
    await realtime_session.change_state(ConversationState.LISTENING)
    await websocket.send_json({
        "type": "state_change",
        "from": from_state,
        "to": "listening"
    })
    await realtime_session.initialize_stt()


async def _stream_tts_audio(websocket: WebSocket, audio_data: bytes, session_id: str):
    """Send synthesized audio to the client as base64 ``tts_audio`` chunks."""
    audio_base64 = base64.b64encode(audio_data).decode('utf-8')
    chunk_size = 16384
    total_length = len(audio_base64)
    total_chunks = (total_length + chunk_size - 1) // chunk_size

    log_info(f"📤 Sending welcome TTS in {total_chunks} chunks", session_id=session_id)

    for chunk_index, start in enumerate(range(0, total_length, chunk_size)):
        await websocket.send_json({
            "type": "tts_audio",
            "data": audio_base64[start:start + chunk_size],
            "chunk_index": chunk_index,
            "total_chunks": total_chunks,
            "is_last": chunk_index == total_chunks - 1,
            "mime_type": "audio/mpeg"
        })

    log_info("✅ Welcome TTS sent", session_id=session_id)


async def _send_welcome(websocket: WebSocket, session: Session, realtime_session: RealtimeSession, session_id: str):
    """Play the latest assistant message as a welcome, or go straight to listening."""
    log_info("📋 Checking for welcome message in session history...", session_id=session_id)

    chat_history = session.chat_history
    if not chat_history:
        log_warning("⚠️ No messages in session history", session_id=session_id)
        await _enter_listening(websocket, realtime_session, "idle")
        return

    log_info(f"📋 Found {len(chat_history)} messages in history", session_id=session_id)

    # The most recent assistant message is treated as the welcome message.
    welcome_text = None
    for i, msg in enumerate(reversed(chat_history)):
        log_debug(f"📋 Message {i}: role={msg.get('role', 'unknown')}, content_preview={msg.get('content', '')[:50]}...", session_id=session_id)
        if msg.get('role') == 'assistant':
            welcome_text = msg.get('content', '')
            break

    if welcome_text is None:
        log_warning("⚠️ No assistant message found in history", session_id=session_id)
        await _enter_listening(websocket, realtime_session, "idle")
        return

    log_info(f"📢 Found welcome message: {welcome_text[:50]}...", session_id=session_id)
    await realtime_session.change_state(ConversationState.PLAYING_AUDIO)

    # Text goes out first; a failure here is logged but not fatal.
    try:
        await websocket.send_json({
            "type": "assistant_response",
            "text": welcome_text,
            "is_welcome": True
        })
        log_info("✅ Welcome text sent via WebSocket", session_id=session_id)
    except Exception as e:
        log_error("❌ Failed to send welcome text", error=str(e), session_id=session_id)

    tts_provider = TTSFactory.create_provider()
    if not tts_provider:
        log_warning("⚠️ No TTS provider available", session_id=session_id)
        await _enter_listening(websocket, realtime_session, "idle")
        return

    audio_sent = False
    try:
        log_info("🎤 Generating welcome TTS...", session_id=session_id)

        await websocket.send_json({
            "type": "state_change",
            "from": "idle",
            "to": "playing_audio"
        })

        # Imported lazily, as in the original code.
        from tts_preprocessor import TTSPreprocessor
        preprocessor = TTSPreprocessor(language=session.locale)
        processed_text = preprocessor.preprocess(
            welcome_text,
            tts_provider.get_preprocessing_flags()
        )

        audio_data = await tts_provider.synthesize(processed_text)

        if audio_data:
            await _stream_tts_audio(websocket, audio_data, session_id)
            audio_sent = True
        else:
            log_warning("⚠️ Welcome TTS produced no audio", session_id=session_id)
    except Exception as e:
        log_error("❌ Failed to send welcome TTS", error=str(e), traceback=traceback.format_exc(), session_id=session_id)

    # With no audio sent the client will never report "audio_ended", so the
    # session must leave PLAYING_AUDIO on its own or it would ignore all
    # incoming audio from here on.
    if not audio_sent:
        await _enter_listening(websocket, realtime_session, "playing_audio")


async def websocket_endpoint(websocket: WebSocket, session_id: str):
    """Main WebSocket endpoint for a real-time conversation.

    Accepts the connection, looks up the session, plays the welcome message
    (or starts listening right away) and then dispatches incoming
    ``audio_chunk``, ``control`` and ``ping`` messages until the connection
    ends. The real-time session is always cleaned up on exit.

    Args:
        websocket: The incoming WebSocket connection.
        session_id: Identifier of an existing session in ``session_store``.
    """
    log_info("🔌 WebSocket connection attempt", session_id=session_id)

    await websocket.accept()
    log_info("✅ WebSocket accepted", session_id=session_id)

    session = session_store.get_session(session_id)
    if not session:
        log_error("❌ Session not found", session_id=session_id)
        await websocket.send_json({
            "type": "error",
            "message": "Session not found"
        })
        await websocket.close()
        return

    log_info("✅ Session found", session_id=session_id, project=session.project_name)

    # Mark as realtime session
    session.is_realtime = True
    session_store.update_session(session)

    realtime_session = RealtimeSession(session)

    try:
        # STT is started only after the welcome message has been handled.
        log_info("⏳ STT initialization will be done after welcome message", session_id=session_id)

        await websocket.send_json({
            "type": "session_started",
            "session_id": session_id,
            "stt_initialized": False
        })

        await _send_welcome(websocket, session, realtime_session, session_id)

        log_info("💬 Ready for conversation", session_id=session_id)

        while True:
            try:
                if not realtime_session.is_websocket_active:
                    log_info("🔌 WebSocket inactive, breaking loop", session_id=session_id)
                    break

                message = await asyncio.wait_for(
                    websocket.receive_json(),
                    timeout=60.0  # 60 second timeout
                )

                message_type = message.get("type")

                if message_type == "audio_chunk":
                    await handle_audio_chunk(websocket, realtime_session, message)

                elif message_type == "control":
                    await handle_control_message(websocket, realtime_session, message)

                elif message_type == "ping":
                    # Keep-alive ping, answered without logging.
                    if realtime_session.is_websocket_active:
                        await websocket.send_json({"type": "pong"})

            except asyncio.TimeoutError:
                # No message for 60 s: probe the client with a ping.
                if realtime_session.is_websocket_active:
                    await websocket.send_json({"type": "ping"})

    except WebSocketDisconnect as e:
        log_info("🔌 WebSocket disconnected", session_id=session_id, code=e.code, reason=e.reason)
    except Exception as e:
        # Errors that merely say the socket is already closed are not reported.
        if "WebSocket is not connected" not in str(e) and "Cannot call \"send\"" not in str(e):
            log_error(
                "❌ WebSocket error",
                error=str(e),
                traceback=traceback.format_exc(),
                session_id=session_id
            )

            # Best effort only: the socket may already be closed.
            if realtime_session.is_websocket_active:
                try:
                    await websocket.send_json({
                        "type": "error",
                        "message": str(e)
                    })
                except Exception:
                    pass
    finally:
        log_info("🧹 Cleaning up WebSocket connection", session_id=session_id)
        await realtime_session.cleanup()

        # Close the socket only if it is still connected.
        try:
            if websocket.client_state.value == 1:  # 1 = CONNECTED state
                await websocket.close()
        except Exception as e:
            log_debug(f"WebSocket already closed or error during close: {e}", session_id=session_id)
816
-
817
-
818
- # ========================= PROCESSING FUNCTIONS =========================
819
async def process_user_input(websocket: WebSocket, session: RealtimeSession):
    """Run one completed user utterance through the LLM and, optionally, TTS.

    Steps performed, in order:

    1. Snapshot the current transcription, stop STT streaming and return
       silently if the WebSocket is no longer active.
    2. If no text was captured, switch back to LISTENING, clear the audio
       buffer, reset the utterance state and restart STT.
    3. Otherwise send the final transcription to the client, switch to
       PROCESSING_LLM, append the user message to the chat history and obtain
       a reply from ``handle_parameter_followup`` (when the chat session state
       is ``"collect_params"``) or ``handle_new_message``.
    4. Append and send the reply. When ``TTSFactory.create_provider()``
       returns a provider and the socket is still active, stream synthesized
       audio through ``generate_and_stream_tts``. In both cases finish by
       switching to LISTENING and restarting STT.

    Any exception raised along the way is logged, reported to the client as
    an ``"error"`` message (if the socket is active) and followed by a reset
    to LISTENING with STT restarted.

    Args:
        websocket: Client connection used for every outgoing JSON message.
        session: Realtime session wrapper; ``session.session`` is the
            underlying chat session (history, state, session id).
    """
    try:
        # Snapshot the transcription right away - before stop_stt_streaming()!
        transcription_before_stop = session.current_transcription

        # Make sure STT has fully stopped before LLM processing begins
        await session.stop_stt_streaming()

        # Nothing to do if the client is gone
        if not session.is_websocket_active:
            return

        # Prefer whatever the session holds after the stop, but fall back to
        # the snapshot: previously the snapshot was overwritten here, so a
        # transcription reset during the stop made a valid utterance look empty.
        user_text = session.current_transcription or transcription_before_stop
        if not user_text:
            log_warning("⚠️ Empty transcription, returning to listening", session_id=session.session.session_id)
            # Empty transcription: go back to listening and restart STT
            await session.change_state(ConversationState.LISTENING)
            await session.audio_buffer.clear()
            await session.reset_for_new_utterance()
            await session.restart_stt_if_needed()
            return

        log_info("🎯 Processing user input", text=user_text, session_id=session.session.session_id)

        # Send final transcription (confidence is a hard-coded value)
        if session.is_websocket_active:
            await websocket.send_json({
                "type": "transcription",
                "text": user_text,
                "is_final": True,
                "confidence": 0.95
            })

        # State: LLM Processing
        await session.change_state(ConversationState.PROCESSING_LLM)
        if session.is_websocket_active:
            await websocket.send_json({
                "type": "state_change",
                "from": "processing_stt",
                "to": "processing_llm"
            })

        # Add to chat history
        session.session.add_message("user", user_text)

        # Get LLM response based on session state
        log_info("🤖 Getting LLM response", session_state=session.session.state, session_id=session.session.session_id)

        if session.session.state == "collect_params":
            response_text = await handle_parameter_followup(session.session, user_text)
        else:
            response_text = await handle_new_message(session.session, user_text)

        log_info(f"💬 LLM response: {response_text[:50]}...", session_id=session.session.session_id)

        # Add response to history
        session.session.add_message("assistant", response_text)

        # Send text response
        if session.is_websocket_active:
            await websocket.send_json({
                "type": "assistant_response",
                "text": response_text
            })

        # Generate TTS if enabled
        tts_provider = TTSFactory.create_provider()
        log_info(f"🔍 TTS provider check: {tts_provider is not None}", session_id=session.session.session_id)

        if tts_provider and session.is_websocket_active:
            await session.change_state(ConversationState.PROCESSING_TTS)
            if session.is_websocket_active:
                await websocket.send_json({
                    "type": "state_change",
                    "from": "processing_llm",
                    "to": "processing_tts"
                })

            log_info("🎵 Starting TTS generation for response", session_id=session.session.session_id)

            # Generate TTS (barge-in disabled)
            await generate_and_stream_tts(websocket, session, tts_provider, response_text)

            # Once TTS is finished, move to the LISTENING state.
            # NOTE(review): "from" is hard-coded to "playing_audio"; it may not
            # match the real previous state if TTS exited early - confirm.
            await session.change_state(ConversationState.LISTENING)
            if session.is_websocket_active:
                await websocket.send_json({
                    "type": "state_change",
                    "from": "playing_audio",
                    "to": "listening"
                })

            # Restart STT
            log_info("🔄 Restarting STT after TTS completion", session_id=session.session.session_id)
            await session.restart_stt_if_needed()

        else:
            log_info("⚠️ No TTS provider or WebSocket inactive", session_id=session.session.session_id)
            # No TTS, go back to listening and restart STT
            await session.change_state(ConversationState.LISTENING)
            if session.is_websocket_active:
                await websocket.send_json({
                    "type": "state_change",
                    "from": "processing_llm",
                    "to": "listening"
                })
            await session.restart_stt_if_needed()

    except Exception as e:
        log_error(
            "❌ Error processing user input",
            error=str(e),
            traceback=traceback.format_exc(),
            session_id=session.session.session_id
        )
        if session.is_websocket_active:
            await websocket.send_json({
                "type": "error",
                "message": f"Processing error: {str(e)}"
            })
        await session.reset_for_new_utterance()
        # On error, go back to listening and restart STT
        await session.change_state(ConversationState.LISTENING)
        await session.restart_stt_if_needed()
944
-
945
async def generate_and_stream_tts(
    websocket: WebSocket,
    session: RealtimeSession,
    tts_provider,
    text: str
):
    """Synthesize ``text`` and send the audio to the client in base64 chunks.

    STT streaming is stopped first, the text is run through
    ``TTSPreprocessor`` using the provider's preprocessing flags, and the
    whole utterance is synthesized in a single ``tts_provider.synthesize``
    call. The resulting bytes are base64-encoded and sent as ``"tts_audio"``
    messages of at most 16384 characters each, with a 10 ms pause after every
    chunk. Streaming stops as soon as the WebSocket becomes inactive.

    On success the session is left in PLAYING_AUDIO; the caller is expected
    to perform the transition back to listening. On any exception the error
    is logged and reported to the client (``"tts_error"`` for quota errors,
    ``"error"`` otherwise), the session is switched to LISTENING and STT is
    restarted.

    Args:
        websocket: Client connection used for outgoing JSON messages.
        session: Realtime session wrapper providing state and STT control.
        tts_provider: Provider object exposing ``get_preprocessing_flags()``
            and an awaitable ``synthesize(text)``.
        text: Assistant response to speak.
    """
    try:
        # Make sure STT has fully stopped before TTS starts
        await session.stop_stt_streaming()

        log_info(f"🎤 Starting TTS generation for text: '{text[:50]}...'", session_id=session.session.session_id)

        # Run the text through the TTS preprocessor
        from tts_preprocessor import TTSPreprocessor
        preprocessor = TTSPreprocessor(language=session.session.locale)
        processed_text = preprocessor.preprocess(
            text,
            tts_provider.get_preprocessing_flags()
        )

        log_debug(f"📝 Preprocessed text: '{processed_text[:50]}...'", session_id=session.session.session_id)

        # Generate audio
        audio_data = await tts_provider.synthesize(processed_text)
        log_info(f"✅ TTS generated: {len(audio_data)} bytes, type: {type(audio_data)}", session_id=session.session.session_id)

        # Skip streaming entirely if the client is gone
        if not session.is_websocket_active:
            log_warning("⚠️ WebSocket inactive, skipping TTS streaming", session_id=session.session.session_id)
            return

        # Change state to playing
        await session.change_state(ConversationState.PLAYING_AUDIO)
        if session.is_websocket_active:
            await websocket.send_json({
                "type": "state_change",
                "from": "processing_tts",
                "to": "playing_audio"
            })

        # Convert entire audio to base64 for transmission
        # (base64 is already imported at module level)
        log_debug("📦 Converting audio to base64...")
        audio_base64 = base64.b64encode(audio_data).decode('utf-8')
        log_info(f"📊 Base64 conversion complete: {len(audio_base64)} chars from {len(audio_data)} bytes", session_id=session.session.session_id)

        # Log first 100 chars of base64 to verify it's valid
        log_debug(f"🔍 Base64 preview: {audio_base64[:100]}...")

        # Stream audio in chunks
        chunk_size = 16384  # Larger chunk size for base64
        total_length = len(audio_base64)
        total_chunks = (total_length + chunk_size - 1) // chunk_size  # ceiling division

        log_info(f"📤 Streaming TTS audio: {len(audio_data)} bytes as {total_length} base64 chars in {total_chunks} chunks", session_id=session.session.session_id)

        # Count what is really delivered so the final log is accurate even
        # when streaming is cut short.
        chunks_sent = 0
        for i in range(0, total_length, chunk_size):
            # Stop as soon as the client is gone
            if not session.is_websocket_active:
                log_warning("⚠️ WebSocket inactive during streaming, stopping", session_id=session.session.session_id)
                break

            chunk = audio_base64[i:i + chunk_size]
            chunk_index = i // chunk_size
            is_last = chunk_index == total_chunks - 1

            log_debug(f"📨 Sending chunk {chunk_index}/{total_chunks}, size: {len(chunk)}, is_last: {is_last}")

            if session.is_websocket_active:
                await websocket.send_json({
                    "type": "tts_audio",
                    "data": chunk,
                    "chunk_index": chunk_index,
                    "total_chunks": total_chunks,
                    "is_last": is_last,
                    "mime_type": "audio/mpeg"
                })
                chunks_sent += 1

                # Small delay to prevent overwhelming the client
                await asyncio.sleep(0.01)

        if chunks_sent == total_chunks:
            log_info(
                "✅ TTS streaming completed successfully",
                session_id=session.session.session_id,
                text_length=len(text),
                audio_size=len(audio_data),
                chunks_sent=chunks_sent
            )
        else:
            # Previously an aborted stream was still reported as a success
            # with chunks_sent equal to total_chunks.
            log_warning(
                f"⚠️ TTS streaming stopped early: {chunks_sent}/{total_chunks} chunks sent",
                session_id=session.session.session_id
            )

        # End of TTS - the state change is handled in process_user_input

    except Exception as e:
        error_msg = str(e)
        log_error(
            "❌ TTS generation error",
            error=error_msg,
            traceback=traceback.format_exc(),
            session_id=session.session.session_id
        )

        # Special handling for quota errors
        if "quota_exceeded" in error_msg:
            if session.is_websocket_active:
                await websocket.send_json({
                    "type": "tts_error",
                    "message": "TTS servisinin kredi limiti aşıldı. Yanıt sadece metin olarak gösterilecek.",
                    "error_type": "quota_exceeded"
                })
        else:
            if session.is_websocket_active:
                await websocket.send_json({
                    "type": "error",
                    "message": f"TTS error: {error_msg}"
                })

        # On a TTS error, go back to listening
        await session.change_state(ConversationState.LISTENING)
        if session.is_websocket_active:
            await websocket.send_json({
                "type": "state_change",
                "from": "processing_tts",
                "to": "listening"
            })
        # Restart STT
        await session.restart_stt_if_needed()