Spaces:
Building
Building
Delete websocket_handler.py
Browse files- websocket_handler.py +0 -1070
websocket_handler.py
DELETED
@@ -1,1070 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
WebSocket Handler for Real-time STT/TTS with Barge-in Support
|
3 |
-
"""
|
4 |
-
from fastapi import WebSocket, WebSocketDisconnect
|
5 |
-
from typing import Dict, Any, Optional
|
6 |
-
import json
|
7 |
-
import asyncio
|
8 |
-
import base64
|
9 |
-
from datetime import datetime
|
10 |
-
from collections import deque
|
11 |
-
from enum import Enum
|
12 |
-
import numpy as np
|
13 |
-
import traceback
|
14 |
-
|
15 |
-
from session import Session, session_store
|
16 |
-
from config_provider import ConfigProvider
|
17 |
-
from chat_handler import handle_new_message, handle_parameter_followup
|
18 |
-
from stt_factory import STTFactory
|
19 |
-
from tts_factory import TTSFactory
|
20 |
-
from logger import log_info, log_error, log_debug, log_warning
|
21 |
-
|
22 |
-
# ========================= CONSTANTS =========================
# Fallback values; RealtimeSession.__init__ uses them when the STT provider
# settings do not contain the corresponding key.

# Fallback for the "speech_timeout_ms" setting (milliseconds).
DEFAULT_SILENCE_THRESHOLD_MS = 2000
# Fallback for the "audio_chunk_size" setting; the value is reported to the
# client in the "session_config" message.
DEFAULT_AUDIO_CHUNK_SIZE = 4096
# Fallback for the "energy_threshold" setting: a chunk whose RMS, normalized
# by 32768, is below this value is treated as silence.
# NOTE(review): the trailing "0.01" looks like a previously used value - confirm.
DEFAULT_ENERGY_THRESHOLD = 0.0005 # 0.01
# Fallback for the "audio_buffer_max_size" setting: the maximum number of
# chunks AudioBuffer keeps before the oldest ones are dropped.
DEFAULT_AUDIO_BUFFER_MAX_SIZE = 1000
|
28 |
-
|
29 |
-
# ========================= ENUMS =========================
|
30 |
-
class ConversationState(Enum):
    """Phases of a real-time voice conversation.

    The string values are what the handlers send to the client in
    ``state_change`` messages.
    """
    # Initial state of a RealtimeSession; also entered by the "reset" control action.
    IDLE = "idle"
    # The only state in which handle_audio_chunk processes incoming audio.
    LISTENING = "listening"
    # Entered by handle_audio_chunk once a non-empty final transcription arrived.
    PROCESSING_STT = "processing_stt"
    # Entered by process_user_input before the LLM response is requested.
    PROCESSING_LLM = "processing_llm"
    # Incoming audio chunks are ignored while in this state.
    PROCESSING_TTS = "processing_tts"
    # Entered before the welcome audio is sent; left when the client reports "audio_ended".
    PLAYING_AUDIO = "playing_audio"
|
37 |
-
|
38 |
-
# ========================= CLASSES =========================
|
39 |
-
class AudioBuffer:
    """Bounded FIFO of decoded audio chunks, guarded by an asyncio lock.

    The lock serializes coroutines on one event loop; it does not protect
    against access from other OS threads. Once ``max_size`` chunks are held,
    appending a new chunk silently discards the oldest one.
    """

    def __init__(self, max_size: int = DEFAULT_AUDIO_BUFFER_MAX_SIZE):
        self.buffer = deque(maxlen=max_size)
        self.lock = asyncio.Lock()

    async def add_chunk(self, chunk_data: str):
        """Decode a base64 string and append the resulting bytes to the buffer."""
        async with self.lock:
            self.buffer.append(base64.b64decode(chunk_data))

    async def get_all_audio(self) -> bytes:
        """Return every buffered chunk joined into one bytes object (oldest first)."""
        async with self.lock:
            combined = b''.join(self.buffer)
        return combined

    async def clear(self):
        """Drop all buffered chunks."""
        async with self.lock:
            self.buffer.clear()

    def size(self) -> int:
        """Return the number of chunks currently held (read without the lock)."""
        return len(self.buffer)
|
64 |
-
|
65 |
-
|
66 |
-
class SilenceDetector:
    """Track how long the incoming audio stream has been continuously silent.

    A chunk counts as silence when the RMS of its samples (interpreted as
    16-bit little-endian PCM and normalized by 32768) is below
    ``energy_threshold``.
    """

    def __init__(self, threshold_ms: int = DEFAULT_SILENCE_THRESHOLD_MS, energy_threshold: float = DEFAULT_ENERGY_THRESHOLD):
        # Stored for callers; this class never compares a duration against it.
        self.threshold_ms = threshold_ms
        self.energy_threshold = energy_threshold
        # Wall-clock time at which the current silent run began, or None while
        # the stream is not silent. Kept as a datetime for existing readers.
        self.silence_start = None
        # Monotonic timestamp of the same instant; used for duration arithmetic
        # so DST switches or system clock adjustments cannot distort the result.
        self._silence_start_monotonic = None
        self.sample_rate = 16000

    def update(self, audio_chunk: bytes) -> int:
        """Feed one chunk and return the current silent-run length in milliseconds.

        Returns 0 when the chunk is not silent (which also ends the run).
        """
        import time  # function-scope import: the module header does not import time

        if not self.is_silence(audio_chunk):
            self.silence_start = None
            self._silence_start_monotonic = None
            return 0

        now = time.monotonic()
        if self.silence_start is None or self._silence_start_monotonic is None:
            # First silent chunk of a new run.
            self.silence_start = datetime.now()
            self._silence_start_monotonic = now
        return int((now - self._silence_start_monotonic) * 1000)

    def is_silence(self, audio_chunk: bytes) -> bool:
        """Return True if the chunk is empty or its normalized RMS is below the threshold.

        Any decoding error is logged and reported as "not silent" (False), so
        a malformed chunk never extends a silent run.
        """
        try:
            if len(audio_chunk) == 0:
                return True

            # 16-bit samples need an even byte count; drop a dangling last byte.
            if len(audio_chunk) % 2 != 0:
                audio_chunk = audio_chunk[:-1]

            # Explicit little-endian int16 so the result does not depend on
            # the byte order of the host CPU.
            samples = np.frombuffer(audio_chunk, dtype="<i2")
            if len(samples) == 0:
                return True

            rms = np.sqrt(np.mean(samples.astype(float) ** 2))
            normalized_rms = rms / 32768.0
            return normalized_rms < self.energy_threshold

        except Exception as e:
            # Deliberate best-effort: treat undecodable audio as speech.
            log_warning(f"Silence detection error: {e}")
            return False

    def reset(self):
        """Forget the current silent run."""
        self.silence_start = None
        self._silence_start_monotonic = None
|
116 |
-
|
117 |
-
|
118 |
-
class BargeInHandler:
    """Own the currently running TTS task and cancel it on user interruption.

    Args:
        reset_delay: Seconds ``handle_interruption`` keeps ``is_interrupting``
            set after cancelling the task (default 0.5, the previous
            hard-coded value).
    """

    def __init__(self, reset_delay: float = 0.5):
        self.active_tts_task: Optional[asyncio.Task] = None
        self.is_interrupting = False
        self.lock = asyncio.Lock()
        self.reset_delay = reset_delay

    async def _cancel_active_task(self) -> None:
        """Cancel the active TTS task, if any, and wait until it has finished."""
        task = self.active_tts_task
        if task is None or task.done():
            return
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
        except Exception as exc:
            # The task ended with its own error while being cancelled; that
            # must not prevent the caller from continuing.
            log_warning(f"TTS task failed while being cancelled: {exc}")

    async def start_tts_task(self, coro):
        """Cancel any running TTS task, then schedule ``coro`` as the new one.

        Returns:
            The newly created ``asyncio.Task``.
        """
        async with self.lock:
            await self._cancel_active_task()
            self.active_tts_task = asyncio.create_task(coro)
            return self.active_tts_task

    async def handle_interruption(self, current_state: ConversationState):
        """Cancel the active TTS task and flag the interruption for ``reset_delay`` seconds.

        ``current_state`` is accepted for interface compatibility and is not used.
        The lock is held for the whole duration, so ``start_tts_task`` cannot
        start a new task until the flag has been cleared.
        """
        async with self.lock:
            self.is_interrupting = True
            try:
                if self.active_tts_task and not self.active_tts_task.done():
                    log_info("Barge-in: Cancelling active TTS")
                    await self._cancel_active_task()

                # Keep the flag raised briefly before clearing it.
                await asyncio.sleep(self.reset_delay)
            finally:
                # Always clear the flag, even if the wait above was cancelled.
                self.is_interrupting = False
|
157 |
-
|
158 |
-
|
159 |
-
class RealtimeSession:
    """Hold the per-connection state of a real-time voice conversation.

    Wraps a project ``Session`` and owns the audio buffer, silence detector,
    barge-in handler and the STT provider instance used for streaming
    recognition. Settings are read from the STT provider section of the
    global config, with the module ``DEFAULT_*`` constants as fallbacks.
    """

    def __init__(self, session: Session):
        self.session = session
        self.state = ConversationState.IDLE
        self.is_websocket_active = True

        # STT provider settings (dict-like: only .get() is used on it).
        config = ConfigProvider.get().global_config.stt_provider.settings

        silence_threshold = config.get("speech_timeout_ms", DEFAULT_SILENCE_THRESHOLD_MS)
        energy_threshold = config.get("energy_threshold", DEFAULT_ENERGY_THRESHOLD)
        buffer_max_size = config.get("audio_buffer_max_size", DEFAULT_AUDIO_BUFFER_MAX_SIZE)

        self.audio_buffer = AudioBuffer(max_size=buffer_max_size)
        self.silence_detector = SilenceDetector(
            threshold_ms=silence_threshold,
            energy_threshold=energy_threshold
        )
        self.barge_in_handler = BargeInHandler()
        self.stt_manager = None
        self.current_transcription = ""
        self.is_streaming = False
        self.lock = asyncio.Lock()

        # Kept so the control handler can report them to the client.
        self.audio_chunk_size = config.get("audio_chunk_size", DEFAULT_AUDIO_CHUNK_SIZE)
        self.silence_threshold_ms = silence_threshold

        # Number of chunks forwarded to the current STT stream.
        self.chunk_counter = 0

        # STT stream bookkeeping.
        self.stt_session_count = 0
        self.last_stt_stop_time = None

    async def create_stt_manager(self):
        """Create a fresh STT provider via STTFactory and store it in ``stt_manager``.

        Returns:
            The provider, or a falsy value if the factory produced none.
        """
        self.stt_manager = STTFactory.create_provider()
        return self.stt_manager

    def get_stt_language(self) -> str:
        """Return the locale tag for STT (e.g. ``tr`` -> ``tr-TR``) for this session.

        Falls back to locale ``tr`` when the session has no ``locale``
        attribute and to ``tr-TR`` when the locale data has no ``locale_tag``.
        """
        session_locale = getattr(self.session, 'locale', 'tr')

        # Imported here, as in the original code, rather than at module level.
        from locale_manager import LocaleManager
        locale_data = LocaleManager.get_locale(session_locale)
        language_code = locale_data.get('locale_tag', 'tr-TR')

        log_info(f"🌍 Session locale: {session_locale}, STT language: {language_code}", session_id=self.session.session_id)
        return language_code

    def _build_stt_config(self, encoding: str) -> Dict[str, Any]:
        """Assemble the dict passed to ``stt_manager.start_streaming``."""
        settings = ConfigProvider.get().global_config.stt_provider.settings
        return {
            "language": self.get_stt_language(),
            "interim_results": settings.get("interim_results", True),
            "single_utterance": False,  # False keeps the stream open for continuous listening
            "enable_punctuation": settings.get("enable_punctuation", True),
            "sample_rate": 16000,
            "encoding": encoding
        }

    async def _reset_stt_state(self) -> None:
        """Drop the STT provider and reset every STT-related field."""
        self.stt_manager = None
        self.is_streaming = False
        self.chunk_counter = 0

        await self.audio_buffer.clear()
        self.silence_detector.reset()

        # 'speech_started' is set outside this class; remove it if present.
        if hasattr(self, 'speech_started'):
            delattr(self, 'speech_started')

        self.last_stt_stop_time = datetime.now()

    async def initialize_stt(self) -> bool:
        """Tear down any existing STT stream and start a new one.

        Returns:
            True if streaming started, False on any failure (the failure is
            logged and the STT state is reset).
        """
        try:
            self.stt_session_count += 1
            log_info(f"🎤 Initializing STT session #{self.stt_session_count}", session_id=self.session.session_id)

            # Fully clean up the current STT stream first.
            await self.stop_stt_streaming()

            # Make sure at least 0.5 s has passed since the previous stop.
            if self.last_stt_stop_time:
                elapsed = (datetime.now() - self.last_stt_stop_time).total_seconds()
                if elapsed < 0.5:
                    wait_time = 0.5 - elapsed
                    log_info(f"⏳ Waiting {wait_time:.2f}s for proper cleanup", session_id=self.session.session_id)
                    await asyncio.sleep(wait_time)

            # Start from a clean slate.
            self.chunk_counter = 0
            self.current_transcription = ""
            await self.audio_buffer.clear()
            self.silence_detector.reset()

            await self.create_stt_manager()
            if not self.stt_manager:
                log_error("❌ STT manager is None - STTFactory.create_provider() returned None", session_id=self.session.session_id)
                return False

            log_info(f"✅ STT manager created: {type(self.stt_manager).__name__}", session_id=self.session.session_id)

            # NOTE(review): this path requests WEBM_OPUS while
            # restart_stt_if_needed requests LINEAR16 - confirm which encoding
            # the client actually sends.
            stt_config = self._build_stt_config("WEBM_OPUS")

            log_info(f"🎤 Starting STT streaming with config: {stt_config}", session_id=self.session.session_id)

            await self.stt_manager.start_streaming(stt_config)
            self.is_streaming = True

            log_info("✅ STT streaming started successfully with clean state", session_id=self.session.session_id)
            return True

        except Exception as e:
            log_error(f"❌ Failed to initialize STT", error=str(e), traceback=traceback.format_exc(), session_id=self.session.session_id)
            # Clean up on failure as well.
            await self.stop_stt_streaming()
            return False

    async def stop_stt_streaming(self):
        """Stop the STT stream (if any) and reset all STT-related state.

        Never raises for provider errors: a failing ``stop_streaming`` call is
        logged and the state is reset regardless.
        """
        try:
            log_info(f"🛑 Stopping STT session #{self.stt_session_count}", session_id=self.session.session_id)

            if self.stt_manager and self.is_streaming:
                try:
                    await self.stt_manager.stop_streaming()
                except Exception as e:
                    log_warning(f"⚠️ Error during STT stop_streaming: {e}", session_id=self.session.session_id)

            await self._reset_stt_state()

            log_info(f"✅ STT session #{self.stt_session_count} stopped and all data reset", session_id=self.session.session_id)

        except Exception as e:
            log_error(f"❌ Error in stop_stt_streaming", error=str(e), session_id=self.session.session_id)
            # Reset the state even though something went wrong.
            await self._reset_stt_state()

    async def restart_stt_if_needed(self) -> bool:
        """Create an STT provider if none exists and (re)start streaming.

        Returns:
            True if streaming started, False otherwise (the error is logged).
        """
        try:
            if not self.stt_manager:
                await self.create_stt_manager()
                if not self.stt_manager:
                    log_error(f"❌ Failed to create STT manager", session_id=self.session.session_id)
                    return False

            # interim_results / enable_punctuation now honor the provider
            # settings, matching initialize_stt (both default to True).
            stt_config = self._build_stt_config('LINEAR16')

            await self.stt_manager.start_streaming(stt_config)
            self.is_streaming = True

            log_info(f"✅ STT streaming started successfully with clean state", session_id=self.session.session_id)
            return True

        except Exception as e:
            log_error(f"❌ Failed to restart STT", error=str(e), traceback=traceback.format_exc(), session_id=self.session.session_id)
            self.is_streaming = False
            return False

    async def change_state(self, new_state: ConversationState):
        """Set the conversation state under the session lock and log the transition."""
        async with self.lock:
            old_state = self.state
            self.state = new_state
            log_debug(
                f"State change: {old_state.value} → {new_state.value}",
                session_id=self.session.session_id
            )

    async def handle_barge_in(self):
        """No-op: barge-in is disabled, so a call is only logged as a warning."""
        log_warning(f"⚠️ Barge-in called but disabled", session_id=self.session.session_id)
        return

    async def reset_for_new_utterance(self):
        """Clear buffered audio, silence tracking, transcription and chunk counter."""
        log_info(f"🔄 Resetting for new utterance", session_id=self.session.session_id)

        await self.audio_buffer.clear()
        self.silence_detector.reset()

        self.current_transcription = ""
        self.chunk_counter = 0

        # 'speech_started' is set outside this class; remove it if present.
        if hasattr(self, 'speech_started'):
            delattr(self, 'speech_started')

        log_info(f"✅ Reset for new utterance complete", session_id=self.session.session_id)

    async def cleanup(self):
        """Mark the WebSocket inactive and stop STT; errors are logged, not raised."""
        try:
            self.is_websocket_active = False
            await self.stop_stt_streaming()
            log_info(f"Cleaned up realtime session", session_id=self.session.session_id)
        except Exception as e:
            log_warning(f"Cleanup error", error=str(e), session_id=self.session.session_id)
|
389 |
-
|
390 |
-
# ========================= MESSAGE HANDLERS =========================
|
391 |
-
async def handle_control_message(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
    """Dispatch a client ``control`` message by its ``action`` field.

    Supported actions: ``start_session``, ``end_session`` / ``stop_session``,
    ``interrupt`` (ignored, barge-in is disabled), ``reset``, ``audio_ended``
    and ``restart_stt``. Any other action is ignored.

    Args:
        websocket: Connection used for replies.
        session: Real-time session the message belongs to.
        message: Decoded JSON message from the client.
    """
    action = message.get("action")
    session_id = session.session.session_id

    log_debug(f"🎮 Control message", action=action, session_id=session_id)

    if action == "start_session":
        # Tell the client which settings this session runs with.
        await websocket.send_json({
            "type": "session_config",
            "session_id": session_id,
            "config": {
                "silence_threshold_ms": session.silence_threshold_ms,
                "audio_chunk_size": session.audio_chunk_size,
                "supports_barge_in": False  # barge-in is disabled
            }
        })

    elif action in ("end_session", "stop_session"):
        await session.cleanup()
        await websocket.close()

    elif action == "interrupt":
        # Barge-in is disabled, so the request is only logged.
        log_warning(f"⚠️ Interrupt request ignored (barge-in disabled)", session_id=session_id)

    elif action == "reset":
        # Capture the state BEFORE switching to idle; reading it afterwards
        # would always report "idle" as the previous state.
        previous_state = session.state
        await session.reset_for_new_utterance()
        await session.stop_stt_streaming()
        await session.change_state(ConversationState.IDLE)
        await websocket.send_json({
            "type": "state_change",
            "from": previous_state.value,
            "to": "idle"
        })

    elif action == "audio_ended":
        # The client finished playing the TTS audio.
        log_info(f"🎵 Client reported audio ended, current state: {session.state.value}", session_id=session_id)

        if session.state != ConversationState.PLAYING_AUDIO:
            log_warning(f"⚠️ audio_ended received but state is not playing_audio: {session.state.value}", session_id=session_id)
            return

        await session.change_state(ConversationState.LISTENING)
        await websocket.send_json({
            "type": "state_change",
            "from": "playing_audio",
            "to": "listening"
        })

        log_info(f"🎤 Starting STT after audio playback ended", session_id=session_id)

        # Tear down a leftover STT instance before creating a new one.
        if session.stt_manager:
            await session.stop_stt_streaming()
            await asyncio.sleep(0.1)  # short pause after the stop

        success = await session.initialize_stt()

        # Let the client know whether it may start sending audio.
        if success and session.is_streaming:
            log_info(f"✅ Sending STT ready signal", session_id=session_id)
            await websocket.send_json({
                "type": "stt_ready",
                "message": "STT is ready to receive audio"
            })
        else:
            log_error(f"❌ STT initialization failed", session_id=session_id)
            await websocket.send_json({
                "type": "error",
                "error_type": "stt_init_failed",
                "message": "Failed to initialize STT after audio playback"
            })

    elif action == "restart_stt":
        # Manual restart requested by the client.
        log_info(f"🔄 Manual STT restart requested", session_id=session_id)
        await session.stop_stt_streaming()
        await session.restart_stt_if_needed()
|
476 |
-
|
477 |
-
async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
    """Forward one base64 audio chunk to STT and act on a final transcription.

    Chunks are processed only while the session is LISTENING and STT is
    streaming. When the provider yields a non-empty final result, the
    transcription is sent to the client, STT is stopped, the state moves to
    PROCESSING_STT and ``process_user_input`` is awaited.

    Errors are logged and reported to the client as ``error`` messages while
    the session still considers the WebSocket active.

    Args:
        websocket: Connection used for replies.
        session: Real-time session the chunk belongs to.
        message: Decoded JSON message; the base64 payload is under ``"data"``.
    """
    session_id = session.session.session_id
    try:
        # Nothing to do once the WebSocket has been closed.
        if not session.is_websocket_active:
            return

        audio_data = message.get("data")
        if not audio_data:
            log_warning(f"⚠️ Empty audio chunk received", session_id=session_id)
            return

        # Ignore audio entirely while STT/LLM/TTS work or playback is in progress.
        if session.state in [ConversationState.PLAYING_AUDIO, ConversationState.PROCESSING_TTS,
                             ConversationState.PROCESSING_LLM, ConversationState.PROCESSING_STT]:
            log_debug(f"🔇 Ignoring audio chunk during state: {session.state.value}", session_id=session_id)
            return

        # Audio is only processed in the LISTENING state.
        if session.state != ConversationState.LISTENING:
            log_warning(f"⚠️ Audio received in unexpected state: {session.state.value}", session_id=session_id)
            return

        # Report an error if STT has not been started yet.
        if not session.stt_manager or not session.is_streaming:
            log_warning(f"⚠️ STT not ready, attempting to restart", session_id=session_id)
            await _send_audio_error(
                websocket, session, "stt_not_ready",
                "STT is not ready. Waiting for initialization..."
            )
            return

        await session.audio_buffer.add_chunk(audio_data)

        decoded_audio = base64.b64decode(audio_data)

        # Called for its side effect of updating the silence tracker; the
        # returned duration is not needed here.
        session.silence_detector.update(decoded_audio)

        try:
            session.chunk_counter += 1

            if session.chunk_counter == 1:
                log_info(f"🎤 Started streaming audio to STT", session_id=session_id)
                log_info(f"📤 First chunk - size: {len(decoded_audio)} bytes", session_id=session_id)
            elif session.chunk_counter % 100 == 0:
                log_info(f"📊 Sent {session.chunk_counter} chunks to STT so far...", session_id=session_id)

            async for result in session.stt_manager.stream_audio(decoded_audio):
                # Interim results are deliberately skipped.
                if not result.is_final:
                    continue

                log_info(f"✅ FINAL TRANSCRIPTION: '{result.text}'", session_id=session_id)

                await websocket.send_json({
                    "type": "transcription",
                    "text": result.text,
                    "is_final": True,
                    "confidence": result.confidence
                })

                session.current_transcription = result.text

                if session.current_transcription:
                    # Stop STT first, then hand the utterance over for processing.
                    await session.stop_stt_streaming()

                    await session.change_state(ConversationState.PROCESSING_STT)

                    if session.is_websocket_active:
                        await websocket.send_json({
                            "type": "state_change",
                            "from": "listening",
                            "to": "processing_stt"
                        })

                    await process_user_input(websocket, session)
                    return

        except Exception as e:
            error_msg = str(e)
            # Provider-side stream timeouts are expected and only logged: STT
            # is restarted anyway once the user finishes speaking.
            if "Audio Timeout Error" in error_msg or "stream duration" in error_msg or "Exceeded maximum allowed stream duration" in error_msg:
                log_warning(f"⚠️ STT timeout detected, ignoring", session_id=session_id)
            else:
                log_error(f"❌ STT streaming error", error=error_msg, traceback=traceback.format_exc(), session_id=session_id)
                await _send_audio_error(websocket, session, "stt_error", f"STT error: {str(e)}")

    except Exception as e:
        log_error(f"❌ Error in handle_audio_chunk", error=str(e), traceback=traceback.format_exc(), session_id=session_id)
        await _send_audio_error(websocket, session, "audio_error", f"Audio processing error: {str(e)}")


async def _send_audio_error(websocket: WebSocket, session: RealtimeSession, error_type: str, text: str) -> None:
    """Send an ``error`` message unless the session has marked the WebSocket inactive."""
    if not session.is_websocket_active:
        return
    await websocket.send_json({
        "type": "error",
        "error_type": error_type,
        "message": text
    })
|
589 |
-
|
590 |
-
# ========================= MAIN HANDLER =========================
|
591 |
-
async def websocket_endpoint(websocket: WebSocket, session_id: str):
    """Serve one real-time conversation over a WebSocket.

    Accepts the connection, looks up the session, delivers the welcome
    message (text plus TTS audio when a TTS provider exists) and then runs the
    receive loop until the client disconnects or the session is cleaned up.

    Args:
        websocket: The incoming WebSocket connection.
        session_id: Identifier used to look the session up in ``session_store``.
    """
    log_info(f"🔌 WebSocket connection attempt", session_id=session_id)

    await websocket.accept()
    log_info(f"✅ WebSocket accepted", session_id=session_id)

    session = session_store.get_session(session_id)
    if not session:
        log_error(f"❌ Session not found", session_id=session_id)
        await websocket.send_json({
            "type": "error",
            "message": "Session not found"
        })
        await websocket.close()
        return

    log_info(f"✅ Session found", session_id=session_id, project=session.project_name)

    # Mark as realtime session
    session.is_realtime = True
    session_store.update_session(session)

    realtime_session = RealtimeSession(session)

    # STT is not started here; it starts after the welcome message.
    log_info(f"⏳ STT initialization will be done after welcome message", session_id=session_id)

    await websocket.send_json({
        "type": "session_started",
        "session_id": session_id,
        "stt_initialized": False
    })

    log_info(f"📋 Checking for welcome message in session history...", session_id=session_id)
    await _deliver_welcome(websocket, session, realtime_session, session_id)

    log_info(f"💬 Ready for conversation", session_id=session_id)

    try:
        while True:
            try:
                if not realtime_session.is_websocket_active:
                    log_info(f"🔌 WebSocket inactive, breaking loop", session_id=session_id)
                    break

                # Wake up at least once a minute so an idle connection gets pinged.
                message = await asyncio.wait_for(
                    websocket.receive_json(),
                    timeout=60.0
                )

                message_type = message.get("type")

                if message_type == "audio_chunk":
                    await handle_audio_chunk(websocket, realtime_session, message)

                elif message_type == "control":
                    await handle_control_message(websocket, realtime_session, message)

                elif message_type == "ping":
                    # Keep-alive ping from the client; answered without logging.
                    if realtime_session.is_websocket_active:
                        await websocket.send_json({"type": "pong"})

            except asyncio.TimeoutError:
                if realtime_session.is_websocket_active:
                    await websocket.send_json({"type": "ping"})

    except WebSocketDisconnect as e:
        log_info(f"🔌 WebSocket disconnected", session_id=session_id, code=e.code, reason=e.reason)
    except Exception as e:
        # Errors that only say the socket is already closed are not reported.
        if "WebSocket is not connected" not in str(e) and "Cannot call \"send\"" not in str(e):
            log_error(
                f"❌ WebSocket error",
                error=str(e),
                traceback=traceback.format_exc(),
                session_id=session_id
            )

            # Best effort: the socket may already be unusable.
            if realtime_session.is_websocket_active:
                try:
                    await websocket.send_json({
                        "type": "error",
                        "message": str(e)
                    })
                except Exception as send_error:
                    log_debug(f"Could not deliver error message to client: {send_error}", session_id=session_id)
    finally:
        log_info(f"🧹 Cleaning up WebSocket connection", session_id=session_id)
        await realtime_session.cleanup()

        # Close the socket only if it is still open.
        try:
            if websocket.client_state.value == 1:  # 1 = CONNECTED state
                await websocket.close()
        except Exception as e:
            log_debug(f"WebSocket already closed or error during close: {e}", session_id=session_id)


async def _enter_listening(websocket: WebSocket, realtime_session: RealtimeSession, from_state: str) -> None:
    """Switch to LISTENING, notify the client and start STT."""
    await realtime_session.change_state(ConversationState.LISTENING)
    await websocket.send_json({
        "type": "state_change",
        "from": from_state,
        "to": "listening"
    })
    await realtime_session.initialize_stt()


async def _deliver_welcome(websocket: WebSocket, session, realtime_session: RealtimeSession, session_id: str) -> None:
    """Play the most recent assistant message as welcome, or go straight to listening."""
    chat_history = session.chat_history

    if not chat_history:
        log_warning(f"⚠️ No messages in session history", session_id=session_id)
        await _enter_listening(websocket, realtime_session, "idle")
        return

    log_info(f"📋 Found {len(chat_history)} messages in history", session_id=session_id)

    # Newest first: the last assistant message is treated as the welcome message.
    for i, msg in enumerate(reversed(chat_history)):
        # 'or' guards against an explicit None content, which cannot be sliced.
        content = msg.get('content') or ''
        log_debug(f"📋 Message {i}: role={msg.get('role', 'unknown')}, content_preview={content[:50]}...", session_id=session_id)

        if msg.get('role') == 'assistant':
            await _play_welcome(websocket, session, realtime_session, content, session_id)
            return

    log_warning(f"⚠️ No assistant message found in history", session_id=session_id)
    await _enter_listening(websocket, realtime_session, "idle")


async def _play_welcome(websocket: WebSocket, session, realtime_session: RealtimeSession, welcome_text: str, session_id: str) -> None:
    """Send the welcome text and, when possible, its TTS audio.

    Whenever no audio is delivered (no TTS provider, synthesis failure or an
    empty synthesis result) the session switches to listening right away,
    because the client will have no playback to report as ended.
    """
    log_info(f"📢 Found welcome message: {welcome_text[:50]}...", session_id=session_id)

    await realtime_session.change_state(ConversationState.PLAYING_AUDIO)

    # Text goes out first so the client can show it while audio is generated.
    try:
        await websocket.send_json({
            "type": "assistant_response",
            "text": welcome_text,
            "is_welcome": True
        })
        log_info(f"✅ Welcome text sent via WebSocket", session_id=session_id)
    except Exception as e:
        log_error(f"❌ Failed to send welcome text", error=str(e), session_id=session_id)

    tts_provider = TTSFactory.create_provider()
    if not tts_provider:
        log_warning(f"⚠️ No TTS provider available", session_id=session_id)
        # The client was never told about playing_audio, hence "idle".
        await _enter_listening(websocket, realtime_session, "idle")
        return

    try:
        log_info(f"🎤 Generating welcome TTS...", session_id=session_id)

        await websocket.send_json({
            "type": "state_change",
            "from": "idle",
            "to": "playing_audio"
        })

        # Imported here, as in the original code, rather than at module level.
        from tts_preprocessor import TTSPreprocessor
        preprocessor = TTSPreprocessor(language=session.locale)
        processed_text = preprocessor.preprocess(
            welcome_text,
            tts_provider.get_preprocessing_flags()
        )

        audio_data = await tts_provider.synthesize(processed_text)

        if audio_data:
            await _send_tts_audio(websocket, audio_data, session_id)
            log_info(f"✅ Welcome TTS sent", session_id=session_id)
            # Listening starts when the client reports "audio_ended".
            return

        log_warning(f"⚠️ Welcome TTS produced no audio, switching to listening", session_id=session_id)
    except Exception as e:
        log_error(f"❌ Failed to send welcome TTS", error=str(e), traceback=traceback.format_exc(), session_id=session_id)

    await _enter_listening(websocket, realtime_session, "playing_audio")


async def _send_tts_audio(websocket: WebSocket, audio_data: bytes, session_id: str) -> None:
    """Base64-encode the audio and send it as ``tts_audio`` messages of up to 16384 characters."""
    audio_base64 = base64.b64encode(audio_data).decode('utf-8')
    chunk_size = 16384
    total_length = len(audio_base64)
    total_chunks = (total_length + chunk_size - 1) // chunk_size

    log_info(f"📤 Sending welcome TTS in {total_chunks} chunks", session_id=session_id)

    for chunk_index, offset in enumerate(range(0, total_length, chunk_size)):
        await websocket.send_json({
            "type": "tts_audio",
            "data": audio_base64[offset:offset + chunk_size],
            "chunk_index": chunk_index,
            "total_chunks": total_chunks,
            "is_last": chunk_index == total_chunks - 1,
            "mime_type": "audio/mpeg"
        })
|
816 |
-
|
817 |
-
|
818 |
-
# ========================= PROCESSING FUNCTIONS =========================
|
819 |
-
async def _send_state_change(
    websocket: WebSocket,
    session: RealtimeSession,
    from_state: str,
    to_state: str
):
    """Notify the client of a conversation state transition if the socket is still active."""
    if session.is_websocket_active:
        await websocket.send_json({
            "type": "state_change",
            "from": from_state,
            "to": to_state
        })


async def process_user_input(websocket: WebSocket, session: RealtimeSession):
    """Process a complete user utterance: transcription -> LLM -> optional TTS.

    Steps performed, in order:
      1. Snapshot the current transcription and stop STT streaming.
      2. If no text is available, go back to LISTENING and restart STT.
      3. Send the final transcription to the client, store it in the chat
         history and ask the LLM for a response (parameter follow-up when the
         session state is "collect_params", otherwise a new message).
      4. Send the text response; when a TTS provider exists and the socket is
         active, stream the synthesized audio.
      5. Return to LISTENING and restart STT.

    Args:
        websocket: Client connection used for all outgoing JSON messages.
        session: Realtime session wrapper holding transcription, state and
            the underlying chat session (``session.session``).

    Any exception is logged and reported to the client (best effort); the
    session is then reset and put back into LISTENING so the conversation
    can continue.
    """
    try:
        # Snapshot the transcription right away - before stop_stt_streaming!
        transcription_snapshot = session.current_transcription

        # Make sure STT has fully stopped before LLM processing starts
        await session.stop_stt_streaming()

        # Nothing to do if the client is already gone
        if not session.is_websocket_active:
            return

        # Prefer the value available after stopping; fall back to the snapshot
        # so the utterance is not lost if stopping the stream emptied it.
        user_text = session.current_transcription or transcription_snapshot
        if not user_text:
            log_warning("⚠️ Empty transcription, returning to listening", session_id=session.session.session_id)
            # Empty transcription: return to listening and restart STT
            await session.change_state(ConversationState.LISTENING)
            await session.audio_buffer.clear()
            await session.reset_for_new_utterance()
            await session.restart_stt_if_needed()
            return

        log_info("🎯 Processing user input", text=user_text, session_id=session.session.session_id)

        # Send final transcription
        if session.is_websocket_active:
            await websocket.send_json({
                "type": "transcription",
                "text": user_text,
                "is_final": True,
                "confidence": 0.95
            })

        # State: LLM Processing
        await session.change_state(ConversationState.PROCESSING_LLM)
        await _send_state_change(websocket, session, "processing_stt", "processing_llm")

        # Add to chat history
        session.session.add_message("user", user_text)

        # Get LLM response based on session state
        log_info("🤖 Getting LLM response", session_state=session.session.state, session_id=session.session.session_id)

        if session.session.state == "collect_params":
            response_text = await handle_parameter_followup(session.session, user_text)
        else:
            response_text = await handle_new_message(session.session, user_text)

        log_info(f"💬 LLM response: {response_text[:50]}...", session_id=session.session.session_id)

        # Add response to history
        session.session.add_message("assistant", response_text)

        # Send text response
        if session.is_websocket_active:
            await websocket.send_json({
                "type": "assistant_response",
                "text": response_text
            })

        # Generate TTS if enabled
        tts_provider = TTSFactory.create_provider()
        log_info(f"🔍 TTS provider check: {tts_provider is not None}", session_id=session.session.session_id)

        if tts_provider and session.is_websocket_active:
            await session.change_state(ConversationState.PROCESSING_TTS)
            await _send_state_change(websocket, session, "processing_llm", "processing_tts")

            log_info("🎵 Starting TTS generation for response", session_id=session.session.session_id)

            # Generate TTS (barge-in disabled)
            await generate_and_stream_tts(websocket, session, tts_provider, response_text)

            # After TTS finishes, switch to the LISTENING state
            await session.change_state(ConversationState.LISTENING)
            await _send_state_change(websocket, session, "playing_audio", "listening")

            # Restart STT
            log_info("🔄 Restarting STT after TTS completion", session_id=session.session.session_id)
            await session.restart_stt_if_needed()

        else:
            log_info("⚠️ No TTS provider or WebSocket inactive", session_id=session.session.session_id)
            # No TTS, go back to listening and restart STT
            await session.change_state(ConversationState.LISTENING)
            await _send_state_change(websocket, session, "processing_llm", "listening")
            await session.restart_stt_if_needed()

    except Exception as e:
        log_error(
            "❌ Error processing user input",
            error=str(e),
            traceback=traceback.format_exc(),
            session_id=session.session.session_id
        )
        # Reporting the error is best effort: a failed send (e.g. the socket
        # closed meanwhile) must not prevent the recovery steps below.
        try:
            if session.is_websocket_active:
                await websocket.send_json({
                    "type": "error",
                    "message": f"Processing error: {str(e)}"
                })
        except Exception as send_error:
            log_debug(
                f"Could not send processing error to client: {send_error}",
                session_id=session.session.session_id
            )
        await session.reset_for_new_utterance()
        # On error, return to listening and restart STT
        await session.change_state(ConversationState.LISTENING)
        await session.restart_stt_if_needed()
|
944 |
-
|
945 |
-
async def generate_and_stream_tts(
    websocket: WebSocket,
    session: RealtimeSession,
    tts_provider,
    text: str
):
    """Synthesize ``text`` with ``tts_provider`` and stream it to the client.

    The text is preprocessed with ``TTSPreprocessor`` using the provider's
    preprocessing flags, synthesized in one call, base64-encoded, and sent as
    a sequence of ``tts_audio`` JSON messages. Streaming stops early when the
    WebSocket becomes inactive.

    Args:
        websocket: Client connection used for all outgoing JSON messages.
        session: Realtime session wrapper (state, STT control, chat session).
        tts_provider: Provider exposing ``get_preprocessing_flags()`` and an
            awaitable ``synthesize(text)``.
        text: Assistant response to speak.

    On success the final transition back to LISTENING is left to the caller.
    On failure the error is logged and reported to the client (best effort),
    the session is switched back to LISTENING and STT is restarted.
    """
    session_id = session.session.session_id
    try:
        # Make sure STT has fully stopped before TTS starts
        await session.stop_stt_streaming()

        log_info(f"🎤 Starting TTS generation for text: '{text[:50]}...'", session_id=session_id)

        # Use the TTS preprocessor
        from tts_preprocessor import TTSPreprocessor
        preprocessor = TTSPreprocessor(language=session.session.locale)
        processed_text = preprocessor.preprocess(
            text,
            tts_provider.get_preprocessing_flags()
        )

        log_debug(f"📝 Preprocessed text: '{processed_text[:50]}...'", session_id=session_id)

        # Generate audio
        audio_data = await tts_provider.synthesize(processed_text)
        log_info(f"✅ TTS generated: {len(audio_data)} bytes, type: {type(audio_data)}", session_id=session_id)

        # Skip streaming if the client disconnected during synthesis
        if not session.is_websocket_active:
            log_warning("⚠️ WebSocket inactive, skipping TTS streaming", session_id=session_id)
            return

        # Change state to playing
        await session.change_state(ConversationState.PLAYING_AUDIO)
        if session.is_websocket_active:
            await websocket.send_json({
                "type": "state_change",
                "from": "processing_tts",
                "to": "playing_audio"
            })

        # Convert entire audio to base64 for transmission
        log_debug("📦 Converting audio to base64...")
        audio_base64 = base64.b64encode(audio_data).decode('utf-8')
        log_info(f"📊 Base64 conversion complete: {len(audio_base64)} chars from {len(audio_data)} bytes", session_id=session_id)

        # Log first 100 chars of base64 to verify it's valid
        log_debug(f"🔍 Base64 preview: {audio_base64[:100]}...")

        # Stream audio in chunks. The chunk size is a multiple of 4, so every
        # chunk is itself a complete run of base64 quanta.
        chunk_size = 16384  # Larger chunk size for base64
        total_length = len(audio_base64)
        total_chunks = (total_length + chunk_size - 1) // chunk_size

        log_info(f"📤 Streaming TTS audio: {len(audio_data)} bytes as {total_length} base64 chars in {total_chunks} chunks", session_id=session_id)

        chunks_sent = 0
        for chunk_index, start in enumerate(range(0, total_length, chunk_size)):
            # Stop as soon as the client goes away
            if not session.is_websocket_active:
                log_warning("⚠️ WebSocket inactive during streaming, stopping", session_id=session_id)
                break

            chunk = audio_base64[start:start + chunk_size]
            is_last = chunk_index == total_chunks - 1

            log_debug(f"📨 Sending chunk {chunk_index}/{total_chunks}, size: {len(chunk)}, is_last: {is_last}")

            await websocket.send_json({
                "type": "tts_audio",
                "data": chunk,
                "chunk_index": chunk_index,
                "total_chunks": total_chunks,
                "is_last": is_last,
                "mime_type": "audio/mpeg"
            })
            chunks_sent += 1

            # Small delay to prevent overwhelming the client
            await asyncio.sleep(0.01)

        # Only report success when every chunk really went out; an
        # interrupted stream has already been logged as a warning above.
        if chunks_sent == total_chunks:
            log_info(
                "✅ TTS streaming completed successfully",
                session_id=session_id,
                text_length=len(text),
                audio_size=len(audio_data),
                chunks_sent=chunks_sent
            )

        # End of TTS - the state change is handled in process_user_input

    except Exception as e:
        error_msg = str(e)
        log_error(
            "❌ TTS generation error",
            error=error_msg,
            traceback=traceback.format_exc(),
            session_id=session_id
        )

        # Special handling for quota errors
        if "quota_exceeded" in error_msg:
            error_payload = {
                "type": "tts_error",
                "message": "TTS servisinin kredi limiti aşıldı. Yanıt sadece metin olarak gösterilecek.",
                "error_type": "quota_exceeded"
            }
        else:
            error_payload = {
                "type": "error",
                "message": f"TTS error: {error_msg}"
            }

        # Client notifications are best effort: a failed send must not skip
        # the state recovery and STT restart below.
        try:
            if session.is_websocket_active:
                await websocket.send_json(error_payload)
        except Exception as send_error:
            log_debug(f"Could not send TTS error to client: {send_error}", session_id=session_id)

        # On TTS failure, return to listening
        await session.change_state(ConversationState.LISTENING)
        try:
            if session.is_websocket_active:
                await websocket.send_json({
                    "type": "state_change",
                    "from": "processing_tts",
                    "to": "listening"
                })
        except Exception as send_error:
            log_debug(f"Could not send state change to client: {send_error}", session_id=session_id)

        # Restart STT
        await session.restart_stt_if_needed()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|