Spaces:
Building
Building
""" | |
WebSocket Handler for Real-time STT/TTS with Barge-in Support | |
""" | |
from fastapi import WebSocket, WebSocketDisconnect | |
from typing import Dict, Any, Optional | |
import json | |
import asyncio | |
import base64 | |
from datetime import datetime | |
from collections import deque | |
from enum import Enum | |
import numpy as np | |
import traceback | |
from session import Session, session_store | |
from config_provider import ConfigProvider | |
from chat_handler import handle_new_message, handle_parameter_followup | |
from stt_factory import STTFactory | |
from tts_factory import TTSFactory | |
from logger import log_info, log_error, log_debug, log_warning | |
# ========================= CONSTANTS =========================
# Fallbacks used when the STT provider settings do not supply a value.
DEFAULT_SILENCE_THRESHOLD_MS = 2_000  # fallback for the "speech_timeout_ms" setting
DEFAULT_AUDIO_CHUNK_SIZE = 4_096  # fallback for the "audio_chunk_size" setting
DEFAULT_ENERGY_THRESHOLD = 5e-4  # normalized RMS below this is silence (previously 0.01)
DEFAULT_AUDIO_BUFFER_MAX_SIZE = 1_000  # maximum number of chunks kept by AudioBuffer
# ========================= ENUMS ========================= | |
class ConversationState(Enum):
    """Phases a real-time conversation moves through.

    The string values are the state names used in log output and in the
    ``state_change`` messages exchanged with the client.
    """

    # Nothing in progress yet.
    IDLE = "idle"
    # Audio chunks are accepted and forwarded to STT.
    LISTENING = "listening"
    # A final transcription is being handled.
    PROCESSING_STT = "processing_stt"
    # Waiting for the LLM response.
    PROCESSING_LLM = "processing_llm"
    # Synthesizing the spoken response.
    PROCESSING_TTS = "processing_tts"
    # Synthesized audio is being streamed to the client.
    PLAYING_AUDIO = "playing_audio"
# ========================= CLASSES ========================= | |
class AudioBuffer:
    """Bounded FIFO of decoded audio chunks.

    Backed by ``deque(maxlen=max_size)``: once full, appending a chunk drops
    the oldest one. Mutations are serialized with an ``asyncio.Lock``, which
    protects against interleaving coroutines (it is not a thread lock).
    """

    def __init__(self, max_size: int = DEFAULT_AUDIO_BUFFER_MAX_SIZE):
        self.lock = asyncio.Lock()
        self.buffer = deque(maxlen=max_size)

    async def add_chunk(self, chunk_data: str):
        """Decode one base64-encoded chunk and append its raw bytes."""
        async with self.lock:
            raw_bytes = base64.b64decode(chunk_data)
            self.buffer.append(raw_bytes)

    async def get_all_audio(self) -> bytes:
        """Return every buffered chunk joined into a single bytes object."""
        async with self.lock:
            joined = b"".join(self.buffer)
        return joined

    async def clear(self):
        """Drop all buffered chunks."""
        async with self.lock:
            self.buffer.clear()

    def size(self) -> int:
        """Return the number of buffered chunks (read without taking the lock)."""
        chunk_count = len(self.buffer)
        return chunk_count
class SilenceDetector:
    """Track how long the incoming audio stream has been continuously silent.

    A chunk is considered silent when its normalized RMS energy is below
    ``energy_threshold``. ``threshold_ms`` and ``sample_rate`` are stored for
    callers but are not used by the detector itself.

    NOTE(review): the energy computation assumes raw 16-bit PCM, while
    ``RealtimeSession.initialize_stt`` configures the STT stream as
    ``WEBM_OPUS``. If clients send compressed audio, the RMS value is not
    meaningful - confirm the wire format.
    """

    def __init__(self, threshold_ms: int = DEFAULT_SILENCE_THRESHOLD_MS, energy_threshold: float = DEFAULT_ENERGY_THRESHOLD):
        self.threshold_ms = threshold_ms
        self.energy_threshold = energy_threshold
        # time.monotonic() reading taken when the current silent run began;
        # None while the stream is not silent.
        self.silence_start = None
        self.sample_rate = 16000

    def update(self, audio_chunk: bytes) -> int:
        """Feed one chunk and return the current silence duration in ms.

        Returns 0 as soon as a non-silent chunk is seen.
        """
        # Local import keeps this block self-contained; a monotonic clock is
        # used because wall-clock time (datetime.now) can jump on NTP/DST
        # adjustments and yield negative or inflated durations.
        import time

        if not self.is_silence(audio_chunk):
            self.silence_start = None
            return 0

        now = time.monotonic()
        if self.silence_start is None:
            self.silence_start = now
        return int((now - self.silence_start) * 1000)

    def is_silence(self, audio_chunk: bytes) -> bool:
        """Return True when the chunk's normalized RMS is below the threshold.

        Empty chunks count as silence. If the energy cannot be computed the
        chunk is treated as non-silent and a warning is logged.
        """
        try:
            if len(audio_chunk) == 0:
                return True

            # 16-bit samples need an even byte count; drop a trailing odd byte.
            if len(audio_chunk) % 2 != 0:
                audio_chunk = audio_chunk[:-1]

            # Interpret the bytes as 16-bit PCM samples.
            audio_data = np.frombuffer(audio_chunk, dtype=np.int16)
            if len(audio_data) == 0:
                return True

            # Root-mean-square energy, scaled to the 0..1 range.
            rms = np.sqrt(np.mean(audio_data.astype(float) ** 2))
            normalized_rms = rms / 32768.0
            return bool(normalized_rms < self.energy_threshold)
        except Exception as e:
            log_warning(f"Silence detection error: {e}")
            return False

    def reset(self):
        """Forget any silent run in progress."""
        self.silence_start = None
class BargeInHandler: | |
"""Handle user interruptions during TTS playback""" | |
def __init__(self): | |
self.active_tts_task: Optional[asyncio.Task] = None | |
self.is_interrupting = False | |
self.lock = asyncio.Lock() | |
async def start_tts_task(self, coro): | |
"""Start a cancellable TTS task""" | |
async with self.lock: | |
# Cancel any existing task | |
if self.active_tts_task and not self.active_tts_task.done(): | |
self.active_tts_task.cancel() | |
try: | |
await self.active_tts_task | |
except asyncio.CancelledError: | |
pass | |
# Start new task | |
self.active_tts_task = asyncio.create_task(coro) | |
return self.active_tts_task | |
async def handle_interruption(self, current_state: ConversationState): | |
"""Handle barge-in interruption""" | |
async with self.lock: | |
self.is_interrupting = True | |
# Cancel TTS if active | |
if self.active_tts_task and not self.active_tts_task.done(): | |
log_info("Barge-in: Cancelling active TTS") | |
self.active_tts_task.cancel() | |
try: | |
await self.active_tts_task | |
except asyncio.CancelledError: | |
pass | |
# Reset flag after short delay | |
await asyncio.sleep(0.5) | |
self.is_interrupting = False | |
class RealtimeSession:
    """Per-connection state for one real-time voice conversation.

    Wraps the persisted ``Session`` together with the audio buffer, silence
    detector, barge-in handler and the streaming STT provider, and tracks the
    current ``ConversationState``.
    """

    def __init__(self, session: Session):
        self.session = session
        self.state = ConversationState.IDLE
        self.is_websocket_active = True

        # STT provider settings override the module-level defaults.
        config = ConfigProvider.get().global_config.stt_provider.settings
        silence_threshold = config.get("speech_timeout_ms", DEFAULT_SILENCE_THRESHOLD_MS)
        energy_threshold = config.get("energy_threshold", DEFAULT_ENERGY_THRESHOLD)
        buffer_max_size = config.get("audio_buffer_max_size", DEFAULT_AUDIO_BUFFER_MAX_SIZE)

        self.audio_buffer = AudioBuffer(max_size=buffer_max_size)
        self.silence_detector = SilenceDetector(
            threshold_ms=silence_threshold,
            energy_threshold=energy_threshold
        )
        self.barge_in_handler = BargeInHandler()

        self.stt_manager = None
        self.current_transcription = ""
        self.is_streaming = False
        self.lock = asyncio.Lock()

        # Kept so the values can be reported to the client later.
        self.audio_chunk_size = config.get("audio_chunk_size", DEFAULT_AUDIO_CHUNK_SIZE)
        self.silence_threshold_ms = silence_threshold

        # Number of audio chunks forwarded to the current STT stream.
        self.chunk_counter = 0

    async def initialize_stt(self):
        """Create an STT provider and start a streaming session on it.

        Returns:
            True when streaming started; False when no provider could be
            created or starting the stream failed (state is reset then).
        """
        try:
            # Every (re)start begins counting chunks from zero.
            self.chunk_counter = 0

            self.stt_manager = STTFactory.create_provider()
            if not self.stt_manager:
                log_error("❌ STT manager is None - STTFactory.create_provider() returned None", session_id=self.session.session_id)
                return False

            log_info(f"✅ STT manager created: {type(self.stt_manager).__name__}", session_id=self.session.session_id)

            config = ConfigProvider.get().global_config.stt_provider.settings

            # Session locale, falling back to 'tr' when the session has none.
            session_locale = getattr(self.session, 'locale', 'tr')

            # Map the short locale to the tag passed to STT (e.g. tr -> tr-TR).
            from locale_manager import LocaleManager
            locale_data = LocaleManager.get_locale(session_locale)
            language_code = locale_data.get('locale_tag', 'tr-TR')

            log_info(f"🌍 Session locale: {session_locale}, STT language: {language_code}", session_id=self.session.session_id)

            stt_config = {
                "language": language_code,
                "interim_results": config.get("interim_results", True),
                "single_utterance": True,
                "enable_punctuation": config.get("enable_punctuation", True),
                "sample_rate": 16000,
                "encoding": "WEBM_OPUS"
            }

            log_info(f"🎤 Starting STT streaming with config: {stt_config}", session_id=self.session.session_id)

            await self.stt_manager.start_streaming(stt_config)
            self.is_streaming = True

            log_info("✅ STT streaming started successfully", session_id=self.session.session_id)
            return True
        except Exception as e:
            log_error("❌ Failed to initialize STT", error=str(e), traceback=traceback.format_exc(), session_id=self.session.session_id)
            self.stt_manager = None
            self.is_streaming = False
            self.chunk_counter = 0
            return False

    async def stop_stt_streaming(self):
        """Stop the STT stream and drop the manager so a new one is created.

        Runs whenever a manager exists, even if ``is_streaming`` was already
        cleared (the audio handler does that after an STT timeout), so a
        stale provider stream is torn down instead of being leaked. Provider
        errors are logged and swallowed; state is always reset.
        """
        if self.stt_manager is None:
            return

        try:
            log_info("🛑 Stopping STT stream", session_id=self.session.session_id)
            await self.stt_manager.stop_streaming()
            log_info("✅ STT stream stopped and manager reset", session_id=self.session.session_id)
        except Exception as e:
            log_warning(f"⚠️ Error stopping STT stream: {e}", session_id=self.session.session_id)
        finally:
            self.is_streaming = False
            self.chunk_counter = 0
            # Reset the manager so the next start creates a fresh instance.
            self.stt_manager = None

    async def restart_stt_if_needed(self):
        """Restart STT when it is not streaming.

        A restart is attempted only while the WebSocket is active and the
        conversation is in the LISTENING state.

        Returns:
            True when STT is running or no restart was required; False when
            the restart failed.
        """
        try:
            if not self.is_streaming and self.is_websocket_active and self.state == ConversationState.LISTENING:
                log_info("🔄 Restarting STT stream...", session_id=self.session.session_id)

                # Tear down any leftover stream first.
                if self.stt_manager:
                    await self.stop_stt_streaming()

                # Short pause to let the STT backend recover before reconnecting.
                await asyncio.sleep(0.5)

                stt_initialized = await self.initialize_stt()
                if stt_initialized:
                    log_info("✅ STT stream restarted successfully", session_id=self.session.session_id)
                    return True

                log_error("❌ Failed to restart STT stream", session_id=self.session.session_id)
                return False

            return True
        except Exception as e:
            log_error("❌ Error restarting STT", error=str(e), traceback=traceback.format_exc(), session_id=self.session.session_id)
            return False

    async def change_state(self, new_state: ConversationState):
        """Switch to ``new_state`` under the session lock and log the transition."""
        async with self.lock:
            old_state = self.state
            self.state = new_state
            log_debug(
                f"State change: {old_state.value} → {new_state.value}",
                session_id=self.session.session_id
            )

    async def handle_barge_in(self):
        """No-op: barge-in is disabled, so this only logs a warning."""
        log_warning("⚠️ Barge-in called but disabled", session_id=self.session.session_id)

    async def reset_for_new_utterance(self):
        """Clear buffered audio, silence tracking and transcription state."""
        await self.audio_buffer.clear()
        self.silence_detector.reset()
        self.current_transcription = ""
        self.chunk_counter = 0

        # 'speech_started' is not assigned anywhere in this class; remove it
        # if some other code attached it to the instance.
        if hasattr(self, 'speech_started'):
            delattr(self, 'speech_started')

        log_info("🔄 Reset for new utterance complete", session_id=self.session.session_id)

    async def cleanup(self):
        """Mark the WebSocket inactive and stop STT; errors are only logged."""
        try:
            self.is_websocket_active = False
            await self.stop_stt_streaming()
            log_info("Cleaned up realtime session", session_id=self.session.session_id)
        except Exception as e:
            log_warning("Cleanup error", error=str(e), session_id=self.session.session_id)
# ========================= MAIN HANDLER ========================= | |
async def _send_welcome_tts(websocket: WebSocket, session: Session, welcome_text: str, session_id: str):
    """Synthesize the welcome text and send it to the client in base64 chunks."""
    tts_provider = TTSFactory.create_provider()
    if not tts_provider:
        log_warning("⚠️ No TTS provider available", session_id=session_id)
        return

    try:
        log_info("🎤 Generating welcome TTS...", session_id=session_id)

        # Run the text through the TTS preprocessor first.
        from tts_preprocessor import TTSPreprocessor
        preprocessor = TTSPreprocessor(language=session.locale)
        processed_text = preprocessor.preprocess(
            welcome_text,
            tts_provider.get_preprocessing_flags()
        )

        audio_data = await tts_provider.synthesize(processed_text)
        if not audio_data:
            return

        # Base64-encode the audio and split it into fixed-size chunks.
        audio_base64 = base64.b64encode(audio_data).decode('utf-8')
        chunk_size = 16384
        total_length = len(audio_base64)
        total_chunks = (total_length + chunk_size - 1) // chunk_size

        log_info(f"📤 Sending welcome TTS in {total_chunks} chunks", session_id=session_id)

        for chunk_index, offset in enumerate(range(0, total_length, chunk_size)):
            await websocket.send_json({
                "type": "tts_audio",
                "data": audio_base64[offset:offset + chunk_size],
                "chunk_index": chunk_index,
                "total_chunks": total_chunks,
                "is_last": chunk_index == total_chunks - 1,
                "mime_type": "audio/mpeg"
            })

        log_info("✅ Welcome TTS sent", session_id=session_id)
    except Exception as e:
        log_error("❌ Failed to send welcome TTS", error=str(e), traceback=traceback.format_exc(), session_id=session_id)


async def _send_welcome_message(websocket: WebSocket, session: Session, session_id: str):
    """Send the most recent assistant message in the history as text and TTS."""
    log_info("📋 Checking for welcome message in session history...", session_id=session_id)

    chat_history = session.chat_history
    if not chat_history:
        log_warning("⚠️ No messages in session history", session_id=session_id)
        return

    log_info(f"📋 Found {len(chat_history)} messages in history", session_id=session_id)

    # Walk the history backwards; the first assistant message found is used.
    for i, msg in enumerate(reversed(chat_history)):
        log_debug(f"📋 Message {i}: role={msg.get('role', 'unknown')}, content_preview={msg.get('content', '')[:50]}...", session_id=session_id)

        if msg.get('role') != 'assistant':
            continue

        welcome_text = msg.get('content', '')
        log_info(f"📢 Found welcome message: {welcome_text[:50]}...", session_id=session_id)

        # Text goes out first; a failure here does not prevent the TTS attempt.
        try:
            await websocket.send_json({
                "type": "assistant_response",
                "text": welcome_text,
                "is_welcome": True
            })
            log_info("✅ Welcome text sent via WebSocket", session_id=session_id)
        except Exception as e:
            log_error("❌ Failed to send welcome text", error=str(e), session_id=session_id)

        await _send_welcome_tts(websocket, session, welcome_text, session_id)
        return

    log_warning("⚠️ No assistant message found in history", session_id=session_id)


async def websocket_endpoint(websocket: WebSocket, session_id: str):
    """Main WebSocket endpoint for a real-time conversation.

    Accepts the connection, looks up the session, starts STT, sends the
    welcome message and then dispatches incoming messages (``audio_chunk``,
    ``control``, ``ping``) until the client disconnects or the session ends.
    Resources are always cleaned up on exit.
    """
    log_info("🔌 WebSocket connection attempt", session_id=session_id)
    await websocket.accept()
    log_info("✅ WebSocket accepted", session_id=session_id)

    session = session_store.get_session(session_id)
    if not session:
        log_error("❌ Session not found", session_id=session_id)
        await websocket.send_json({
            "type": "error",
            "message": "Session not found"
        })
        await websocket.close()
        return

    log_info("✅ Session found", session_id=session_id, project=session.project_name)

    # Mark the stored session as a real-time one.
    session.is_realtime = True
    session_store.update_session(session)

    realtime_session = RealtimeSession(session)

    log_info("🎤 Initializing STT...", session_id=session_id)
    stt_initialized = await realtime_session.initialize_stt()
    if not stt_initialized:
        # The connection stays open; the client is told STT is unavailable.
        log_error("❌ STT initialization failed", session_id=session_id)
        await websocket.send_json({
            "type": "error",
            "message": "STT initialization failed"
        })
    else:
        log_info("✅ STT initialized", session_id=session_id)

    await websocket.send_json({
        "type": "session_started",
        "session_id": session_id,
        "stt_initialized": stt_initialized
    })

    await _send_welcome_message(websocket, session, session_id)

    log_info("💬 Ready for conversation", session_id=session_id)

    try:
        while True:
            try:
                if not realtime_session.is_websocket_active:
                    log_info("🔌 WebSocket inactive, breaking loop", session_id=session_id)
                    break

                message = await asyncio.wait_for(
                    websocket.receive_json(),
                    timeout=60.0  # 60 second timeout
                )

                # Valid JSON that is not an object (e.g. a list) has no
                # "type"; skip it instead of failing the whole connection.
                if not isinstance(message, dict):
                    log_warning("⚠️ Ignoring non-object WebSocket message", session_id=session_id)
                    continue

                message_type = message.get("type")

                if message_type == "audio_chunk":
                    await handle_audio_chunk(websocket, realtime_session, message)
                elif message_type == "control":
                    await handle_control_message(websocket, realtime_session, message)
                elif message_type == "ping":
                    # Keep-alive from the client; answered without logging.
                    if realtime_session.is_websocket_active:
                        await websocket.send_json({"type": "pong"})
            except asyncio.TimeoutError:
                # Nothing received within the timeout: probe the client.
                if realtime_session.is_websocket_active:
                    await websocket.send_json({"type": "ping"})
    except WebSocketDisconnect as e:
        log_info("🔌 WebSocket disconnected", session_id=session_id, code=e.code, reason=e.reason)
    except Exception as e:
        # Errors that merely say the socket is already closed are not reported.
        if "WebSocket is not connected" not in str(e) and 'Cannot call "send"' not in str(e):
            log_error(
                "❌ WebSocket error",
                error=str(e),
                traceback=traceback.format_exc(),
                session_id=session_id
            )

            # Best effort only: the socket may already be unusable.
            if realtime_session.is_websocket_active:
                try:
                    await websocket.send_json({
                        "type": "error",
                        "message": str(e)
                    })
                except Exception as send_error:
                    log_debug(f"Could not deliver error message to client: {send_error}", session_id=session_id)
    finally:
        log_info("🧹 Cleaning up WebSocket connection", session_id=session_id)
        await realtime_session.cleanup()

        # Close the socket only if the client side still reports connected.
        try:
            if websocket.client_state.value == 1:  # 1 = CONNECTED state
                await websocket.close()
        except Exception as e:
            log_debug(f"WebSocket already closed or error during close: {e}", session_id=session_id)
# ========================= MESSAGE HANDLERS ========================= | |
async def _send_audio_error(websocket: WebSocket, session: RealtimeSession, error_type: str, message: str):
    """Best-effort error notification to the client; never raises."""
    if not session.is_websocket_active:
        return
    try:
        await websocket.send_json({
            "type": "error",
            "error_type": error_type,
            "message": message
        })
    except Exception as send_error:
        log_debug(f"Could not deliver {error_type} to client: {send_error}", session_id=session.session.session_id)


async def handle_audio_chunk(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
    """Handle one incoming ``audio_chunk`` message.

    Chunks are ignored while a response is being processed or played
    (barge-in is disabled). In the LISTENING state the chunk is buffered and
    streamed to STT; when STT yields a non-empty final transcription, STT is
    stopped, the input is processed and the session is reset for the next
    utterance.

    NOTE(review): when the state is LISTENING but ``session.stt_manager`` is
    None (e.g. after a failed restart), chunks are buffered and otherwise
    dropped without any error being reported - confirm this is intended.
    """
    session_id = session.session.session_id

    try:
        if not session.is_websocket_active:
            return

        audio_data = message.get("data")
        if not audio_data:
            log_warning("⚠️ Empty audio chunk received", session_id=session_id)
            return

        # Barge-in is disabled: drop audio while a turn is being handled.
        if session.state in (
            ConversationState.PLAYING_AUDIO,
            ConversationState.PROCESSING_TTS,
            ConversationState.PROCESSING_LLM,
            ConversationState.PROCESSING_STT,
        ):
            log_debug(f"🔇 Ignoring audio chunk during state: {session.state.value}", session_id=session_id)
            return

        if session.state == ConversationState.IDLE:
            await session.change_state(ConversationState.LISTENING)
            await websocket.send_json({
                "type": "state_change",
                "from": "idle",
                "to": "listening"
            })

            # Make sure STT is running when leaving IDLE.
            if not session.is_streaming:
                await session.restart_stt_if_needed()

        if session.state != ConversationState.LISTENING:
            return

        await session.audio_buffer.add_chunk(audio_data)
        decoded_audio = base64.b64decode(audio_data)

        # Keeps the detector's silence tracking current; the returned
        # duration is not used here.
        session.silence_detector.update(decoded_audio)

        if not (session.stt_manager and session.state == ConversationState.LISTENING):
            return

        if not session.is_streaming:
            log_warning("⚠️ STT not streaming, attempting to restart", session_id=session_id)
            restart_success = await session.restart_stt_if_needed()
            if not restart_success:
                await _send_audio_error(websocket, session, "stt_error", "STT streaming not available")
                return

        try:
            session.chunk_counter += 1
            if session.chunk_counter == 1:
                log_info("🎤 Started streaming audio to STT", session_id=session_id)
            elif session.chunk_counter % 100 == 0:
                log_info(f"📊 Sent {session.chunk_counter} chunks to STT so far...", session_id=session_id)

            async for result in session.stt_manager.stream_audio(decoded_audio):
                # Only final results are acted upon.
                if not result.is_final:
                    continue

                log_info(f"✅ FINAL TRANSCRIPTION: '{result.text}'", session_id=session_id)

                await websocket.send_json({
                    "type": "transcription",
                    "text": result.text,
                    "is_final": True,
                    "confidence": result.confidence
                })
                session.current_transcription = result.text

                if session.current_transcription:
                    # Stop STT before the turn is processed.
                    await session.stop_stt_streaming()

                    await session.change_state(ConversationState.PROCESSING_STT)
                    await websocket.send_json({
                        "type": "state_change",
                        "from": "listening",
                        "to": "processing_stt"
                    })

                    await process_user_input(websocket, session)
                    await session.reset_for_new_utterance()
                    return
        except Exception as e:
            error_msg = str(e)

            # Recognize the STT backend's stream-timeout errors by message.
            if "Audio Timeout Error" in error_msg or "stream duration" in error_msg or "Exceeded maximum allowed stream duration" in error_msg:
                log_warning("⚠️ STT timeout detected, restarting stream", session_id=session_id)
                # Mark the stream as down so the restart actually runs.
                session.is_streaming = False
                session.chunk_counter = 0
                await session.restart_stt_if_needed()
            else:
                log_error("❌ STT streaming error", error=error_msg, traceback=traceback.format_exc(), session_id=session_id)
                await _send_audio_error(websocket, session, "stt_error", f"STT error: {str(e)}")
    except Exception as e:
        log_error("❌ Error in handle_audio_chunk", error=str(e), traceback=traceback.format_exc(), session_id=session_id)
        await _send_audio_error(websocket, session, "audio_error", f"Audio processing error: {str(e)}")
async def handle_control_message(websocket: WebSocket, session: RealtimeSession, message: Dict[str, Any]):
    """Handle a ``control`` message, dispatching on its ``action`` field.

    Supported actions: ``start_session``, ``end_session``/``stop_session``,
    ``interrupt`` (ignored, barge-in is disabled), ``reset``,
    ``audio_ended`` and ``restart_stt``. Unknown actions are ignored.
    """
    action = message.get("action")
    session_id = session.session.session_id

    log_debug("🎮 Control message", action=action, session_id=session_id)

    if action == "start_session":
        # Report the effective session configuration to the client.
        await websocket.send_json({
            "type": "session_config",
            "session_id": session_id,
            "config": {
                "silence_threshold_ms": session.silence_threshold_ms,
                "audio_chunk_size": session.audio_chunk_size,
                "supports_barge_in": False  # barge-in is disabled
            }
        })

    elif action in ("end_session", "stop_session"):
        await session.cleanup()
        await websocket.close()

    elif action == "interrupt":
        log_warning("⚠️ Interrupt request ignored (barge-in disabled)", session_id=session_id)

    elif action == "reset":
        # Capture the state before it changes so "from" reports the real
        # previous state rather than always "idle".
        previous_state = session.state.value

        await session.reset_for_new_utterance()
        await session.stop_stt_streaming()
        await session.change_state(ConversationState.IDLE)
        await websocket.send_json({
            "type": "state_change",
            "from": previous_state,
            "to": "idle"
        })

    elif action == "audio_ended":
        # The client finished playing the response audio.
        if session.state == ConversationState.PLAYING_AUDIO:
            log_info("🎵 Client reported audio ended", session_id=session_id)
            await session.change_state(ConversationState.LISTENING)
            await websocket.send_json({
                "type": "state_change",
                "from": "playing_audio",
                "to": "listening"
            })
            await session.restart_stt_if_needed()

    elif action == "restart_stt":
        # Manual restart; it only takes effect in the LISTENING state
        # because restart_stt_if_needed checks for it.
        log_info("🔄 Manual STT restart requested", session_id=session_id)
        await session.stop_stt_streaming()
        await session.restart_stt_if_needed()
# ========================= PROCESSING FUNCTIONS ========================= | |
async def process_user_input(websocket: WebSocket, session: RealtimeSession):
    """Run one conversation turn for the current final transcription.

    Stops STT, obtains the LLM response, sends it as text, streams TTS when
    a provider is available, then returns to LISTENING and restarts STT. On
    any error the session is reset and put back into LISTENING.

    NOTE(review): the final transcription is sent again here with a fixed
    confidence of 0.95 although ``handle_audio_chunk`` already sent it with
    the real confidence - confirm the client expects both messages.
    """
    session_id = session.session.session_id

    try:
        # STT must be fully stopped before LLM processing starts.
        await session.stop_stt_streaming()

        if not session.is_websocket_active:
            return

        user_text = session.current_transcription
        if not user_text:
            log_warning("⚠️ Empty transcription, returning to listening", session_id=session_id)
            await session.change_state(ConversationState.LISTENING)
            await session.restart_stt_if_needed()
            return

        log_info("🎯 Processing user input", text=user_text, session_id=session_id)

        if session.is_websocket_active:
            await websocket.send_json({
                "type": "transcription",
                "text": user_text,
                "is_final": True,
                "confidence": 0.95
            })

        await session.change_state(ConversationState.PROCESSING_LLM)
        if session.is_websocket_active:
            await websocket.send_json({
                "type": "state_change",
                "from": "processing_stt",
                "to": "processing_llm"
            })

        session.session.add_message("user", user_text)

        # The handler depends on whether parameters are being collected.
        log_info("🤖 Getting LLM response", session_state=session.session.state, session_id=session_id)
        if session.session.state == "collect_params":
            response_text = await handle_parameter_followup(session.session, user_text)
        else:
            response_text = await handle_new_message(session.session, user_text)

        log_info(f"💬 LLM response: {response_text[:50]}...", session_id=session_id)

        session.session.add_message("assistant", response_text)

        if session.is_websocket_active:
            await websocket.send_json({
                "type": "assistant_response",
                "text": response_text
            })

        tts_provider = TTSFactory.create_provider()
        log_info(f"🔍 TTS provider check: {tts_provider is not None}", session_id=session_id)

        if tts_provider and session.is_websocket_active:
            await session.change_state(ConversationState.PROCESSING_TTS)
            if session.is_websocket_active:
                await websocket.send_json({
                    "type": "state_change",
                    "from": "processing_llm",
                    "to": "processing_tts"
                })

            log_info("🎵 Starting TTS generation for response", session_id=session_id)

            # Barge-in is disabled, so TTS runs to completion here.
            await generate_and_stream_tts(websocket, session, tts_provider, response_text)

            # After TTS, go back to listening.
            await session.change_state(ConversationState.LISTENING)
            if session.is_websocket_active:
                await websocket.send_json({
                    "type": "state_change",
                    "from": "playing_audio",
                    "to": "listening"
                })

            log_info("🔄 Restarting STT after TTS completion", session_id=session_id)
            await session.restart_stt_if_needed()
        else:
            log_info("⚠️ No TTS provider or WebSocket inactive", session_id=session_id)

            # No TTS: go straight back to listening.
            await session.change_state(ConversationState.LISTENING)
            if session.is_websocket_active:
                await websocket.send_json({
                    "type": "state_change",
                    "from": "processing_llm",
                    "to": "listening"
                })
            await session.restart_stt_if_needed()
    except Exception as e:
        log_error(
            "❌ Error processing user input",
            error=str(e),
            traceback=traceback.format_exc(),
            session_id=session_id
        )

        # Notifying the client is best effort: a failed send must not skip
        # the recovery steps below.
        if session.is_websocket_active:
            try:
                await websocket.send_json({
                    "type": "error",
                    "message": f"Processing error: {str(e)}"
                })
            except Exception as send_error:
                log_warning(f"⚠️ Could not deliver processing error to client: {send_error}", session_id=session_id)

        # Recover: reset, return to listening and restart STT.
        await session.reset_for_new_utterance()
        await session.change_state(ConversationState.LISTENING)
        await session.restart_stt_if_needed()
async def generate_and_stream_tts(
    websocket: WebSocket,
    session: RealtimeSession,
    tts_provider,
    text: str
):
    """Synthesize ``text`` and stream the audio to the client.

    The audio is base64-encoded and sent as ``tts_audio`` messages of up to
    16384 characters each. The transition back to LISTENING after a
    successful run is left to the caller; on error this function reports the
    failure, switches to LISTENING and restarts STT itself.
    """
    session_id = session.session.session_id

    try:
        # STT must be fully stopped before TTS starts.
        await session.stop_stt_streaming()

        log_info(f"🎤 Starting TTS generation for text: '{text[:50]}...'", session_id=session_id)

        from tts_preprocessor import TTSPreprocessor
        preprocessor = TTSPreprocessor(language=session.session.locale)
        processed_text = preprocessor.preprocess(
            text,
            tts_provider.get_preprocessing_flags()
        )

        log_debug(f"📝 Preprocessed text: '{processed_text[:50]}...'", session_id=session_id)

        audio_data = await tts_provider.synthesize(processed_text)

        # Same guard as the welcome-message path: a provider may return no
        # audio, in which case there is nothing to stream.
        if not audio_data:
            log_warning("⚠️ TTS returned no audio, skipping playback", session_id=session_id)
            return

        log_info(f"✅ TTS generated: {len(audio_data)} bytes, type: {type(audio_data)}", session_id=session_id)

        if not session.is_websocket_active:
            log_warning("⚠️ WebSocket inactive, skipping TTS streaming", session_id=session_id)
            return

        await session.change_state(ConversationState.PLAYING_AUDIO)
        if session.is_websocket_active:
            await websocket.send_json({
                "type": "state_change",
                "from": "processing_tts",
                "to": "playing_audio"
            })

        # Encode the whole clip once, then slice the encoded text.
        log_debug("📦 Converting audio to base64...")
        audio_base64 = base64.b64encode(audio_data).decode('utf-8')
        log_info(f"📊 Base64 conversion complete: {len(audio_base64)} chars from {len(audio_data)} bytes", session_id=session_id)

        # Preview of the encoded data for debugging.
        log_debug(f"🔍 Base64 preview: {audio_base64[:100]}...")

        chunk_size = 16384  # characters of base64 per message
        total_length = len(audio_base64)
        total_chunks = (total_length + chunk_size - 1) // chunk_size

        log_info(f"📤 Streaming TTS audio: {len(audio_data)} bytes as {total_length} base64 chars in {total_chunks} chunks", session_id=session_id)

        chunks_sent = 0
        for chunk_index, offset in enumerate(range(0, total_length, chunk_size)):
            if not session.is_websocket_active:
                log_warning("⚠️ WebSocket inactive during streaming, stopping", session_id=session_id)
                break

            chunk = audio_base64[offset:offset + chunk_size]
            is_last = chunk_index == total_chunks - 1

            log_debug(f"📨 Sending chunk {chunk_index}/{total_chunks}, size: {len(chunk)}, is_last: {is_last}")

            await websocket.send_json({
                "type": "tts_audio",
                "data": chunk,
                "chunk_index": chunk_index,
                "total_chunks": total_chunks,
                "is_last": is_last,
                "mime_type": "audio/mpeg"
            })
            chunks_sent += 1

            # Small delay to prevent overwhelming the client.
            await asyncio.sleep(0.01)

        log_info(
            "✅ TTS streaming completed successfully",
            session_id=session_id,
            text_length=len(text),
            audio_size=len(audio_data),
            chunks_sent=chunks_sent
        )
    except Exception as e:
        error_msg = str(e)
        log_error(
            "❌ TTS generation error",
            error=error_msg,
            traceback=traceback.format_exc(),
            session_id=session_id
        )

        # Quota errors get a dedicated message type for the client.
        if "quota_exceeded" in error_msg:
            if session.is_websocket_active:
                await websocket.send_json({
                    "type": "tts_error",
                    "message": "TTS servisinin kredi limiti aşıldı. Yanıt sadece metin olarak gösterilecek.",
                    "error_type": "quota_exceeded"
                })
        else:
            if session.is_websocket_active:
                await websocket.send_json({
                    "type": "error",
                    "message": f"TTS error: {error_msg}"
                })

        # After a TTS failure, return to listening and restart STT.
        await session.change_state(ConversationState.LISTENING)
        if session.is_websocket_active:
            await websocket.send_json({
                "type": "state_change",
                "from": "processing_tts",
                "to": "listening"
            })

        await session.restart_stt_if_needed()