"""Audio API endpoints for Flare
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Provides text-to-speech (TTS) and speech-to-text (STT) endpoints.
"""

from fastapi import APIRouter, HTTPException, Response, Body
from pydantic import BaseModel
from typing import Optional
from datetime import datetime
import sys

from logger import log_info, log_error, log_warning, log_debug
from tts_factory import TTSFactory
from tts_preprocessor import TTSPreprocessor
from config_provider import ConfigProvider

router = APIRouter(tags=["audio"])
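
# NOTE: minimal wiring sketch (assumption, not defined in this module) — the main
# FastAPI app is expected to mount this router, e.g.:
#   app.include_router(router, prefix="/api")
# The actual prefix and mount point depend on the application's setup code.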

# ===================== Models =====================
class TTSRequest(BaseModel):
    text: str
    voice_id: Optional[str] = None
    language: Optional[str] = "tr-TR"
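    # Illustrative request body (example values only):
    # {"text": "Merhaba", "voice_id": null, "language": "tr-TR"}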

class STTRequest(BaseModel):
    audio_data: str  # Base64 encoded audio
    language: Optional[str] = "tr-TR"
    format: Optional[str] = "webm"  # webm, wav, mp3
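    # Illustrative request body (example values only):
    # {"audio_data": "<base64-encoded audio>", "language": "tr-TR", "format": "webm"}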

# ===================== Helpers =====================
def log(message: str):
    """Log helper with timestamp"""
    timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]
    print(f"[{timestamp}] {message}")
    sys.stdout.flush()

# ===================== TTS Endpoints =====================
@router.post("/tts/generate")
async def generate_tts(request: TTSRequest):
    """Generate TTS audio from text - public endpoint for chat"""
    try:
        # Create TTS provider
        tts_provider = TTSFactory.create_provider()
        
        if not tts_provider:
            # Return empty response for no TTS
            log_info("πŸ“΅ TTS disabled - returning empty response")
            return Response(
                content=b"",
                media_type="audio/mpeg",
                headers={"X-TTS-Status": "disabled"}
            )
        
        log_info(f"🎀 TTS request: '{request.text[:50]}...' with provider: {tts_provider.get_provider_name()}")
        
        # Preprocess text if needed
        preprocessor = TTSPreprocessor(language=request.language)
        processed_text = preprocessor.preprocess(
            request.text,
            tts_provider.get_preprocessing_flags()
        )
        
        log_debug(f"πŸ“ Preprocessed text: {processed_text[:100]}...")
        
        # Generate audio
        audio_data = await tts_provider.synthesize(
            text=processed_text,
            voice_id=request.voice_id
        )
        
        log_info(f"βœ… TTS generated {len(audio_data)} bytes of audio")
        
        # Return audio as binary response
        return Response(
            content=audio_data,
            media_type="audio/mpeg",
            headers={
                "Content-Disposition": 'inline; filename="tts_output.mp3"',
                "X-TTS-Provider": tts_provider.get_provider_name(),
                "X-TTS-Language": request.language,
                "Cache-Control": "no-cache"
            }
        )
        
    except Exception as e:
        log_error("❌ TTS generation error", e)
        raise HTTPException(
            status_code=500, 
            detail=f"TTS generation failed: {str(e)}"
        )

@router.get("/tts/voices")
async def get_tts_voices():
    """Get available TTS voices - public endpoint"""
    try:
        tts_provider = TTSFactory.create_provider()
        
        if not tts_provider:
            return {
                "voices": [],
                "provider": "none",
                "enabled": False
            }
        
        voices = tts_provider.get_supported_voices()
        
        # Convert dict to list format
        voice_list = [
            {"id": voice_id, "name": voice_name}
            for voice_id, voice_name in voices.items()
        ]
        
        return {
            "voices": voice_list,
            "provider": tts_provider.get_provider_name(),
            "enabled": True
        }
        
    except Exception as e:
        log_error("❌ Error getting TTS voices", e)
        return {
            "voices": [],
            "provider": "error",
            "enabled": False,
            "error": str(e)
        }

@router.get("/tts/status")
async def get_tts_status():
    """Get TTS service status"""
    cfg = ConfigProvider.get()
    
    return {
        "enabled": cfg.global_config.tts_provider.name != "no_tts",
        "provider": cfg.global_config.tts_provider.name,
        "provider_config": {
            "name": cfg.global_config.tts_provider.name,
            "has_api_key": bool(cfg.global_config.tts_provider.api_key),
            "endpoint": cfg.global_config.tts_provider.endpoint
        }
    }

# ===================== STT Endpoints =====================
@router.post("/stt/transcribe")
async def transcribe_audio(request: STTRequest):
    """Transcribe audio to text"""
    try:
        from stt_factory import STTFactory
        from stt_interface import STTConfig
        import base64
        
        # Create STT provider
        stt_provider = STTFactory.create_provider()
        
        if not stt_provider or not stt_provider.supports_realtime():
            log_warning("πŸ“΅ STT disabled or doesn't support transcription")
            raise HTTPException(
                status_code=503,
                detail="STT service not available"
            )
        
        # Get config
        cfg = ConfigProvider.get()
        stt_config = cfg.global_config.stt_provider.settings
        
        # Decode audio data
        audio_bytes = base64.b64decode(request.audio_data)
        
        # Create STT config
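        # NOTE: the encoding is derived by uppercasing the request format
        # ("webm" -> "WEBM"); whether the provider instead expects a value such
        # as "WEBM_OPUS" is provider-specific (assumption, not verified here).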
        config = STTConfig(
            language=request.language or stt_config.get("language", "tr-TR"),
            sample_rate=16000,
            encoding=request.format.upper() if request.format else "WEBM_OPUS",
            enable_punctuation=stt_config.get("enable_punctuation", True),
            enable_word_timestamps=False,
            model=stt_config.get("model", "latest_long"),
            use_enhanced=stt_config.get("use_enhanced", True),
            single_utterance=True,
            interim_results=False
        )
        
        # Start streaming session
        await stt_provider.start_streaming(config)
        
        # Process audio
        transcription = ""
        confidence = 0.0
        
        try:
            async for result in stt_provider.stream_audio(audio_bytes):
                if result.is_final:
                    transcription = result.text
                    confidence = result.confidence
                    break
        finally:
            # Stop streaming
            await stt_provider.stop_streaming()
        
        log_info(f"βœ… STT transcription completed: '{transcription[:50]}...'")
        
        return {
            "text": transcription,
            "confidence": confidence,
            "language": request.language,
            "provider": stt_provider.get_provider_name()
        }
        
    except HTTPException:
        raise
    except Exception as e:
        log_error("❌ STT transcription error", e)
        raise HTTPException(
            status_code=500,
            detail=f"Transcription failed: {str(e)}"
        )

@router.get("/stt/languages")
async def get_stt_languages():
    """Get supported STT languages"""
    try:
        from stt_factory import STTFactory
        
        stt_provider = STTFactory.create_provider()
        
        if not stt_provider:
            return {
                "languages": [],
                "provider": "none",
                "enabled": False
            }
        
        languages = stt_provider.get_supported_languages()
        
        return {
            "languages": languages,
            "provider": stt_provider.get_provider_name(),
            "enabled": True
        }
        
    except Exception as e:
        log_error("❌ Error getting STT languages", e)
        return {
            "languages": [],
            "provider": "error",
            "enabled": False,
            "error": str(e)
        }

@router.get("/stt/status")
async def get_stt_status():
    """Get STT service status"""
    cfg = ConfigProvider.get()
    
    return {
        "enabled": cfg.global_config.stt_provider.name != "no_stt",
        "provider": cfg.global_config.stt_provider.name,
        "provider_config": {
            "name": cfg.global_config.stt_provider.name,
            "has_api_key": bool(cfg.global_config.stt_provider.api_key),
            "endpoint": cfg.global_config.stt_provider.endpoint
        }
    }