File size: 5,680 Bytes
f563475
 
 
 
 
b861704
f563475
 
 
 
 
a879708
 
f563475
 
 
 
 
 
 
 
d6da344
 
 
 
f563475
 
d6da344
f563475
 
 
 
 
 
d6da344
 
 
b861704
d6da344
b861704
d6da344
 
f563475
 
 
 
 
b861704
b5d0779
f563475
c146995
e6f59c3
7368651
 
 
 
 
 
 
 
 
b5d0779
7368651
b861704
 
 
 
 
 
 
 
 
f563475
b861704
 
559bf23
b861704
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
773eda6
f563475
b861704
 
 
 
 
 
 
 
 
 
 
f563475
b861704
 
f563475
b861704
f563475
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b861704
f563475
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
TTS Interface and Implementations
"""

from abc import ABC, abstractmethod
from typing import Optional, Dict, Any, Set
import httpx
import os
from datetime import datetime
import sys

from tts_preprocessor import TTSPreprocessor

def log(message: str):
    """Print ``message`` to stdout behind a ``[HH:MM:SS.mmm]`` local-time prefix.

    stdout is flushed after every call so the line is emitted immediately
    rather than sitting in a buffer.
    """
    now = datetime.now()
    # Milliseconds are the microsecond field truncated (not rounded) to 3 digits.
    stamp = f"{now:%H:%M:%S}.{now.microsecond // 1000:03d}"
    print(f"[{stamp}] {message}")
    sys.stdout.flush()

class TTSInterface(ABC):
    """Common contract shared by every text-to-speech provider.

    Concrete providers must implement :meth:`synthesize` and
    :meth:`get_supported_voices`. The two accessor methods simply expose
    the per-instance settings initialised in ``__init__``.
    """

    def __init__(self):
        # Preprocessing flags for this provider; starts out empty.
        self.preprocessing_flags: Set[str] = set()
        # SSML support indicator; False unless a provider changes it.
        self.supports_ssml: bool = False

    @abstractmethod
    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Turn ``text`` into speech and return the audio as raw bytes."""

    @abstractmethod
    def get_supported_voices(self) -> Dict[str, str]:
        """Return the provider's supported voices as a str-to-str mapping."""

    def get_preprocessing_flags(self) -> Set[str]:
        """Return this provider's preprocessing flag set (the stored object, not a copy)."""
        return self.preprocessing_flags

    def supports_ssml_format(self) -> bool:
        """Report whether this provider is marked as supporting SSML."""
        return self.supports_ssml

class ElevenLabsTTS(TTSInterface):
    """ElevenLabs TTS implementation.

    Text is passed through a ``TTSPreprocessor`` and then posted to the
    ElevenLabs ``text-to-speech`` REST endpoint; the response body is
    returned as audio bytes.
    """

    # Request defaults, applied whenever the caller does not override them.
    DEFAULT_MODEL_ID = "eleven_multilingual_v2"
    DEFAULT_OUTPUT_FORMAT = "mp3_44100_128"
    REQUEST_TIMEOUT_SECONDS = 30.0
    DEFAULT_VOICE_SETTINGS = {
        "stability": 0.5,
        "similarity_boost": 0.75,
        "style": 0.0,
        "use_speaker_boost": True,
    }

    def __init__(self, api_key: str):
        """Store the credentials and configure preprocessing.

        Args:
            api_key: ElevenLabs API key; surrounding whitespace is removed.
        """
        super().__init__()
        self.api_key = api_key.strip()  # Strip leading/trailing whitespace
        self.base_url = "https://api.elevenlabs.io/v1"
        self.default_voice_id = "2thYbn2sOGtiTwd9QwWH"  # Avencia

        # Kinds of content that need preprocessing for ElevenLabs
        self.preprocessing_flags = {
            TTSPreprocessor.PREPROCESS_NUMBERS,     # Large numbers
            TTSPreprocessor.PREPROCESS_CURRENCY,    # Currencies
            TTSPreprocessor.PREPROCESS_TIME,        # Time format
            TTSPreprocessor.PREPROCESS_CODES,       # PNR codes
            TTSPreprocessor.PREPROCESS_PERCENTAGE,  # Percentages
        }

        # Locale tr-TR is mapped to the short code "tr" for the preprocessor
        self.preprocessor = TTSPreprocessor(language="tr")

    async def synthesize(
        self,
        text: str,
        voice_id: Optional[str] = None,
        model_id: Optional[str] = None,
        output_format: Optional[str] = None,
        **kwargs
    ) -> bytes:
        """Convert text to speech using the ElevenLabs API.

        Args:
            text: Text to speak; it is preprocessed with this provider's
                flags before being sent.
            voice_id: Voice to use; falls back to ``self.default_voice_id``.
            model_id: Model to use; falls back to ``DEFAULT_MODEL_ID``.
            output_format: Value of the ``output_format`` query parameter;
                falls back to ``DEFAULT_OUTPUT_FORMAT``.
            **kwargs: An optional ``voice_settings`` dict whose entries
                override ``DEFAULT_VOICE_SETTINGS``. Other keys are ignored.

        Returns:
            The response body (audio data) as bytes.

        Raises:
            httpx.HTTPStatusError: When the API answers with an error status.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        processed_text = self.preprocessor.preprocess(text, self.preprocessing_flags)

        # Fall back to defaults for anything the caller left out (or empty).
        voice_id = voice_id or self.default_voice_id
        model_id = model_id or self.DEFAULT_MODEL_ID
        output_format = output_format or self.DEFAULT_OUTPUT_FORMAT

        url = f"{self.base_url}/text-to-speech/{voice_id}"

        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": self.api_key,
        }

        # Merge into a fresh dict so the class-level defaults are never mutated.
        voice_settings = {
            **self.DEFAULT_VOICE_SETTINGS,
            **(kwargs.get("voice_settings") or {}),
        }

        data = {
            "text": processed_text,
            "model_id": model_id,
            "voice_settings": voice_settings,
        }

        # Only show an ellipsis when the logged text was actually cut short.
        preview = processed_text[:100]
        if len(processed_text) > 100:
            preview += "..."

        try:
            async with httpx.AsyncClient() as client:
                log(f"🎤 ElevenLabs TTS request: voice={voice_id}, model={model_id}")
                log(f"📝 Text (first 100 chars): {preview}")

                response = await client.post(
                    url,
                    # httpx builds and encodes the query string for us.
                    params={"output_format": output_format},
                    json=data,
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT_SECONDS,
                )

                response.raise_for_status()
                audio_data = response.content

                log(f"✅ ElevenLabs TTS returned {len(audio_data)} bytes")
                return audio_data

        except httpx.HTTPStatusError as e:
            log(f"❌ ElevenLabs API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            log(f"❌ TTS synthesis error: {e}")
            raise

    def get_supported_voices(self) -> Dict[str, str]:
        """Get default voices - the full list can be fetched from the API.

        Returns:
            A hard-coded mapping of voice id to display name. Note that
            ``self.default_voice_id`` is not one of the listed ids.
        """
        return {
            "21m00Tcm4TlvDq8ikWAM": "Rachel (Female)",
            "EXAVITQu4vr4xnSDxMaL": "Bella (Female)",
            "ErXwobaYiN019PkySvjV": "Antoni (Male)",
            "VR6AewLTigWG4xSOukaG": "Arnold (Male)",
            "pNInz6obpgDQGcFmaJgB": "Adam (Male)",
            "yoZ06aMxZJJ28mfd3POQ": "Sam (Male)",
        }

class BlazeTTS(TTSInterface):
    """Placeholder provider for Blaze TTS; synthesis is not implemented yet."""

    def __init__(self, api_key: str):
        """Keep the API key exactly as given; nothing else is configured."""
        super().__init__()
        self.api_key = api_key

    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Always raise ``NotImplementedError``, whatever the arguments."""
        raise NotImplementedError("Blaze TTS not implemented yet")

    def get_supported_voices(self) -> Dict[str, str]:
        """Return an empty mapping; this placeholder defines no voices."""
        return dict()


def create_tts_provider(engine: str, api_key: Optional[str] = None) -> Optional[TTSInterface]:
    """Build the TTS provider selected by ``engine``.

    Args:
        engine: ``"elevenlabs"``, ``"blaze"`` or ``"no_tts"``.
        api_key: Key handed to the provider; must be non-empty for
            ``"elevenlabs"`` and ``"blaze"``.

    Returns:
        A provider instance, or ``None`` when the engine is ``"no_tts"``,
        is not recognised, or is recognised but has no usable ``api_key``
        (the latter two cases also emit a warning through ``log``).
    """
    # TTS explicitly switched off: nothing to build and nothing to warn about.
    if engine == "no_tts":
        return None

    # Real engines are only constructed when a key is present.
    if api_key:
        if engine == "elevenlabs":
            return ElevenLabsTTS(api_key)
        if engine == "blaze":
            return BlazeTTS(api_key)

    log(f"⚠️ Unknown or unconfigured TTS engine: {engine}")
    return None