Spaces:
Building
Building
"""
TTS Interface and Implementations
"""
from abc import ABC, abstractmethod | |
from typing import Optional, Dict, Any, Set | |
import httpx | |
import os | |
from datetime import datetime | |
import sys | |
from tts_preprocessor import TTSPreprocessor | |
def log(message: str):
    """Print ``message`` to stdout behind a ``[HH:MM:SS.mmm]`` timestamp.

    Stdout is flushed after every call.
    """
    now = datetime.now()
    # %f renders six microsecond digits; trimming the last three leaves milliseconds.
    stamp = now.strftime("%H:%M:%S.%f")[:-3]
    line = f"[{stamp}] {message}"
    print(line)
    sys.stdout.flush()
class TTSInterface(ABC):
    """Abstract base class for TTS providers.

    Concrete providers must implement :meth:`synthesize` and
    :meth:`get_supported_voices`. Both are declared abstract so that an
    incomplete provider fails at instantiation time instead of silently
    returning ``None`` where audio bytes or a voice mapping is expected.
    """

    def __init__(self):
        # Preprocessing flags required by the provider; empty unless a subclass sets them.
        self.preprocessing_flags: Set[str] = set()
        # Whether the provider accepts SSML; disabled unless a subclass enables it.
        self.supports_ssml: bool = False

    @abstractmethod
    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Convert text to speech and return audio bytes.

        Args:
            text: The text to synthesize.
            voice_id: Provider-specific voice identifier, or ``None`` to let
                the provider choose.
            **kwargs: Provider-specific options.

        Returns:
            The synthesized audio as raw bytes.
        """

    @abstractmethod
    def get_supported_voices(self) -> Dict[str, str]:
        """Get the supported voices as a string-to-string mapping."""

    def get_preprocessing_flags(self) -> Set[str]:
        """Get preprocessing flags for this provider."""
        return self.preprocessing_flags

    def supports_ssml_format(self) -> bool:
        """Check if provider supports SSML."""
        return self.supports_ssml
class ElevenLabsTTS(TTSInterface):
    """ElevenLabs TTS implementation.

    Input text is first run through a ``TTSPreprocessor`` (created with
    ``language="tr"``) using this provider's preprocessing flags, then sent
    to the ElevenLabs text-to-speech HTTP endpoint.
    """

    def __init__(self, api_key: str):
        """Configure the provider.

        Args:
            api_key: ElevenLabs API key, sent in the ``xi-api-key`` header.
        """
        super().__init__()
        self.api_key = api_key
        self.base_url = "https://api.elevenlabs.io/v1"
        self.default_voice_id = "2thYbn2sOGtiTwd9QwWH"  # Avencia

        # Kinds of content that need preprocessing for ElevenLabs
        self.preprocessing_flags = {
            TTSPreprocessor.PREPROCESS_NUMBERS,     # Large numbers
            TTSPreprocessor.PREPROCESS_CURRENCY,    # Currencies
            TTSPreprocessor.PREPROCESS_TIME,        # Time format
            TTSPreprocessor.PREPROCESS_CODES,       # PNR codes
            TTSPreprocessor.PREPROCESS_PERCENTAGE,  # Percentages
        }
        self.preprocessor = TTSPreprocessor(language="tr")

    async def synthesize(
        self,
        text: str,
        voice_id: Optional[str] = None,
        model_id: Optional[str] = None,
        output_format: Optional[str] = None,
        **kwargs
    ) -> bytes:
        """Convert text to speech using the ElevenLabs API.

        Args:
            text: Raw text; it is preprocessed before being sent.
            voice_id: Voice to use; falls back to ``self.default_voice_id``.
            model_id: Model to use; falls back to ``"eleven_multilingual_v2"``.
            output_format: Audio format; falls back to ``"mp3_44100_128"``.
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            The response body (audio data) as bytes.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        # Preprocess text
        processed_text = self.preprocessor.preprocess(text, self.preprocessing_flags)

        # Use defaults if not provided
        voice_id = voice_id or self.default_voice_id
        model_id = model_id or "eleven_multilingual_v2"
        output_format = output_format or "mp3_44100_128"

        url = f"{self.base_url}/text-to-speech/{voice_id}"
        # output_format always has a value at this point, so it is always
        # sent. Passing it via ``params`` lets httpx build and percent-encode
        # the query string instead of splicing raw text into the URL.
        params = {"output_format": output_format}

        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": self.api_key,
        }

        data = {
            "text": processed_text,
            "model_id": model_id,
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75,
                "style": 0.0,
                "use_speaker_boost": True,
            },
        }

        try:
            async with httpx.AsyncClient() as client:
                log(f"🎤 ElevenLabs TTS request: voice={voice_id}, model={model_id}")
                log(f"📝 Text (first 100 chars): {processed_text[:100]}...")

                response = await client.post(
                    url,
                    params=params,
                    json=data,
                    headers=headers,
                    timeout=30.0,
                )
                response.raise_for_status()

                audio_data = response.content
                log(f"✅ ElevenLabs TTS returned {len(audio_data)} bytes")
                return audio_data

        except httpx.HTTPStatusError as e:
            log(f"❌ ElevenLabs API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            # Log for visibility, then propagate so callers can decide how to react.
            log(f"❌ TTS synthesis error: {e}")
            raise

    def get_supported_voices(self) -> Dict[str, str]:
        """Get default voices - full list can be fetched from API.

        Returns:
            A hard-coded mapping of voice id to display name.
        """
        return {
            "21m00Tcm4TlvDq8ikWAM": "Rachel (Female)",
            "EXAVITQu4vr4xnSDxMaL": "Bella (Female)",
            "ErXwobaYiN019PkySvjV": "Antoni (Male)",
            "VR6AewLTigWG4xSOukaG": "Arnold (Male)",
            "pNInz6obpgDQGcFmaJgB": "Adam (Male)",
            "yoZ06aMxZJJ28mfd3POQ": "Sam (Male)",
        }
class BlazeTTS(TTSInterface):
    """Stub provider reserved for a future Blaze TTS integration."""

    def __init__(self, api_key: str):
        """Run the base initializer and keep the API key on the instance."""
        super().__init__()
        self.api_key = api_key

    def get_supported_voices(self) -> Dict[str, str]:
        """Return an empty mapping; this stub exposes no voices."""
        return dict()

    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Always raise ``NotImplementedError``; synthesis is not available yet."""
        raise NotImplementedError("Blaze TTS not implemented yet")
def create_tts_provider(engine: str, api_key: Optional[str] = None) -> Optional[TTSInterface]:
    """Build the TTS provider selected by ``engine``.

    Args:
        engine: ``"elevenlabs"``, ``"blaze"`` or ``"no_tts"``.
        api_key: Key handed to the provider; the ``"elevenlabs"`` and
            ``"blaze"`` engines are only created when it is truthy.

    Returns:
        A provider instance, or ``None`` when TTS is disabled
        (``"no_tts"``), the engine is unknown, or the key is missing.
        The last two cases also log a warning.
    """
    # TTS explicitly switched off: nothing to build and nothing to warn about.
    if engine == "no_tts":
        return None

    if api_key:
        if engine == "elevenlabs":
            return ElevenLabsTTS(api_key)
        if engine == "blaze":
            return BlazeTTS(api_key)

    # Either the engine name is not recognised or a known engine has no key.
    log(f"⚠️ Unknown or unconfigured TTS engine: {engine}")
    return None