Spaces:
Building
Building
File size: 5,680 Bytes
f563475 b861704 f563475 a879708 f563475 d6da344 f563475 d6da344 f563475 d6da344 b861704 d6da344 b861704 d6da344 f563475 b861704 b5d0779 f563475 c146995 e6f59c3 7368651 b5d0779 7368651 b861704 f563475 b861704 559bf23 b861704 773eda6 f563475 b861704 f563475 b861704 f563475 b861704 f563475 b861704 f563475 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
"""
TTS Interface and Implementations
"""
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any, Set
import httpx
import os
from datetime import datetime
import sys
from tts_preprocessor import TTSPreprocessor
def log(message: str):
    """Print ``message`` to stdout prefixed with a millisecond timestamp.

    The line has the form ``[HH:MM:SS.mmm] message`` and stdout is flushed
    right away so the output shows up immediately.
    """
    # %f yields six microsecond digits; dropping the last three leaves ms.
    now_text = datetime.now().strftime("%H:%M:%S.%f")
    millis_stamp = now_text[:-3]
    print(f"[{millis_stamp}] {message}", flush=True)
class TTSInterface(ABC):
    """Common contract that every text-to-speech provider implements.

    Subclasses must provide ``synthesize`` and ``get_supported_voices``.
    The base initializer gives each instance an empty set of preprocessing
    flags and marks SSML as unsupported; subclasses may overwrite both.
    """

    def __init__(self):
        # SSML is off unless a concrete provider turns it on.
        self.supports_ssml: bool = False
        # Names of the text-preprocessing steps this provider wants applied.
        self.preprocessing_flags: Set[str] = set()

    @abstractmethod
    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Turn ``text`` into speech and return the raw audio bytes.

        Args:
            text: The text to speak.
            voice_id: Provider-specific voice identifier, or ``None``.
            **kwargs: Additional provider-specific options.
        """

    @abstractmethod
    def get_supported_voices(self) -> Dict[str, str]:
        """Return the voices this provider offers as a ``str -> str`` mapping."""

    def get_preprocessing_flags(self) -> Set[str]:
        """Return the preprocessing flags configured on this instance."""
        return self.preprocessing_flags

    def supports_ssml_format(self) -> bool:
        """Return ``True`` when this provider is marked as SSML-capable."""
        return self.supports_ssml
class ElevenLabsTTS(TTSInterface):
    """ElevenLabs TTS implementation.

    Text is passed through a ``TTSPreprocessor`` (created with
    ``language="tr"``) and then posted to the ElevenLabs
    ``/text-to-speech/{voice_id}`` endpoint.
    """

    def __init__(self, api_key: str):
        """Store credentials and configure preprocessing.

        Args:
            api_key: ElevenLabs API key; leading/trailing whitespace is removed.
        """
        super().__init__()
        self.api_key = api_key.strip()  # Strip leading/trailing whitespace
        self.base_url = "https://api.elevenlabs.io/v1"
        self.default_voice_id = "2thYbn2sOGtiTwd9QwWH"  # Avencia

        # Kinds of content that need preprocessing for ElevenLabs
        self.preprocessing_flags = {
            TTSPreprocessor.PREPROCESS_NUMBERS,     # Large numbers
            TTSPreprocessor.PREPROCESS_CURRENCY,    # Currencies
            TTSPreprocessor.PREPROCESS_TIME,        # Time format
            TTSPreprocessor.PREPROCESS_CODES,       # PNR codes
            TTSPreprocessor.PREPROCESS_PERCENTAGE,  # Percentages
        }

        # tr-TR -> tr conversion
        self.preprocessor = TTSPreprocessor(language="tr")

    async def synthesize(
        self,
        text: str,
        voice_id: Optional[str] = None,
        model_id: Optional[str] = None,
        output_format: Optional[str] = None,
        **kwargs
    ) -> bytes:
        """Convert text to speech using the ElevenLabs API.

        Args:
            text: Raw text; it is preprocessed with this provider's flags
                before being sent.
            voice_id: Voice to use; falls back to ``self.default_voice_id``.
            model_id: Model to use; falls back to ``"eleven_multilingual_v2"``.
            output_format: Audio format sent as the ``output_format`` query
                parameter; falls back to ``"mp3_44100_128"``.
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            The response body (audio bytes) returned by the API.

        Raises:
            httpx.HTTPStatusError: When the API answers with an error status.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        # Preprocess text
        processed_text = self.preprocessor.preprocess(text, self.preprocessing_flags)

        # Use defaults if not provided
        voice_id = voice_id or self.default_voice_id
        model_id = model_id or "eleven_multilingual_v2"
        output_format = output_format or "mp3_44100_128"

        url = f"{self.base_url}/text-to-speech/{voice_id}"

        # output_format always has a value here, so it is sent unconditionally.
        # Passing it through ``params`` lets httpx build and encode the query
        # string instead of concatenating it onto the URL by hand.
        params = {"output_format": output_format}

        # NOTE(review): Accept stays "audio/mpeg" even when a non-MP3
        # output_format is requested - confirm the API does not depend on it.
        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": self.api_key,
        }

        # Voice settings are fixed; they cannot be overridden per call.
        data = {
            "text": processed_text,
            "model_id": model_id,
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75,
                "style": 0.0,
                "use_speaker_boost": True,
            },
        }

        # Only mark the preview as truncated when text was actually cut off.
        ellipsis = "..." if len(processed_text) > 100 else ""
        preview = f"{processed_text[:100]}{ellipsis}"

        try:
            async with httpx.AsyncClient() as client:
                log(f"🎤 ElevenLabs TTS request: voice={voice_id}, model={model_id}")
                log(f"📝 Text (first 100 chars): {preview}")

                response = await client.post(
                    url,
                    params=params,
                    json=data,
                    headers=headers,
                    timeout=30.0,
                )
                response.raise_for_status()

                audio_data = response.content
                log(f"✅ ElevenLabs TTS returned {len(audio_data)} bytes")
                return audio_data

        except httpx.HTTPStatusError as e:
            log(f"❌ ElevenLabs API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            # Boundary handler: record the failure, then let the caller decide.
            log(f"❌ TTS synthesis error: {e}")
            raise

    def get_supported_voices(self) -> Dict[str, str]:
        """Return a hard-coded map of voice id -> display name.

        This is a fixed subset, not fetched from the API, and it does not
        include ``self.default_voice_id``.
        """
        return {
            "21m00Tcm4TlvDq8ikWAM": "Rachel (Female)",
            "EXAVITQu4vr4xnSDxMaL": "Bella (Female)",
            "ErXwobaYiN019PkySvjV": "Antoni (Male)",
            "VR6AewLTigWG4xSOukaG": "Arnold (Male)",
            "pNInz6obpgDQGcFmaJgB": "Adam (Male)",
            "yoZ06aMxZJJ28mfd3POQ": "Sam (Male)",
        }
class BlazeTTS(TTSInterface):
    """Stub provider reserved for a future Blaze TTS integration.

    It can be constructed, but synthesis is not available and no voices
    are reported.
    """

    def __init__(self, api_key: str):
        """Keep the given API key on the instance as-is."""
        super().__init__()
        self.api_key = api_key

    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Always raise ``NotImplementedError``; no audio is produced."""
        not_ready = NotImplementedError("Blaze TTS not implemented yet")
        raise not_ready

    def get_supported_voices(self) -> Dict[str, str]:
        """Return an empty mapping, since no voices exist yet."""
        no_voices: Dict[str, str] = {}
        return no_voices
def create_tts_provider(engine: str, api_key: Optional[str] = None) -> Optional[TTSInterface]:
    """Build the TTS provider that matches ``engine``.

    Args:
        engine: ``"elevenlabs"``, ``"blaze"`` or ``"no_tts"``.
        api_key: Key handed to the provider; required for the two real engines.

    Returns:
        A provider instance, or ``None`` when ``engine`` is ``"no_tts"``,
        is not recognised, or has no usable ``api_key`` (the last two cases
        also emit a warning through ``log``).
    """
    # Explicit opt-out: nothing to build and nothing to warn about.
    if engine == "no_tts":
        return None

    # Real engines are only constructed when a non-empty key is present.
    if api_key:
        if engine == "elevenlabs":
            return ElevenLabsTTS(api_key)
        if engine == "blaze":
            return BlazeTTS(api_key)

    log(f"⚠️ Unknown or unconfigured TTS engine: {engine}")
    return None