Spaces:
Building
Building
""" | |
ElevenLabs TTS Implementation | |
""" | |
import httpx | |
from typing import Optional, Dict | |
from .tts_interface import TTSInterface | |
from utils.logger import log_info, log_error, log_debug, log_warning | |
class ElevenLabsTTS(TTSInterface):
    """ElevenLabs TTS implementation.

    Sends text to the ElevenLabs ``text-to-speech`` HTTP endpoint and returns
    the audio payload of the response.
    """

    def __init__(self, api_key: str):
        """Store the API credentials and provider defaults.

        Args:
            api_key: ElevenLabs API key; surrounding whitespace is stripped.
        """
        super().__init__()
        self.api_key = api_key.strip()
        self.base_url = "https://api.elevenlabs.io/v1"
        self.default_voice_id = "2thYbn2sOGtiTwd9QwWH"  # Avencia

        # ElevenLabs preprocessing needs
        self.preprocessing_flags = {
            "PREPROCESS_NUMBERS",   # Large numbers
            "PREPROCESS_CURRENCY",  # Currency amounts
            "PREPROCESS_TIME",      # Time format
            "PREPROCESS_CODES",     # PNR/codes
            "PREPROCESS_PHONE",     # Phone numbers
        }

        # Debug log: mask the key that is actually stored (the stripped one),
        # so stray whitespace never affects the length check or the preview.
        key = self.api_key
        masked_key = f"{key[:4]}...{key[-4:]}" if len(key) > 8 else "***"
        log_debug(f"π ElevenLabsTTS initialized with key: {masked_key}")

    @staticmethod
    def _coerce_audio_bytes(audio_data):
        """Return ``audio_data`` as bytes when it arrived as a string.

        Non-string payloads are returned untouched. A string payload is
        treated as base64; if it cannot be decoded, a warning is logged and
        the original value is returned unchanged (best effort).
        """
        if not isinstance(audio_data, str):
            return audio_data

        # Imported here so the fallback works without relying on a
        # module-level import being present.
        import base64

        log_warning("ElevenLabs returned string instead of bytes")
        try:
            return base64.b64decode(audio_data)
        except ValueError as exc:
            # binascii.Error (bad padding / alphabet) is a ValueError subclass.
            log_warning(f"Could not base64-decode ElevenLabs response: {exc}")
            return audio_data

    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Convert text to speech using ElevenLabs API.

        Args:
            text: Text to synthesize.
            voice_id: ElevenLabs voice id; falls back to ``default_voice_id``
                when omitted or empty.
            **kwargs: Optional overrides:
                ``model_id`` (default ``"eleven_multilingual_v2"``),
                ``voice_settings`` (dict replacing the default settings),
                ``output_format`` (default ``"mp3_44100_128"``).

        Returns:
            The response body of the API call.

        Raises:
            httpx.HTTPStatusError: The API answered with an error status;
                the status code and body are logged before re-raising.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            voice = voice_id or self.default_voice_id
            url = f"{self.base_url}/text-to-speech/{voice}"
            headers = {
                "xi-api-key": self.api_key,
                "Content-Type": "application/json",
            }

            # Default parameters
            data = {
                "text": text,
                "model_id": kwargs.get("model_id", "eleven_multilingual_v2"),
                "voice_settings": kwargs.get("voice_settings", {
                    "stability": 1,
                    "similarity_boost": 0.85,
                    "style": 0.7,
                    "speed": 1.14,
                    "use_speaker_boost": True,
                }),
            }

            # The output format travels as a query parameter, not in the body.
            params = {"output_format": kwargs.get("output_format", "mp3_44100_128")}

            log_debug(f"π€ Calling ElevenLabs TTS for {len(text)} characters")

            async with httpx.AsyncClient(timeout=30.0) as client:
                response = await client.post(
                    url,
                    headers=headers,
                    json=data,
                    params=params,
                )
                response.raise_for_status()

            # response.content should already be bytes; coerce defensively.
            audio_data = self._coerce_audio_bytes(response.content)

            log_debug(f"β ElevenLabs TTS returned {len(audio_data)} bytes")
            log_debug(f"Audio data type: {type(audio_data)}")
            return audio_data

        except httpx.HTTPStatusError as e:
            log_error(f"β ElevenLabs API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            log_error("β TTS synthesis error", e)
            raise

    def get_supported_voices(self) -> Dict[str, str]:
        """Get default voices - full list can be fetched from API.

        Returns:
            Mapping of voice id to a human-readable voice label.
        """
        return {
            "2thYbn2sOGtiTwd9QwWH": "Avencia (Female - Turkish)",
            "21m00Tcm4TlvDq8ikWAM": "Rachel (Female)",
            "EXAVITQu4vr4xnSDxMaL": "Bella (Female)",
            "ErXwobaYiN019PkySvjV": "Antoni (Male)",
            "VR6AewLTigWG4xSOukaG": "Arnold (Male)",
            "pNInz6obpgDQGcFmaJgB": "Adam (Male)",
            "yoZ06aMxZJJ28mfd3POQ": "Sam (Male)",
        }

    def get_provider_name(self) -> str:
        """Get provider name."""
        return "elevenlabs"