# Spaces:
# Building
# Building
"""
TTS Interface and Implementations
"""
from abc import ABC, abstractmethod | |
from typing import Optional, Dict, Any | |
import httpx | |
import os | |
from datetime import datetime | |
import sys | |
def log(message: str) -> None:
    """Print ``message`` to stdout with a millisecond-resolution timestamp.

    The emitted line has the form ``[HH:MM:SS.mmm] message``, using the local
    wall-clock time returned by ``datetime.now()``.

    Args:
        message: Text to print after the timestamp prefix.
    """
    # "%f" yields six microsecond digits; dropping the last three leaves milliseconds.
    timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]
    # Flush right away so the line is not held back in the stdout buffer.
    print(f"[{timestamp}] {message}", flush=True)
class TTSInterface(ABC):
    """Abstract base class for TTS providers.

    Concrete providers must implement both ``synthesize`` and
    ``get_supported_voices``; the class cannot be instantiated until they do.
    """

    @abstractmethod
    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """
        Convert text to speech and return audio bytes

        Args:
            text: Text to convert to speech
            voice_id: Optional voice ID specific to the provider
            **kwargs: Additional provider-specific parameters

        Returns:
            Audio data as bytes (MP3 or WAV format)
        """

    @abstractmethod
    def get_supported_voices(self) -> Dict[str, str]:
        """Get the supported voices as a mapping of strings to strings."""
class ElevenLabsTTS(TTSInterface):
    """ElevenLabs TTS implementation.

    Posts text to the ``text-to-speech/{voice}`` endpoint under ``base_url``
    and returns the response body as raw bytes.
    """

    # Request defaults, used whenever the caller does not override them via kwargs.
    DEFAULT_MODEL_ID = "eleven_multilingual_v2"
    DEFAULT_OUTPUT_FORMAT = "mp3_44100_128"
    DEFAULT_VOICE_SETTINGS: Dict[str, Any] = {
        "stability": 1,
        "similarity_boost": 0.85,
        "style": 0.7,
        "speed": 1.14,
        "use_speaker_boost": True,
    }
    # Timeout handed to httpx.AsyncClient for each synthesis request.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, api_key: str):
        """Store the API key and endpoint settings, and log a masked key.

        Args:
            api_key: Key sent in the ``xi-api-key`` request header.
        """
        self.api_key = api_key
        self.base_url = "https://api.elevenlabs.io/v1"
        self.default_voice_id = "2thYbn2sOGtiTwd9QwWH"  # Avencia

        # Debug log: show only the first and last four characters of the key,
        # and nothing at all when the key has eight characters or fewer.
        masked_key = f"{api_key[:4]}...{api_key[-4:]}" if len(api_key) > 8 else "***"
        log(f"π ElevenLabsTTS initialized with key: {masked_key}")

    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Convert text to speech using ElevenLabs API.

        Args:
            text: Text to convert to speech.
            voice_id: Voice to use; falls back to ``self.default_voice_id``
                when omitted or empty.
            **kwargs: Optional overrides read by this method:
                ``model_id``, ``voice_settings`` and ``output_format``.

        Returns:
            The response body as bytes.

        Raises:
            httpx.HTTPStatusError: The API answered with an error status.
            Exception: Any other failure; it is logged and re-raised unchanged.
        """
        try:
            voice = voice_id or self.default_voice_id
            url = f"{self.base_url}/text-to-speech/{voice}"
            headers = {
                "xi-api-key": self.api_key,
                "Content-Type": "application/json",
            }

            # JSON body; each default can be overridden through kwargs.
            data = {
                "text": text,
                "model_id": kwargs.get("model_id", self.DEFAULT_MODEL_ID),
                # Copy the defaults so the class-level dict is never shared with a request.
                "voice_settings": kwargs.get(
                    "voice_settings", dict(self.DEFAULT_VOICE_SETTINGS)
                ),
            }

            # The output format travels as a query parameter, not in the JSON body.
            params = {
                "output_format": kwargs.get("output_format", self.DEFAULT_OUTPUT_FORMAT)
            }

            log(f"π€ Calling ElevenLabs TTS for {len(text)} characters")

            async with httpx.AsyncClient(timeout=self.REQUEST_TIMEOUT_SECONDS) as client:
                response = await client.post(
                    url,
                    headers=headers,
                    json=data,
                    params=params,
                )
                response.raise_for_status()
                audio_data = response.content

            log(f"β ElevenLabs TTS returned {len(audio_data)} bytes")
            return audio_data
        except httpx.HTTPStatusError as e:
            log(f"β ElevenLabs API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            log(f"β TTS synthesis error: {e}")
            raise

    def get_supported_voices(self) -> Dict[str, str]:
        """Get default voices - full list can be fetched from API.

        Returns:
            A fixed mapping of voice ID to a display label.
        """
        return {
            "21m00Tcm4TlvDq8ikWAM": "Rachel (Female)",
            "EXAVITQu4vr4xnSDxMaL": "Bella (Female)",
            "ErXwobaYiN019PkySvjV": "Antoni (Male)",
            "VR6AewLTigWG4xSOukaG": "Arnold (Male)",
            "pNInz6obpgDQGcFmaJgB": "Adam (Male)",
            "yoZ06aMxZJJ28mfd3POQ": "Sam (Male)",
        }
class BlazeTTS(TTSInterface):
    """Placeholder for future Blaze TTS implementation."""

    def __init__(self, api_key: str):
        """Store the API key; nothing else is set up yet.

        Args:
            api_key: Key kept on the instance for the future implementation.
        """
        self.api_key = api_key

    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Always fail: Blaze synthesis is not implemented.

        Raises:
            NotImplementedError: On every call.
        """
        raise NotImplementedError("Blaze TTS not implemented yet")

    def get_supported_voices(self) -> Dict[str, str]:
        """Return an empty mapping, since no voices are available yet."""
        return {}
def create_tts_provider(engine: str, api_key: Optional[str] = None) -> Optional[TTSInterface]:
    """Factory function to create TTS provider instances.

    Args:
        engine: Provider name: ``"elevenlabs"``, ``"blaze"`` or ``"no_tts"``.
        api_key: Key passed to the provider; required for ``"elevenlabs"``
            and ``"blaze"``.

    Returns:
        A provider instance, or ``None`` when ``engine`` is ``"no_tts"``,
        is not recognised, or needs an API key that was not supplied.
    """
    # TTS explicitly disabled: no provider, and nothing worth logging.
    if engine == "no_tts":
        return None

    # Both real providers need a non-empty key.
    if api_key:
        if engine == "elevenlabs":
            return ElevenLabsTTS(api_key)
        if engine == "blaze":
            return BlazeTTS(api_key)

    # Reached for an unrecognised engine and for a known engine without a key.
    log(f"β οΈ Unknown or unconfigured TTS engine: {engine}")
    return None