# flare / tts_interface.py
# Latest commit: "Update tts_interface.py" (c146995, verified), shown with ciyidogan's avatar
# File size: 4.81 kB
"""
TTS Interface and Implementations
"""
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any
import httpx
import os
from datetime import datetime
import sys
def log(message: str):
    """Print ``message`` to stdout behind a ``[HH:MM:SS.mmm]`` prefix and flush.

    Args:
        message: Text to emit after the timestamp prefix.
    """
    now = datetime.now()
    # %f renders six microsecond digits; dropping the last three leaves milliseconds.
    stamp = now.strftime("%H:%M:%S.%f")[:-3]
    # flush=True pushes the line out immediately instead of waiting on the buffer.
    print(f"[{stamp}] {message}", flush=True)
class TTSInterface(ABC):
    """Contract every text-to-speech provider has to implement."""

    @abstractmethod
    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Turn ``text`` into speech and hand back the encoded audio.

        Args:
            text: Text to convert to speech.
            voice_id: Optional provider-specific voice ID.
            **kwargs: Additional provider-specific parameters.

        Returns:
            Audio data as bytes (MP3 or WAV format).
        """

    @abstractmethod
    def get_supported_voices(self) -> Dict[str, str]:
        """Return the voices this provider supports, as a str-to-str dict."""
class ElevenLabsTTS(TTSInterface):
    """ElevenLabs TTS implementation.

    Posts text to the ElevenLabs ``/text-to-speech/{voice_id}`` endpoint and
    returns the raw response body as audio bytes.
    """

    def __init__(self, api_key: str):
        """Store the API key and endpoint defaults.

        Args:
            api_key: ElevenLabs API key; sent in the ``xi-api-key`` header on
                every request.
        """
        self.api_key = api_key
        self.base_url = "https://api.elevenlabs.io/v1"
        self.default_voice_id = "2thYbn2sOGtiTwd9QwWH"  # Avencia

        # Debug log: never print the whole key. Only the first and last four
        # characters are shown, and keys of 8 characters or fewer are hidden
        # entirely because head + tail would reveal all of them.
        masked_key = f"{api_key[:4]}...{api_key[-4:]}" if len(api_key) > 8 else "***"
        log(f"🔑 ElevenLabsTTS initialized with key: {masked_key}")

    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Convert text to speech using the ElevenLabs API.

        Args:
            text: Text to convert to speech.
            voice_id: ElevenLabs voice ID. Falls back to
                ``self.default_voice_id`` when omitted or empty.
            **kwargs: Optional overrides:

                * ``model_id`` - defaults to ``"eleven_multilingual_v2"``.
                * ``voice_settings`` - dict that replaces the default
                  settings wholesale (it is not merged with them).
                * ``output_format`` - sent as a query parameter; defaults to
                  ``"mp3_44100_128"``.

        Returns:
            The response body as bytes.

        Raises:
            httpx.HTTPStatusError: When the API responds with an error
                status; the status code and body are logged first.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            voice = voice_id or self.default_voice_id
            url = f"{self.base_url}/text-to-speech/{voice}"

            headers = {
                "xi-api-key": self.api_key,
                "Content-Type": "application/json",
            }

            # Default parameters; callers may override each one through kwargs.
            data = {
                "text": text,
                "model_id": kwargs.get("model_id", "eleven_multilingual_v2"),
                "voice_settings": kwargs.get("voice_settings", {
                    "stability": 1,
                    "similarity_boost": 0.85,
                    "style": 0.7,
                    "speed": 1.14,
                    "use_speaker_boost": True,
                }),
            }

            # The output format travels in the query string, not the JSON body.
            params = {"output_format": kwargs.get("output_format", "mp3_44100_128")}

            log(f"🎤 Calling ElevenLabs TTS for {len(text)} characters")

            async with httpx.AsyncClient(timeout=30) as client:
                response = await client.post(
                    url,
                    headers=headers,
                    json=data,
                    params=params,
                )
                # Turn 4xx/5xx answers into HTTPStatusError so they are logged below.
                response.raise_for_status()
                audio_data = response.content

            log(f"✅ ElevenLabs TTS returned {len(audio_data)} bytes")
            return audio_data
        except httpx.HTTPStatusError as e:
            log(f"❌ ElevenLabs API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            # Log for visibility, then let the caller decide how to recover.
            log(f"❌ TTS synthesis error: {e}")
            raise

    def get_supported_voices(self) -> Dict[str, str]:
        """Get default voices - full list can be fetched from API.

        Returns:
            A hard-coded dict mapping voice ID to a display label.
        """
        return {
            "21m00Tcm4TlvDq8ikWAM": "Rachel (Female)",
            "EXAVITQu4vr4xnSDxMaL": "Bella (Female)",
            "ErXwobaYiN019PkySvjV": "Antoni (Male)",
            "VR6AewLTigWG4xSOukaG": "Arnold (Male)",
            "pNInz6obpgDQGcFmaJgB": "Adam (Male)",
            "yoZ06aMxZJJ28mfd3POQ": "Sam (Male)",
        }
class BlazeTTS(TTSInterface):
    """Stub reserved for a future Blaze TTS implementation."""

    def __init__(self, api_key: str):
        """Keep the API key on the instance; no other setup is performed.

        Args:
            api_key: Key stored as ``self.api_key``.
        """
        self.api_key = api_key

    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Always fail: synthesis is not available for this provider yet.

        Raises:
            NotImplementedError: On every call.
        """
        raise NotImplementedError("Blaze TTS not implemented yet")

    def get_supported_voices(self) -> Dict[str, str]:
        """Return an empty dict; this placeholder exposes no voices."""
        no_voices: Dict[str, str] = {}
        return no_voices
def create_tts_provider(engine: str, api_key: Optional[str] = None) -> Optional[TTSInterface]:
    """Factory function to create TTS provider instances.

    Args:
        engine: Provider name: ``"elevenlabs"``, ``"blaze"`` or ``"no_tts"``.
        api_key: API key handed to the provider's constructor. Needed for
            every engine except ``"no_tts"``.

    Returns:
        A provider instance, or ``None`` when TTS is disabled (``"no_tts"``),
        when ``engine`` is not recognised, or when a known engine is requested
        without an API key. The last two cases also log a warning.
    """
    # Explicit opt-out: no provider and no warning.
    if engine == "no_tts":
        return None

    providers = {
        "elevenlabs": ElevenLabsTTS,
        "blaze": BlazeTTS,
    }
    provider_cls = providers.get(engine)
    # A known engine still needs a non-empty key to be usable.
    if provider_cls is not None and api_key:
        return provider_cls(api_key)

    log(f"⚠️ Unknown or unconfigured TTS engine: {engine}")
    return None