File size: 4,811 Bytes
f563475
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c146995
e6f59c3
 
 
 
f563475
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f2ca6a
f563475
909ae7c
c146995
 
909ae7c
773eda6
f563475
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
TTS Interface and Implementations
"""

from abc import ABC, abstractmethod
from typing import Optional, Dict, Any
import httpx
import os
from datetime import datetime
import sys

def log(message: str):
    """Print ``message`` to stdout prefixed with a ``[HH:MM:SS.mmm]`` timestamp.

    The timestamp is the local wall-clock time with the microseconds
    truncated (not rounded) to milliseconds. stdout is flushed after every
    call so the line is pushed out of the buffer immediately.

    Args:
        message: Text to print after the timestamp prefix.
    """
    now = datetime.now()
    # Integer division drops the last three microsecond digits, leaving ms.
    stamp = f"{now:%H:%M:%S}.{now.microsecond // 1000:03d}"
    print(f"[{stamp}] {message}")
    sys.stdout.flush()

class TTSInterface(ABC):
    """Contract that every text-to-speech provider in this module implements.

    Concrete providers must override both abstract methods below; the class
    itself cannot be instantiated.
    """

    @abstractmethod
    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """
        Turn a piece of text into spoken audio.

        Args:
            text: The text to be spoken.
            voice_id: Provider-specific voice identifier; ``None`` leaves the
                choice of voice to the provider.
            **kwargs: Extra provider-specific options.

        Returns:
            The audio as raw bytes (MP3 or WAV format).
        """
        ...

    @abstractmethod
    def get_supported_voices(self) -> Dict[str, str]:
        """Return the voices this provider offers as a string-to-string mapping."""
        ...


class ElevenLabsTTS(TTSInterface):
    """TTS provider that calls the ElevenLabs HTTP API."""

    def __init__(self, api_key: str):
        """Store the API key and endpoint settings, then log a masked key.

        Args:
            api_key: Key sent in the ``xi-api-key`` header of every request.
        """
        self.api_key = api_key
        self.base_url = "https://api.elevenlabs.io/v1"
        self.default_voice_id = "2thYbn2sOGtiTwd9QwWH" # Avencia

        # Debug log: show only the first and last four characters of the key;
        # keys of eight characters or fewer are hidden entirely.
        if len(api_key) > 8:
            masked_key = f"{api_key[:4]}...{api_key[-4:]}"
        else:
            masked_key = "***"
        log(f"πŸ”‘ ElevenLabsTTS initialized with key: {masked_key}")

    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Send ``text`` to the ElevenLabs text-to-speech endpoint.

        Args:
            text: Text to convert to speech.
            voice_id: Voice to use; a falsy value selects
                ``self.default_voice_id``.
            **kwargs: Optional overrides read by this method:
                ``model_id`` (default ``"eleven_multilingual_v2"``),
                ``voice_settings`` (replaces the whole default settings
                dict) and ``output_format`` (default ``"mp3_44100_128"``,
                sent as a query parameter).

        Returns:
            The raw response body as bytes.

        Raises:
            httpx.HTTPStatusError: If the response status fails
                ``raise_for_status()``; the status code and body are logged
                before re-raising.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            selected_voice = voice_id or self.default_voice_id
            endpoint = f"{self.base_url}/text-to-speech/{selected_voice}"

            request_headers = {
                "xi-api-key": self.api_key,
                "Content-Type": "application/json"
            }

            # Settings used only when the caller supplies no "voice_settings".
            fallback_voice_settings = {
                "stability": 1,
                "similarity_boost": 0.85,
                "style": 0.7,
                "speed": 1.14,
                "use_speaker_boost": True
            }
            payload = {
                "text": text,
                "model_id": kwargs.get("model_id", "eleven_multilingual_v2"),
                "voice_settings": kwargs.get("voice_settings", fallback_voice_settings)
            }

            # The output format travels in the query string, not the JSON body.
            query = {"output_format": kwargs.get("output_format", "mp3_44100_128")}

            log(f"🎀 Calling ElevenLabs TTS for {len(text)} characters")

            # A new client is opened for each call, with a 30 second timeout.
            async with httpx.AsyncClient(timeout=30) as client:
                response = await client.post(
                    endpoint,
                    headers=request_headers,
                    json=payload,
                    params=query
                )

                response.raise_for_status()
                audio_data = response.content

                log(f"βœ… ElevenLabs TTS returned {len(audio_data)} bytes")
                return audio_data

        except httpx.HTTPStatusError as e:
            log(f"❌ ElevenLabs API error: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            log(f"❌ TTS synthesis error: {e}")
            raise

    def get_supported_voices(self) -> Dict[str, str]:
        """Return a hard-coded map of voice ID to display label.

        The map is static and is not fetched from the API. It does not
        contain ``self.default_voice_id``.
        """
        known_voices = {
            "21m00Tcm4TlvDq8ikWAM": "Rachel (Female)",
            "EXAVITQu4vr4xnSDxMaL": "Bella (Female)",
            "ErXwobaYiN019PkySvjV": "Antoni (Male)",
            "VR6AewLTigWG4xSOukaG": "Arnold (Male)",
            "pNInz6obpgDQGcFmaJgB": "Adam (Male)",
            "yoZ06aMxZJJ28mfd3POQ": "Sam (Male)",
        }
        return known_voices


class BlazeTTS(TTSInterface):
    """Stub provider reserved for a future Blaze TTS integration.

    It can be constructed, but synthesis always fails and no voices are
    advertised.
    """

    def __init__(self, api_key: str):
        """Keep the API key; nothing else is set up yet.

        Args:
            api_key: Key stored on the instance as ``self.api_key``.
        """
        self.api_key = api_key

    async def synthesize(self, text: str, voice_id: Optional[str] = None, **kwargs) -> bytes:
        """Always raise, because synthesis is not implemented for Blaze.

        Raises:
            NotImplementedError: On every call.
        """
        reason = "Blaze TTS not implemented yet"
        raise NotImplementedError(reason)

    def get_supported_voices(self) -> Dict[str, str]:
        """Return an empty mapping; no Blaze voices are defined."""
        return dict()


def create_tts_provider(engine: str, api_key: Optional[str] = None) -> Optional[TTSInterface]:
    """Build the TTS provider named by ``engine``.

    Args:
        engine: ``"elevenlabs"``, ``"blaze"`` or ``"no_tts"``.
        api_key: Key passed to the provider constructor. It must be truthy
            for ``"elevenlabs"`` and ``"blaze"``.

    Returns:
        An ``ElevenLabsTTS`` or ``BlazeTTS`` instance, or ``None`` when the
        engine is ``"no_tts"``, is not recognized, or has no truthy
        ``api_key``. Only the last two cases log a warning.
    """
    # TTS explicitly disabled: return quietly, without a warning.
    if engine == "no_tts":
        return None

    # Real providers are only built when a key was supplied.
    if api_key:
        if engine == "elevenlabs":
            return ElevenLabsTTS(api_key)
        if engine == "blaze":
            return BlazeTTS(api_key)

    log(f"⚠️ Unknown or unconfigured TTS engine: {engine}")
    return None