File size: 2,632 Bytes
babeaf6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import asyncio
import os
import types
import httpx

from realtime_ai_character.logger import get_logger
from realtime_ai_character.utils import Singleton
from realtime_ai_character.audio.text_to_speech.base import TextToSpeech

# Module-wide logger, named after this module.
logger = get_logger(__name__)
# When True, ElevenLabs.stream() returns immediately without calling the API.
DEBUG = False

# Static settings for the ElevenLabs text-to-speech endpoint.
# NOTE: the API key is read with os.environ[...] while this module is being
# imported, so a missing ELEVEN_LABS_API_KEY raises KeyError at import time.
config = types.SimpleNamespace(
    # Voice ids: a generic fallback plus per-gender fallbacks.
    default_voice='21m00Tcm4TlvDq8ikWAM',
    default_female_voice='EXAVITQu4vr4xnSDxMaL',
    default_male_voice='ErXwobaYiN019PkySvjV',
    chunk_size=1024,
    # `{voice_id}` is filled in with str.format() for every request.
    url='https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream',
    # Headers sent with every request (hyphenated keys force a dict literal).
    headers={
        'Accept': 'audio/mpeg',
        'Content-Type': 'application/json',
        'xi-api-key': os.environ['ELEVEN_LABS_API_KEY'],
    },
    # JSON payload defaults; the text to speak is merged in per request.
    data=dict(
        model_id='eleven_monolingual_v1',
        voice_settings=dict(stability=0.5, similarity_boost=0.75),
    ),
)


class ElevenLabs(Singleton, TextToSpeech):
    """Text-to-speech backend that relays ElevenLabs audio to a websocket.

    A table mapping character names to ElevenLabs voice ids is built when the
    instance is created. Each id is taken from an environment variable and
    falls back to one of the defaults in ``config`` when that variable is
    unset or empty.
    """

    def __init__(self):
        super().__init__()
        logger.info("Initializing [ElevenLabs Text To Speech] voices...")
        # `or` is used instead of a .get() default so that an environment
        # variable set to an empty string also falls back to the default.
        self.voice_ids = {
            "Raiden Shogun And Ei": os.environ.get('RAIDEN_VOICE') or config.default_female_voice,
            "Loki": os.environ.get('LOKI_VOICE') or config.default_male_voice,
            "Reflection Pi": os.environ.get('PI_VOICE') or config.default_female_voice,
            "Elon Musk": os.environ.get('ELON_VOICE') or config.default_male_voice,
            "Bruce Wayne": os.environ.get('BRUCE_VOICE') or config.default_male_voice,
            "Steve Jobs": os.environ.get('JOBS_VOICE') or config.default_male_voice,
            "Sam Altman": os.environ.get('SAM_VOICE') or config.default_male_voice,
        }

    def get_voice_id(self, name):
        """Return the voice id registered for ``name``.

        Names missing from ``self.voice_ids`` resolve to
        ``config.default_voice``.
        """
        return self.voice_ids.get(name, config.default_voice)

    async def stream(self, text, websocket, tts_event: asyncio.Event, characater_name="", first_sentence=False) -> None:
        """Synthesize ``text`` and forward the resulting audio to ``websocket``.

        Args:
            text: The text to convert to speech.
            websocket: Object exposing an awaitable ``send_bytes(chunk)``.
            tts_event: When this event is set, forwarding stops before the
                next chunk is sent.
            characater_name: Character whose voice is used; unknown names use
                the default voice. (The misspelling is part of the existing
                keyword interface and is kept so callers do not break.)
            first_sentence: When true, ``optimize_streaming_latency=4`` is
                added to the request URL.

        Nothing is requested or sent when the module-level ``DEBUG`` flag is
        true. A non-200 response is logged and nothing is sent. Errors raised
        by httpx or by the websocket propagate to the caller.
        """
        if DEBUG:
            return
        headers = config.headers
        data = {
            "text": text,
            **config.data,
        }
        voice_id = self.get_voice_id(characater_name)
        url = config.url.format(voice_id=voice_id)
        if first_sentence:
            url = url + '?optimize_streaming_latency=4'
        async with httpx.AsyncClient() as client:
            response = await client.post(url, json=data, headers=headers)
            if response.status_code != 200:
                # On failure the body is an error payload, not audio; sending
                # it on would make the client try to play it.
                logger.error(
                    f"ElevenLabs returned status {response.status_code}: {response.text}")
                return
            async for chunk in response.aiter_bytes():
                # Yield to the event loop so an interruption can set the event
                # before the chunk goes out.
                await asyncio.sleep(0.1)
                if tts_event.is_set():
                    # stop streaming audio
                    break
                await websocket.send_bytes(chunk)