File size: 2,976 Bytes
0925810
f655f69
0925810
 
c2fa877
f655f69
0925810
 
 
f655f69
 
95849c2
0925810
c2fa877
0925810
 
c2fa877
 
 
 
 
 
 
 
 
 
 
 
9281119
 
 
 
c2fa877
 
 
 
 
 
95849c2
f655f69
95849c2
 
 
 
f655f69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9281119
f655f69
 
9281119
 
c2fa877
f655f69
 
 
c2fa877
0925810
 
 
9281119
 
 
f655f69
 
9281119
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import typing as t
from copy import deepcopy

from dotenv import load_dotenv
from elevenlabs import VoiceSettings
from elevenlabs.client import AsyncElevenLabs

# Load variables from a .env file into the process environment. This call sits
# before the `src.*` imports below — presumably so that `src.config` can read
# those variables at import time (TODO confirm against src/config.py).
load_dotenv()

from src.config import ELEVENLABS_API_KEY, logger
from src.schemas import SoundEffectsParams, TTSParams, TTSTimestampsResponse
from src.utils import auto_retry

# Module-level async ElevenLabs client, created once at import time and used by
# the request functions in this module.
ELEVEN_CLIENT_ASYNC = AsyncElevenLabs(api_key=ELEVENLABS_API_KEY)


async def tts_astream(
    voice_id: str, text: str, params: dict | None = None
) -> t.AsyncIterator[bytes]:
    """Stream synthesized speech for ``text`` from the ElevenLabs TTS endpoint.

    Args:
        voice_id: Voice identifier, forwarded to the API unchanged.
        text: Text to synthesize.
        params: Optional mapping whose ``"stability"``, ``"similarity_boost"``
            and ``"style"`` entries are used to build ``VoiceSettings``;
            missing keys are passed as ``None``. When ``params`` is ``None``
            no ``voice_settings`` argument is sent at all.

    Yields:
        The non-empty chunks produced by ``text_to_speech.convert``.
    """
    params_all = dict(voice_id=voice_id, text=text)

    if params is not None:
        params_all["voice_settings"] = VoiceSettings(  # type: ignore
            stability=params.get("stability"),
            similarity_boost=params.get("similarity_boost"),
            style=params.get("style"),
        )

    # The text is quoted separately at the end of the message, so leave it out
    # of the logged params instead of printing it twice (the other request
    # functions in this module log the same way).
    params_no_text = {key: value for key, value in params_all.items() if key != "text"}
    logger.info(
        f"request to 11labs TTS endpoint with params {params_no_text} "
        f'for the following text: "{text}"'
    )

    async_iter = ELEVEN_CLIENT_ASYNC.text_to_speech.convert(**params_all)
    async for chunk in async_iter:
        # Skip empty chunks so consumers only ever receive actual payload.
        if chunk:
            yield chunk


@auto_retry
async def tts_astream_consumed(voice_id: str, text: str, params: dict | None = None) -> list[bytes]:
    """Run ``tts_astream`` to completion and return all of its chunks as a list.

    The arguments are forwarded to ``tts_astream`` unchanged.
    """
    collected: list[bytes] = []
    async for piece in tts_astream(voice_id=voice_id, text=text, params=params):
        collected.append(piece)
    return collected


@auto_retry
async def tts_w_timestamps(params: TTSParams) -> TTSTimestampsResponse:
    """Request speech with timestamps and return the validated response.

    ``params`` is converted to keyword arguments for
    ``text_to_speech.convert_with_timestamps``; the raw result is validated
    into a ``TTSTimestampsResponse``.
    """
    # NOTE: the model's special `to_dict()` method is used to ensure the
    # pydantic model is converted to a dict with proper aliases.
    request_kwargs = params.to_dict()

    # Work on a separate copy for logging so the request kwargs keep "text".
    loggable = deepcopy(request_kwargs)
    spoken_text = loggable.pop("text")
    logger.info(
        f"request to 11labs TTS endpoint with params {loggable} "
        f'for the following text: "{spoken_text}"'
    )

    raw_response = await ELEVEN_CLIENT_ASYNC.text_to_speech.convert_with_timestamps(
        **request_kwargs
    )
    return TTSTimestampsResponse.model_validate(raw_response)


async def sound_generation_astream(params: SoundEffectsParams) -> t.AsyncIterator[bytes]:
    """Stream a generated sound effect from the ElevenLabs sound-effects endpoint.

    Sends ``params.text``, ``params.duration_seconds`` and
    ``params.prompt_influence`` to ``text_to_sound_effects.convert`` and yields
    every non-empty chunk it produces.
    """
    # Everything except the prompt goes into the "params" part of the log line.
    loggable = params.model_dump(exclude={"text"})
    logger.info(
        f"request to 11labs sound effect generation with params {loggable} "
        f'for the following text: "{params.text}"'
    )

    request_kwargs = {
        "text": params.text,
        "duration_seconds": params.duration_seconds,
        "prompt_influence": params.prompt_influence,
    }
    chunk_stream = ELEVEN_CLIENT_ASYNC.text_to_sound_effects.convert(**request_kwargs)
    async for piece in chunk_stream:
        if not piece:
            # Empty chunks carry no payload; do not pass them on.
            continue
        yield piece


@auto_retry
async def sound_generation_consumed(params: SoundEffectsParams):
    """Run ``sound_generation_astream`` to completion and return its chunks as a list."""
    collected = []
    async for piece in sound_generation_astream(params=params):
        collected.append(piece)
    return collected