Spaces:

ales
/

ai-audio-books

Sleeping

File size: 2,976 Bytes

import typing as t
from copy import deepcopy

from dotenv import load_dotenv
from elevenlabs import VoiceSettings
from elevenlabs.client import AsyncElevenLabs

load_dotenv()

from src.config import ELEVENLABS_API_KEY, logger
from src.schemas import SoundEffectsParams, TTSParams, TTSTimestampsResponse
from src.utils import auto_retry

ELEVEN_CLIENT_ASYNC = AsyncElevenLabs(api_key=ELEVENLABS_API_KEY)


async def tts_astream(
    voice_id: str, text: str, params: dict | None = None
) -> t.AsyncIterator[bytes]:
    params_all = dict(voice_id=voice_id, text=text)

    if params is not None:
        params_all["voice_settings"] = VoiceSettings(  # type: ignore
            stability=params.get("stability"),
            similarity_boost=params.get("similarity_boost"),
            style=params.get("style"),
        )

    logger.info(
        f"request to 11labs TTS endpoint with params {params_all} "
        f'for the following text: "{text}"'
    )
    async_iter = ELEVEN_CLIENT_ASYNC.text_to_speech.convert(**params_all)
    async for chunk in async_iter:
        if chunk:
            yield chunk


@auto_retry
async def tts_astream_consumed(voice_id: str, text: str, params: dict | None = None) -> list[bytes]:
    aiterator = tts_astream(voice_id=voice_id, text=text, params=params)
    return [x async for x in aiterator]


@auto_retry
async def tts_w_timestamps(params: TTSParams) -> TTSTimestampsResponse:
    async def _tts_w_timestamps(params: TTSParams) -> TTSTimestampsResponse:
        # NOTE: we need to use special `to_dict()` method to ensure pydantic model is converted
        # to dict with proper aliases
        params_dict = params.to_dict()

        params_no_text = deepcopy(params_dict)
        text = params_no_text.pop('text')
        logger.info(
            f"request to 11labs TTS endpoint with params {params_no_text} "
            f'for the following text: "{text}"'
        )

        response_raw = await ELEVEN_CLIENT_ASYNC.text_to_speech.convert_with_timestamps(
            **params_dict
        )

        response_parsed = TTSTimestampsResponse.model_validate(response_raw)
        return response_parsed

    res = await _tts_w_timestamps(params=params)
    return res


async def sound_generation_astream(params: SoundEffectsParams) -> t.AsyncIterator[bytes]:
    params_no_text = params.model_dump(exclude={"text"})
    logger.info(
        f"request to 11labs sound effect generation with params {params_no_text} "
        f'for the following text: "{params.text}"'
    )

    async_iter = ELEVEN_CLIENT_ASYNC.text_to_sound_effects.convert(
        text=params.text,
        duration_seconds=params.duration_seconds,
        prompt_influence=params.prompt_influence,
    )
    async for chunk in async_iter:
        if chunk:
            yield chunk


@auto_retry
async def sound_generation_consumed(params: SoundEffectsParams):
    aiterator = sound_generation_astream(params=params)
    return [x async for x in aiterator]