Spaces:
Sleeping
Sleeping
File size: 2,976 Bytes
0925810 f655f69 0925810 c2fa877 f655f69 0925810 f655f69 95849c2 0925810 c2fa877 0925810 c2fa877 9281119 c2fa877 95849c2 f655f69 95849c2 f655f69 9281119 f655f69 9281119 c2fa877 f655f69 c2fa877 0925810 9281119 f655f69 9281119 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
import typing as t
from copy import deepcopy
from dotenv import load_dotenv
from elevenlabs import VoiceSettings
from elevenlabs.client import AsyncElevenLabs
load_dotenv()
from src.config import ELEVENLABS_API_KEY, logger
from src.schemas import SoundEffectsParams, TTSParams, TTSTimestampsResponse
from src.utils import auto_retry
# Module-level async ElevenLabs client, built once at import time from the API
# key exposed by `src.config`; the request helpers in this module all use it.
ELEVEN_CLIENT_ASYNC = AsyncElevenLabs(api_key=ELEVENLABS_API_KEY)
async def tts_astream(
    voice_id: str, text: str, params: dict | None = None
) -> t.AsyncIterator[bytes]:
    """Stream text-to-speech audio from ElevenLabs chunk by chunk.

    Args:
        voice_id: Identifier of the voice, forwarded to the client as-is.
        text: Text to convert to speech.
        params: Optional mapping read for the ``stability``,
            ``similarity_boost`` and ``style`` keys. Keys that are absent are
            passed to ``VoiceSettings`` as ``None``. When ``params`` itself is
            ``None`` no ``voice_settings`` argument is sent at all.

    Yields:
        Every truthy chunk produced by the client's ``convert`` call.
    """
    request_kwargs: dict[str, t.Any] = {"voice_id": voice_id, "text": text}

    if params is not None:
        voice_settings = VoiceSettings(  # type: ignore
            stability=params.get("stability"),
            similarity_boost=params.get("similarity_boost"),
            style=params.get("style"),
        )
        request_kwargs["voice_settings"] = voice_settings

    logger.info(
        f"request to 11labs TTS endpoint with params {request_kwargs} "
        f'for the following text: "{text}"'
    )

    async for audio_chunk in ELEVEN_CLIENT_ASYNC.text_to_speech.convert(**request_kwargs):
        # Skip empty chunks so consumers only ever receive real payload.
        if not audio_chunk:
            continue
        yield audio_chunk
@auto_retry
async def tts_astream_consumed(voice_id: str, text: str, params: dict | None = None) -> list[bytes]:
    """Drain ``tts_astream`` and return all of its chunks as a list.

    Accepts the same arguments as ``tts_astream`` and forwards them unchanged.
    The whole call is wrapped by the ``auto_retry`` decorator.

    Returns:
        The chunks yielded by ``tts_astream``, in the order they arrived.
    """
    collected: list[bytes] = []
    async for piece in tts_astream(voice_id=voice_id, text=text, params=params):
        collected.append(piece)
    return collected
@auto_retry
async def tts_w_timestamps(params: TTSParams) -> TTSTimestampsResponse:
    """Request speech with timestamps from ElevenLabs and parse the reply.

    Args:
        params: Request parameters; converted to keyword arguments through
            the model's ``to_dict()`` method. The resulting dict must contain
            a ``"text"`` entry.

    Returns:
        The client's response validated into a ``TTSTimestampsResponse``.
    """
    # NOTE: the special `to_dict()` method is required so the pydantic model is
    # converted to a dict with proper aliases.
    request_kwargs = params.to_dict()

    # Log from a separate copy with the text removed, so the kwargs that are
    # actually sent stay intact.
    loggable = deepcopy(request_kwargs)
    spoken_text = loggable.pop("text")
    logger.info(
        f"request to 11labs TTS endpoint with params {loggable} "
        f'for the following text: "{spoken_text}"'
    )

    raw_response = await ELEVEN_CLIENT_ASYNC.text_to_speech.convert_with_timestamps(
        **request_kwargs
    )
    return TTSTimestampsResponse.model_validate(raw_response)
async def sound_generation_astream(params: SoundEffectsParams) -> t.AsyncIterator[bytes]:
    """Stream a generated sound effect from ElevenLabs chunk by chunk.

    Args:
        params: Request parameters. Its ``text``, ``duration_seconds`` and
            ``prompt_influence`` attributes are forwarded to the client.

    Yields:
        Every truthy chunk produced by the client's ``convert`` call.
    """
    # The text is logged separately, so dump everything except it.
    logged_params = params.model_dump(exclude={"text"})
    logger.info(
        f"request to 11labs sound effect generation with params {logged_params} "
        f'for the following text: "{params.text}"'
    )

    request_kwargs = {
        "text": params.text,
        "duration_seconds": params.duration_seconds,
        "prompt_influence": params.prompt_influence,
    }
    async for audio_chunk in ELEVEN_CLIENT_ASYNC.text_to_sound_effects.convert(**request_kwargs):
        # Skip empty chunks so consumers only ever receive real payload.
        if not audio_chunk:
            continue
        yield audio_chunk
@auto_retry
async def sound_generation_consumed(params: SoundEffectsParams) -> list[bytes]:
    """Drain ``sound_generation_astream`` and return all of its chunks.

    The whole call is wrapped by the ``auto_retry`` decorator.

    Args:
        params: Request parameters, forwarded unchanged to
            ``sound_generation_astream``.

    Returns:
        The chunks yielded by ``sound_generation_astream``, in arrival order.
    """
    # Return type is now annotated to match the sibling `tts_astream_consumed`.
    return [chunk async for chunk in sound_generation_astream(params=params)]
|