Spaces:
Running
Running
from dataclasses import dataclass, field | |
from typing import List | |
from TTS.tts.configs.shared_configs import BaseTTSConfig | |
from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig | |
@dataclass
class XttsConfig(BaseTTSConfig):
    """Defines parameters for XTTS TTS model.

    Args:
        model (str):
            Model name. Do not change unless you know what you are doing.

        model_args (XttsArgs):
            Model architecture arguments. Defaults to `XttsArgs()`.

        audio (XttsAudioConfig):
            Audio processing configuration. Defaults to `XttsAudioConfig()`.

        model_dir (str):
            Path to the folder that has all the XTTS models. Defaults to None.

        languages (List[str]):
            Language codes declared for the model. Defaults to
            `["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja"]`.

        temperature (float):
            Temperature for the autoregressive model inference. Larger values make predictions more creative
            at the cost of stability. Defaults to `0.85`.

        length_penalty (float):
            Exponential penalty to the length that is used with beam-based generation. It is applied as an
            exponent to the sequence length, which in turn is used to divide the score of the sequence. Since
            the score is the log likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes
            longer sequences, while length_penalty < 0.0 encourages shorter sequences. Defaults to `1.0`.

        repetition_penalty (float):
            The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.

        top_k (int):
            Number of highest-probability tokens kept for top-k filtering during generation.
            Defaults to `50`.

        top_p (float):
            If set to float < 1, only the smallest set of most probable tokens with probabilities that add up
            to top_p or higher are kept for generation. Defaults to `0.85`.

        num_gpt_outputs (int):
            Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
            As XTTS is a probabilistic model, more samples means a higher probability of creating something
            "great". Defaults to `1`.

        gpt_cond_len (int):
            Secs audio to be used as conditioning for the autoregressive model. Defaults to `12`.

        gpt_cond_chunk_len (int):
            Audio chunk size in secs. Audio is split into chunks and latents are extracted for each chunk.
            Then the latents are averaged. Chunking improves the stability. It must be <= gpt_cond_len.
            If gpt_cond_len == gpt_cond_chunk_len, no chunking. Defaults to `4`.

        max_ref_len (int):
            Maximum number of seconds of audio to be used as conditioning for the decoder. Defaults to `10`.

        sound_norm_refs (bool):
            Whether to normalize the conditioning audio. Defaults to `False`.

    Note:
        Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.

    Example:

        >>> from TTS.tts.configs.xtts_config import XttsConfig
        >>> config = XttsConfig()
    """

    model: str = "xtts"

    # model specific params
    # default_factory gives every config instance its own args/audio/list objects
    # instead of sharing one mutable default across instances.
    model_args: XttsArgs = field(default_factory=XttsArgs)
    audio: XttsAudioConfig = field(default_factory=XttsAudioConfig)
    model_dir: str = None
    languages: List[str] = field(
        default_factory=lambda: [
            "en",
            "es",
            "fr",
            "de",
            "it",
            "pt",
            "pl",
            "tr",
            "ru",
            "nl",
            "cs",
            "ar",
            "zh-cn",
            "hu",
            "ko",
            "ja",
        ]
    )

    # inference params
    temperature: float = 0.85
    length_penalty: float = 1.0
    repetition_penalty: float = 2.0
    top_k: int = 50
    top_p: float = 0.85
    num_gpt_outputs: int = 1

    # cloning
    gpt_cond_len: int = 12
    gpt_cond_chunk_len: int = 4
    max_ref_len: int = 10
    sound_norm_refs: bool = False