from dataclasses import dataclass
from typing import Tuple

import torch

from models.config import (
    SUPPORTED_LANGUAGES,
    AcousticENModelConfig,
    AcousticModelConfigType,
    AcousticPretrainingConfig,
)
from models.config import (
    PreprocessingConfigUnivNet as PreprocessingConfig,
)
from models.helpers import positional_encoding, tools
from models.tts.delightful_tts.acoustic_model import AcousticModel
from models.tts.delightful_tts.attention.conformer import Conformer


@dataclass
class ConformerConfig:
    dim: int
    n_layers: int
    n_heads: int
    embedding_dim: int
    p_dropout: float
    kernel_size_conv_mod: int
    with_ff: bool


def get_test_configs(
    srink_factor: int = 4,
) -> Tuple[PreprocessingConfig, AcousticENModelConfig, AcousticPretrainingConfig]:
    r"""Returns a tuple of configuration objects for testing purposes.

    Args:
        srink_factor (int, optional): The shrink factor to apply to the model configuration. Defaults to 4.

    Returns:
        Tuple[PreprocessingConfig, AcousticENModelConfig, AcousticPretrainingConfig]: A tuple of configuration objects for testing purposes.

    The configuration objects are as follows:
    - `PreprocessingConfig`: A configuration object for preprocessing.
    - `AcousticENModelConfig`: A configuration object for the acoustic model.
    - `AcousticPretrainingConfig`: A configuration object for acoustic pretraining.

    The `srink_factor` parameter shrinks the dimensions of the model configuration to prevent out-of-memory issues during testing.
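
    Example (illustrative sketch; the exact values depend on the defaults in `AcousticENModelConfig`):
        preprocess_config, model_config, pretraining_config = get_test_configs(srink_factor=4)
        # model_config.encoder.n_hidden is now a quarter of its default value.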
""" | |
preprocess_config = PreprocessingConfig("english_only") | |
model_config = AcousticENModelConfig() | |
model_config.speaker_embed_dim = model_config.speaker_embed_dim // srink_factor | |
model_config.encoder.n_hidden = model_config.encoder.n_hidden // srink_factor | |
model_config.decoder.n_hidden = model_config.decoder.n_hidden // srink_factor | |
model_config.variance_adaptor.n_hidden = ( | |
model_config.variance_adaptor.n_hidden // srink_factor | |
) | |
acoustic_pretraining_config = AcousticPretrainingConfig() | |
return (preprocess_config, model_config, acoustic_pretraining_config) | |


# Function to initialize a Conformer with a given AcousticModelConfigType configuration
def init_conformer(
    model_config: AcousticModelConfigType,
) -> Tuple[Conformer, ConformerConfig]:
    r"""Function to initialize a `Conformer` with a given `AcousticModelConfigType` configuration.

    Args:
        model_config (AcousticModelConfigType): The object that holds the configuration details.

    Returns:
        Tuple[Conformer, ConformerConfig]: The initialized `Conformer` and the configuration used to build it.

    The function sets the details of the `Conformer` object based on the `model_config` parameter.
    The `Conformer` configuration is set as follows:
    - dim: The number of hidden units, taken from `model_config.encoder.n_hidden`.
    - n_layers: The number of layers, taken from `model_config.encoder.n_layers`.
    - n_heads: The number of attention heads, taken from `model_config.encoder.n_heads`.
    - embedding_dim: The sum of the speaker embedding dimension (`model_config.speaker_embed_dim`) and the language embedding dimension (`model_config.lang_embed_dim`).
    - p_dropout: Dropout rate, taken from `model_config.encoder.p_dropout`; adds regularization to prevent overfitting.
    - kernel_size_conv_mod: The kernel size for the convolution module, taken from `model_config.encoder.kernel_size_conv_mod`.
    - with_ff: A Boolean value denoting whether a feed-forward block is used, taken from `model_config.encoder.with_ff`.
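
    Example (illustrative sketch; assumes the shrunken test configs from `get_test_configs`):
        _, model_config, _ = get_test_configs()
        conformer, conformer_config = init_conformer(model_config)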
""" | |
conformer_config = ConformerConfig( | |
dim=model_config.encoder.n_hidden, | |
n_layers=model_config.encoder.n_layers, | |
n_heads=model_config.encoder.n_heads, | |
embedding_dim=model_config.speaker_embed_dim | |
+ model_config.lang_embed_dim, # speaker_embed_dim + lang_embed_dim = 385 | |
p_dropout=model_config.encoder.p_dropout, | |
kernel_size_conv_mod=model_config.encoder.kernel_size_conv_mod, | |
with_ff=model_config.encoder.with_ff, | |
) | |
model = Conformer(**vars(conformer_config)) | |
return model, conformer_config | |


@dataclass
class AcousticModelConfig:
    preprocess_config: PreprocessingConfig
    model_config: AcousticENModelConfig
    n_speakers: int


def init_acoustic_model(
    preprocess_config: PreprocessingConfig,
    model_config: AcousticENModelConfig,
    n_speakers: int = 10,
) -> Tuple[AcousticModel, AcousticModelConfig]:
    r"""Function to initialize an `AcousticModel` with given preprocessing and model configurations.

    Args:
        preprocess_config (PreprocessingConfig): Configuration object for pre-processing.
        model_config (AcousticENModelConfig): Configuration object for the English acoustic model.
        n_speakers (int, optional): Number of speakers. Defaults to 10.

    Returns:
        Tuple[AcousticModel, AcousticModelConfig]: The initialized `AcousticModel` and the configuration used to build it.

    The function creates an `AcousticModelConfig` instance which is then used to initialize the `AcousticModel`.
    The `AcousticModelConfig` is configured as follows:
    - preprocess_config: Pre-processing configuration.
    - model_config: English acoustic model configuration.
    - n_speakers: Number of speakers.
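
    Example (illustrative sketch; assumes the test configs from `get_test_configs`):
        preprocess_config, model_config, _ = get_test_configs()
        acoustic_model, acoustic_model_config = init_acoustic_model(
            preprocess_config, model_config, n_speakers=10
        )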
""" | |
# Create an AcousticModelConfig instance | |
acoustic_model_config = AcousticModelConfig( | |
preprocess_config=preprocess_config, | |
model_config=model_config, | |
n_speakers=n_speakers, | |
) | |
model = AcousticModel(**vars(acoustic_model_config)) | |
return model, acoustic_model_config | |


@dataclass
class ForwardTrainParams:
    x: torch.Tensor
    speakers: torch.Tensor
    src_lens: torch.Tensor
    mels: torch.Tensor
    mel_lens: torch.Tensor
    enc_len: torch.Tensor
    pitches: torch.Tensor
    pitches_range: Tuple[float, float]
    energies: torch.Tensor
    langs: torch.Tensor
    attn_priors: torch.Tensor
    use_ground_truth: bool = True


def init_forward_trains_params(
    model_config: AcousticENModelConfig,
    acoustic_pretraining_config: AcousticPretrainingConfig,
    preprocess_config: PreprocessingConfig,
    n_speakers: int = 10,
) -> ForwardTrainParams:
    r"""Function to initialize the parameters for forward propagation during training.

    Args:
        model_config (AcousticENModelConfig): Configuration object for the English acoustic model.
        acoustic_pretraining_config (AcousticPretrainingConfig): Configuration object for acoustic pretraining.
        preprocess_config (PreprocessingConfig): Configuration object for pre-processing.
        n_speakers (int, optional): Number of speakers. Defaults to 10.

    Returns:
        ForwardTrainParams: Initialized parameters for forward propagation during training.

    The function initializes the ForwardTrainParams object with the following parameters:
    - x: Tensor containing the input sequences. Shape: [speaker_embed_dim, batch_size]
    - speakers: Tensor containing the speaker indices. Shape: [speaker_embed_dim, batch_size]
    - src_lens: Tensor containing the lengths of source sequences. Shape: [speaker_embed_dim]
    - mels: Tensor containing the mel spectrograms. Shape: [speaker_embed_dim, stft.n_mel_channels, encoder.n_hidden]
    - mel_lens: Tensor containing the lengths of mel sequences. Shape: [speaker_embed_dim]
    - enc_len: Tensor containing the lengths of encoder sequences. Shape: [speaker_embed_dim]
    - pitches: Tensor containing the pitch values. Shape: [speaker_embed_dim, encoder.n_hidden]
    - pitches_range: Tuple holding the minimum and maximum pitch values.
    - energies: Tensor containing the energy values. Shape: [speaker_embed_dim, 1, encoder.n_hidden]
    - langs: Tensor containing the language indices. Shape: [speaker_embed_dim, batch_size]
    - attn_priors: Tensor containing the attention priors. Shape: [speaker_embed_dim, speaker_embed_dim, batch_size]
    - use_ground_truth: Boolean flag indicating whether ground-truth values should be used.

    All tensors are initialized with random values.
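
    Example (illustrative sketch; assumes the test configs from `get_test_configs`):
        preprocess_config, model_config, pretraining_config = get_test_configs()
        params = init_forward_trains_params(model_config, pretraining_config, preprocess_config)
        # params.x has shape [model_config.speaker_embed_dim, pretraining_config.batch_size]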
""" | |
return ForwardTrainParams( | |
# x: Tensor containing the input sequences. Shape: [speaker_embed_dim, batch_size] | |
x=torch.randint( | |
1, | |
255, | |
( | |
model_config.speaker_embed_dim, | |
acoustic_pretraining_config.batch_size, | |
), | |
), | |
pitches_range=(0.0, 1.0), | |
# speakers: Tensor containing the speaker indices. Shape: [speaker_embed_dim, batch_size] | |
speakers=torch.randint( | |
1, | |
n_speakers - 1, | |
( | |
model_config.speaker_embed_dim, | |
acoustic_pretraining_config.batch_size, | |
), | |
), | |
# src_lens: Tensor containing the lengths of source sequences. Shape: [speaker_embed_dim] | |
src_lens=torch.cat( | |
[ | |
# torch.tensor([self.model_config.speaker_embed_dim]), | |
torch.randint( | |
1, | |
acoustic_pretraining_config.batch_size + 1, | |
(model_config.speaker_embed_dim,), | |
), | |
], | |
dim=0, | |
), | |
# mels: Tensor containing the mel spectrogram. Shape: [batch_size, stft.n_mel_channels, encoder.n_hidden] | |
mels=torch.randn( | |
model_config.speaker_embed_dim, | |
preprocess_config.stft.n_mel_channels, | |
model_config.encoder.n_hidden, | |
), | |
# enc_len: Tensor containing the lengths of mel sequences. Shape: [speaker_embed_dim] | |
enc_len=torch.cat( | |
[ | |
torch.randint( | |
1, | |
model_config.speaker_embed_dim, | |
(model_config.speaker_embed_dim - 1,), | |
), | |
torch.tensor([model_config.speaker_embed_dim]), | |
], | |
dim=0, | |
), | |
# mel_lens: Tensor containing the lengths of mel sequences. Shape: [batch_size] | |
mel_lens=torch.cat( | |
[ | |
torch.randint( | |
1, | |
model_config.speaker_embed_dim, | |
(model_config.speaker_embed_dim - 1,), | |
), | |
torch.tensor([model_config.speaker_embed_dim]), | |
], | |
dim=0, | |
), | |
# pitches: Tensor containing the pitch values. Shape: [batch_size, speaker_embed_dim, encoder.n_hidden] | |
pitches=torch.randn( | |
# acoustic_pretraining_config.batch_size, | |
model_config.speaker_embed_dim, | |
# model_config.speaker_embed_dim, | |
model_config.encoder.n_hidden, | |
), | |
# energies: Tensor containing the energy values. Shape: [batch_size, speaker_embed_dim, encoder.n_hidden] | |
energies=torch.randn( | |
model_config.speaker_embed_dim, | |
1, | |
model_config.encoder.n_hidden, | |
), | |
# langs: Tensor containing the language indices. Shape: [speaker_embed_dim, batch_size] | |
langs=torch.randint( | |
1, | |
len(SUPPORTED_LANGUAGES) - 1, | |
( | |
model_config.speaker_embed_dim, | |
acoustic_pretraining_config.batch_size, | |
), | |
), | |
# attn_priors: Tensor containing the attention priors. Shape: [batch_size, speaker_embed_dim, speaker_embed_dim] | |
attn_priors=torch.randn( | |
model_config.speaker_embed_dim, | |
model_config.speaker_embed_dim, | |
acoustic_pretraining_config.batch_size, | |
), | |
use_ground_truth=True, | |
) | |


def init_mask_input_embeddings_encoding_attn_mask(
    acoustic_model: AcousticModel,
    forward_train_params: ForwardTrainParams,
    model_config: AcousticENModelConfig,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    r"""Function to initialize masks for padding positions, input sequences, embeddings, positional encoding and attention masks.

    Args:
        acoustic_model (AcousticModel): Initialized acoustic model.
        forward_train_params (ForwardTrainParams): Parameters for the forward training process.
        model_config (AcousticENModelConfig): Configuration object for the English acoustic model.

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: A tuple containing the following elements:
        - src_mask: Tensor containing the masks for padding positions in the source sequences. Shape: [1, batch_size]
        - x: Tensor containing the input sequences. Shape: [speaker_embed_dim, batch_size, speaker_embed_dim]
        - embeddings: Tensor containing the embeddings. Shape: [speaker_embed_dim, batch_size, speaker_embed_dim + lang_embed_dim]
        - encoding: Tensor containing the positional encoding. Shape: [lang_embed_dim, max(forward_train_params.mel_lens), model_config.encoder.n_hidden]
        - attn_mask: Tensor containing the attention masks. Shape: [1, 1, 1, batch_size]

    The function starts by generating masks for padding positions in the source and mel sequences.
    Then, it uses the acoustic model to get the input sequences and embeddings.
    Finally, it computes the positional encoding and reshapes the source mask into an attention mask.
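
    Example (illustrative sketch; assumes the helpers defined above in this module):
        preprocess_config, model_config, pretraining_config = get_test_configs()
        acoustic_model, _ = init_acoustic_model(preprocess_config, model_config)
        params = init_forward_trains_params(model_config, pretraining_config, preprocess_config)
        src_mask, x, embeddings, encoding, attn_mask = init_mask_input_embeddings_encoding_attn_mask(
            acoustic_model, params, model_config
        )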
""" | |
# Generate masks for padding positions in the source sequences and mel sequences | |
# src_mask: Tensor containing the masks for padding positions in the source sequences. Shape: [1, batch_size] | |
src_mask = tools.get_mask_from_lengths(forward_train_params.src_lens) | |
# x: Tensor containing the input sequences. Shape: [speaker_embed_dim, batch_size, speaker_embed_dim] | |
# embeddings: Tensor containing the embeddings. Shape: [speaker_embed_dim, batch_size, speaker_embed_dim + lang_embed_dim] | |
x, embeddings = acoustic_model.get_embeddings( | |
token_idx=forward_train_params.x, | |
speaker_idx=forward_train_params.speakers, | |
src_mask=src_mask, | |
lang_idx=forward_train_params.langs, | |
) | |
# encoding: Tensor containing the positional encoding | |
# Shape: [lang_embed_dim, max(forward_train_params.mel_lens), encoder.n_hidden] | |
encoding = positional_encoding( | |
model_config.encoder.n_hidden, | |
max(x.shape[1], int(forward_train_params.mel_lens.max().item())), | |
) | |
attn_mask = src_mask.view((src_mask.shape[0], 1, 1, src_mask.shape[1])) | |
return src_mask, x, embeddings, encoding, attn_mask | |