from dataclasses import dataclass, field from typing import List, Literal, Tuple, Union PreprocessLangType = Literal["english_only", "multilingual"] @dataclass class STFTConfig: filter_length: int hop_length: int win_length: int n_mel_channels: int mel_fmin: int mel_fmax: int # Base class used with the Univnet vocoder @dataclass class PreprocessingConfig: language: PreprocessLangType stft: STFTConfig sampling_rate: int = 22050 min_seconds: float = 0.5 max_seconds: float = 6.0 use_audio_normalization: bool = True workers: int = 8 @dataclass class PreprocessingConfigUnivNet(PreprocessingConfig): stft: STFTConfig = field( default_factory=lambda: STFTConfig( filter_length=1024, hop_length=256, win_length=1024, n_mel_channels=100, # univnet mel_fmin=20, mel_fmax=11025, ), ) @dataclass class PreprocessingConfigHifiGAN(PreprocessingConfig): stft: STFTConfig = field( default_factory=lambda: STFTConfig( filter_length=1024, hop_length=256, win_length=1024, n_mel_channels=80, # For univnet 100 mel_fmin=20, mel_fmax=11025, ), ) def __post_init__(self): r"""It modifies the 'stft' attribute based on the 'sampling_rate' attribute. If 'sampling_rate' is 44100, 'stft' is set with specific values for this rate. If 'sampling_rate' is not 22050 or 44100, a ValueError is raised. Raises: ValueError: If 'sampling_rate' is not 22050 or 44100. """ if self.sampling_rate == 44100: self.stft = STFTConfig( filter_length=2048, hop_length=512, # NOTE: 441 ?? https://github.com/jik876/hifi-gan/issues/116#issuecomment-1436999858 win_length=2048, n_mel_channels=80, # Based on https://github.com/jik876/hifi-gan/issues/116 mel_fmin=20, mel_fmax=11025, ) if self.sampling_rate not in [22050, 44100]: raise ValueError("Sampling rate must be 22050 or 44100") @dataclass class AcousticTrainingOptimizerConfig: learning_rate: float weight_decay: float lr_decay: float betas: Tuple[float, float] = (0.9, 0.98) eps: float = 0.000000001 grad_clip_thresh: float = 1.0 warm_up_step: float = 4000 anneal_steps: List[int] = field(default_factory=list) anneal_rate: float = 0.3 @dataclass class AcousticFinetuningConfig: batch_size = 5 grad_acc_step = 3 train_steps = 30000 log_step = 100 synth_step = 250 val_step = 4000 save_step = 250 freeze_bert_until = 0 mcd_gen_max_samples = 400 only_train_speaker_until = 5000 optimizer_config: AcousticTrainingOptimizerConfig = field( default_factory=lambda: AcousticTrainingOptimizerConfig( learning_rate=0.0002, weight_decay=0.001, lr_decay=0.99999, ), ) @dataclass class AcousticPretrainingConfig: batch_size = 5 grad_acc_step = 5 train_steps = 500000 log_step = 20 synth_step = 250 val_step = 4000 save_step = 1000 freeze_bert_until = 4000 mcd_gen_max_samples = 400 only_train_speaker_until = 0 optimizer_config: AcousticTrainingOptimizerConfig = field( default_factory=lambda: AcousticTrainingOptimizerConfig( learning_rate=0.0002, weight_decay=0.01, lr_decay=1.0, ), ) AcousticTrainingConfig = Union[AcousticFinetuningConfig, AcousticPretrainingConfig] @dataclass class ConformerConfig: n_layers: int n_heads: int n_hidden: int p_dropout: float kernel_size_conv_mod: int kernel_size_depthwise: int with_ff: bool @dataclass class ReferenceEncoderConfig: bottleneck_size_p: int bottleneck_size_u: int ref_enc_filters: List[int] ref_enc_size: int ref_enc_strides: List[int] ref_enc_pad: List[int] ref_enc_gru_size: int ref_attention_dropout: float token_num: int predictor_kernel_size: int @dataclass class VarianceAdaptorConfig: n_hidden: int kernel_size: int emb_kernel_size: int p_dropout: float n_bins: int @dataclass class AcousticLossConfig: ssim_loss_alpha: float mel_loss_alpha: float aligner_loss_alpha: float pitch_loss_alpha: float energy_loss_alpha: float u_prosody_loss_alpha: float p_prosody_loss_alpha: float dur_loss_alpha: float binary_align_loss_alpha: float binary_loss_warmup_epochs: int @dataclass class AcousticENModelConfig: speaker_embed_dim: int = 1024 lang_embed_dim: int = 1 encoder: ConformerConfig = field( default_factory=lambda: ConformerConfig( n_layers=6, n_heads=8, n_hidden=512, p_dropout=0.1, kernel_size_conv_mod=7, kernel_size_depthwise=7, with_ff=True, ), ) decoder: ConformerConfig = field( default_factory=lambda: ConformerConfig( n_layers=6, n_heads=8, n_hidden=512, p_dropout=0.1, kernel_size_conv_mod=11, kernel_size_depthwise=11, with_ff=True, ), ) reference_encoder: ReferenceEncoderConfig = field( default_factory=lambda: ReferenceEncoderConfig( bottleneck_size_p=4, bottleneck_size_u=256, ref_enc_filters=[32, 32, 64, 64, 128, 128], ref_enc_size=3, ref_enc_strides=[1, 2, 1, 2, 1], ref_enc_pad=[1, 1], ref_enc_gru_size=32, ref_attention_dropout=0.2, token_num=32, predictor_kernel_size=5, ), ) variance_adaptor: VarianceAdaptorConfig = field( default_factory=lambda: VarianceAdaptorConfig( n_hidden=512, kernel_size=5, emb_kernel_size=3, p_dropout=0.5, n_bins=256, ), ) loss: AcousticLossConfig = field( default_factory=lambda: AcousticLossConfig( ssim_loss_alpha=1.0, mel_loss_alpha=1.0, aligner_loss_alpha=1.0, pitch_loss_alpha=1.0, energy_loss_alpha=1.0, u_prosody_loss_alpha=0.25, p_prosody_loss_alpha=0.25, dur_loss_alpha=1.0, binary_align_loss_alpha=0.1, binary_loss_warmup_epochs=10, ), ) @dataclass class AcousticMultilingualModelConfig: speaker_embed_dim: int = 1024 lang_embed_dim: int = 256 encoder: ConformerConfig = field( default_factory=lambda: ConformerConfig( n_layers=6, n_heads=8, n_hidden=512, p_dropout=0.1, kernel_size_conv_mod=7, kernel_size_depthwise=7, with_ff=True, ), ) decoder: ConformerConfig = field( default_factory=lambda: ConformerConfig( n_layers=6, n_heads=8, n_hidden=512, p_dropout=0.1, kernel_size_conv_mod=11, kernel_size_depthwise=11, with_ff=True, ), ) reference_encoder: ReferenceEncoderConfig = field( default_factory=lambda: ReferenceEncoderConfig( bottleneck_size_p=4, bottleneck_size_u=256, ref_enc_filters=[32, 32, 64, 64, 128, 128], ref_enc_size=3, ref_enc_strides=[1, 2, 1, 2, 1], ref_enc_pad=[1, 1], ref_enc_gru_size=32, ref_attention_dropout=0.2, token_num=32, predictor_kernel_size=5, ), ) variance_adaptor: VarianceAdaptorConfig = field( default_factory=lambda: VarianceAdaptorConfig( n_hidden=512, kernel_size=5, emb_kernel_size=3, p_dropout=0.5, n_bins=256, ), ) loss: AcousticLossConfig = field( default_factory=lambda: AcousticLossConfig( ssim_loss_alpha=1.0, mel_loss_alpha=1.0, aligner_loss_alpha=1.0, pitch_loss_alpha=1.0, energy_loss_alpha=1.0, u_prosody_loss_alpha=0.25, p_prosody_loss_alpha=0.25, dur_loss_alpha=1.0, binary_align_loss_alpha=0.1, binary_loss_warmup_epochs=10, ), ) AcousticModelConfigType = Union[AcousticENModelConfig, AcousticMultilingualModelConfig] @dataclass class VocoderBasicConfig: segment_size: int = 16384 learning_rate: float = 0.0001 adam_b1: float = 0.5 adam_b2: float = 0.9 lr_decay: float = 0.995 synth_interval: int = 250 checkpoint_interval: int = 250 stft_lamb: float = 2.5 @dataclass class VocoderPretrainingConfig(VocoderBasicConfig): batch_size: int = 14 grad_accum_steps: int = 1 train_steps: int = 1000000 stdout_interval: int = 25 validation_interval: int = 2000 @dataclass class VocoderFinetuningConfig(VocoderBasicConfig): batch_size: int = 5 grad_accum_steps: int = 3 train_steps: int = 10000 stdout_interval: int = 100 validation_interval: int = 4000 VoicoderTrainingConfig = Union[VocoderPretrainingConfig, VocoderFinetuningConfig] @dataclass class VocoderGeneratorConfig: noise_dim: int channel_size: int dilations: List[int] strides: List[int] lReLU_slope: float kpnet_conv_size: int @dataclass class VocoderMPDConfig: periods: List[int] kernel_size: int stride: int use_spectral_norm: bool lReLU_slope: float @dataclass class VocoderMRDConfig: resolutions: List[Tuple[int, int, int]] use_spectral_norm: bool lReLU_slope: float @dataclass class VocoderModelConfig: gen: VocoderGeneratorConfig = field( default_factory=lambda: VocoderGeneratorConfig( noise_dim=64, channel_size=32, dilations=[1, 3, 9, 27], strides=[8, 8, 4], lReLU_slope=0.2, kpnet_conv_size=3, ), ) mpd: VocoderMPDConfig = field( default_factory=lambda: VocoderMPDConfig( periods=[2, 3, 5, 7, 11], kernel_size=5, stride=3, use_spectral_norm=False, lReLU_slope=0.2, ), ) mrd: VocoderMRDConfig = field( default_factory=lambda: VocoderMRDConfig( resolutions=[(1024, 120, 600), (2048, 240, 1200), (512, 50, 240)], use_spectral_norm=False, lReLU_slope=0.2, ), ) ##################### # HI-FI GAN CONFIGS # ##################### @dataclass class HifiGanPretrainingConfig(VocoderBasicConfig): segment_size: int = 16384 learning_rate: float = 0.0002 adam_b1: float = 0.8 adam_b2: float = 0.99 lr_decay: float = 0.9995 lReLU_slope: float = 0.1 l1_factor: int = 45 sampling_rate_acoustic: int = 22050 sampling_rate_vocoder: int = 44100 @dataclass class HifiGanConfig: resblock: str = "1" upsample_rates: List[int] = field( default_factory=lambda: [8, 8, 4, 2], ) upsample_kernel_sizes: List[int] = field( default_factory=lambda: [16, 16, 4, 4], ) upsample_initial_channel: int = 512 resblock_kernel_sizes: List[int] = field( default_factory=lambda: [3, 7, 11], ) resblock_dilation_sizes: List[List[int]] = field( default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]], )