from dataclasses import dataclass @dataclass class HParams: ### Signal Processing (used in both synthesizer and vocoder) sample_rate = 16000 n_fft = 800 num_mels = 80 hop_size = 200 """Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)""" win_size = 800 """Tacotron uses 50 ms frame length (set to sample_rate * 0.050)""" fmin = 55 min_level_db = -100 ref_level_db = 20 max_abs_value = 4.0 """Gradient explodes if too big, premature convergence if too small.""" preemphasis = 0.97 """Filter coefficient to use if preemphasize is True""" preemphasize = True ### Tacotron Text-to-Speech (TTS) tts_embed_dims = 512 """Embedding dimension for the graphemes/phoneme inputs""" tts_encoder_dims = 256 tts_decoder_dims = 128 tts_postnet_dims = 512 tts_encoder_K = 5 tts_lstm_dims = 1024 tts_postnet_K = 5 tts_num_highways = 4 tts_dropout = 0.5 tts_cleaner_names = ["basic_cleaners"] tts_stop_threshold = -3.4 """ Value below which audio generation ends. For example, for a range of [-4, 4], this will terminate the sequence at the first frame that has all values < -3.4 """ ### Tacotron Training tts_schedule = [ (2, 1e-3, 10_000, 12), (2, 5e-4, 15_000, 12), (2, 2e-4, 20_000, 12), (2, 1e-4, 30_000, 12), (2, 5e-5, 40_000, 12), (2, 1e-5, 60_000, 12), (2, 5e-6, 160_000, 12), (2, 3e-6, 320_000, 12), (2, 1e-6, 640_000, 12), ] """ Progressive training schedule (r, lr, step, batch_size) r = reduction factor (# of mel frames synthesized for each decoder iteration) lr = learning rate """ tts_clip_grad_norm = 1.0 """clips the gradient norm to prevent explosion - set to None if not needed""" tts_eval_interval = 500 """ Number of steps between model evaluation (sample generation) Set to -1 to generate after completing epoch, or 0 to disable """ tts_eval_num_samples = 1 """Makes this number of samples""" tts_finetune_layers = [] """For finetune usage, if set, only selected layers will be trained, available: encoder,encoder_proj,gst,decoder,postnet,post_proj""" ### Data Preprocessing max_mel_frames = 900 rescale = True rescaling_max = 0.9 synthesis_batch_size = 16 """For vocoder preprocessing and inference.""" ### Mel Visualization and Griffin-Lim signal_normalization = True power = 1.5 griffin_lim_iters = 60 ### Audio processing options fmax = 7600 """Should not exceed (sample_rate // 2)""" allow_clipping_in_normalization = True """Used when signal_normalization = True""" clip_mels_length = True """If true, discards samples exceeding max_mel_frames""" use_lws = False """Fast spectrogram phase recovery using local weighted sums""" symmetric_mels = True """Sets mel range to [-max_abs_value, max_abs_value] if True, and [0, max_abs_value] if False""" trim_silence = True """Use with sample_rate of 16000 for best results""" ### SV2TTS speaker_embedding_size = 256 """Dimension for the speaker embedding""" silence_min_duration_split = 0.4 """Duration in seconds of a silence for an utterance to be split""" utterance_min_duration = 1.6 """Duration in seconds below which utterances are discarded""" use_gst = True """Whether to use global style token""" use_ser_for_gst = True """Whether to use speaker embedding referenced for global style token""" hparams = HParams()