from dataclasses import dataclass


@dataclass
class HParams:
    ### Signal Processing (used in both synthesizer and vocoder)
    sample_rate = 16000
    n_fft = 800
    num_mels = 80
    hop_size = 200
    """Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)"""
    win_size = 800
    """Tacotron uses 50 ms frame length (set to sample_rate * 0.050)"""
    fmin = 55
    min_level_db = -100
    ref_level_db = 20
    max_abs_value = 4.0
    """Gradient explodes if too big, premature convergence if too small."""
    preemphasis = 0.97
    """Filter coefficient to use if preemphasize is True"""
    preemphasize = True
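    # Hedged sketch (illustrative, not part of this file): pre-emphasis is
    # commonly applied as the first-order filter y[n] = x[n] - preemphasis * x[n-1],
    # e.g. scipy.signal.lfilter([1, -hparams.preemphasis], [1], wav).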

    ### Tacotron Text-to-Speech (TTS)
    tts_embed_dims = 512
    """Embedding dimension for the graphemes/phoneme inputs"""
    tts_encoder_dims = 256
    tts_decoder_dims = 128
    tts_postnet_dims = 512
    tts_encoder_K = 5
    tts_lstm_dims = 1024
    tts_postnet_K = 5
    tts_num_highways = 4
    tts_dropout = 0.5
    tts_cleaner_names = ["basic_cleaners"]
    tts_stop_threshold = -3.4
    """
    Value below which audio generation ends.
    For example, for a range of [-4, 4], this
    will terminate the sequence at the first
    frame that has all values < -3.4
    """

    ### Tacotron Training
    tts_schedule = [
        (2, 1e-3, 10_000, 12),
        (2, 5e-4, 15_000, 12),
        (2, 2e-4, 20_000, 12),
        (2, 1e-4, 30_000, 12),
        (2, 5e-5, 40_000, 12),
        (2, 1e-5, 60_000, 12),
        (2, 5e-6, 160_000, 12),
        (2, 3e-6, 320_000, 12),
        (2, 1e-6, 640_000, 12),
    ]
    """
    Progressive training schedule
    (r, lr, step, batch_size)
    r = reduction factor (# of mel frames synthesized for each decoder iteration)
    lr = learning rate
    """

    tts_clip_grad_norm = 1.0
    """clips the gradient norm to prevent explosion - set to None if not needed"""
    tts_eval_interval = 500
    """
    Number of steps between model evaluation (sample generation)
    Set to -1 to generate after completing epoch, or 0 to disable
    """
    tts_eval_num_samples = 1
    """Makes this number of samples"""
    tts_finetune_layers = []
    """For finetune usage, if set, only selected layers will be trained, available: encoder,encoder_proj,gst,decoder,postnet,post_proj"""

    ### Data Preprocessing
    max_mel_frames = 900
    rescale = True
    rescaling_max = 0.9
    synthesis_batch_size = 16
    """For vocoder preprocessing and inference."""

    ### Mel Visualization and Griffin-Lim
    signal_normalization = True
    power = 1.5
    griffin_lim_iters = 60

    ### Audio processing options
    fmax = 7600
    """Should not exceed (sample_rate // 2)"""
    allow_clipping_in_normalization = True
    """Used when signal_normalization = True"""
    clip_mels_length = True
    """If true, discards samples exceeding max_mel_frames"""
    use_lws = False
    """Fast spectrogram phase recovery using local weighted sums"""
    symmetric_mels = True
    """Sets mel range to [-max_abs_value, max_abs_value] if True, and [0, max_abs_value] if False"""
    trim_silence = True
    """Use with sample_rate of 16000 for best results"""

    ### SV2TTS
    speaker_embedding_size = 256
    """Dimension for the speaker embedding"""
    silence_min_duration_split = 0.4
    """Duration in seconds of a silence for an utterance to be split"""
    utterance_min_duration = 1.6
    """Duration in seconds below which utterances are discarded"""
    use_gst = True
    """Whether to use global style token"""
    use_ser_for_gst = True
    """Whether to use speaker embedding referenced for global style token"""


hparams = HParams()
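

# Hedged usage sketch (illustrative only, not part of the original module):
# downstream code reads attributes from the module-level instance, e.g. to
# recover the frame timing implied by hop_size, win_size and sample_rate.
if __name__ == "__main__":
    frame_shift_ms = 1000 * hparams.hop_size / hparams.sample_rate   # 200 / 16000 -> 12.5 ms
    frame_length_ms = 1000 * hparams.win_size / hparams.sample_rate  # 800 / 16000 -> 50.0 ms
    print(f"frame shift: {frame_shift_ms} ms, frame length: {frame_length_ms} ms")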