File size: 3,994 Bytes
14d1720
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168

dataset:
  train:
    wav_scp: './train/wav.scp'
    mel_scp: './train/mel.scp'
    dur_scp: './train/dur.scp'
    # Each emb_typeN entry declares one embedding input stream (name, scp
    # index file, and an optional vocab file).
    emb_type1:
      _name: 'pinyin'
      scp: './train/py.scp'
      vocab: 'py.vocab'
    emb_type2:
      _name: 'graphic'
      scp: './train/gp.scp'
      vocab: 'gp.vocab'
    emb_type3:
      _name: 'speaker'
      scp: './train/spk.scp'
      vocab: null # doesn't need a vocab
    # NOTE: you can add more embeddings here without changing the code.
  # NOTE: eval is not used for now, i.e., just training, no evaluation.
  # You can use synthesize.py to check the training goes well, for now.
  eval: null



training:
  # Presumably batch_split divides batch_size into micro-batches
  # (1024 / 64 = 16 samples per forward pass) — TODO confirm in the trainer.
  batch_size: 1024
  batch_split: 64

  epochs: 100000
  grad_clip_thresh: 1.0
  acc_step: 1
  checkpoint_path: "./checkpoints/"
  log_path: "./log/"
  checkpoint_step: 5000  # save a checkpoint every N steps
  synth_step: 5000  # synthesize a check sample every N steps
  log_step: 20  # log every N steps
  num_workers: 8  # dataloader worker processes

  # NOTE(review): the eval dataset above is empty/unused, so this interval
  # may have no effect — verify against the training loop.
  evaluation_step: 1000

optimizer:
  # NOTE: if you use SGD, `params` must change too, as SGD takes different
  # constructor arguments.
  type: Adam
  n_warm_up_step: 2000
  lr_decrease_step: 10000
  # NOTE(review): this was left empty in the original file and therefore
  # parses as null — if step decay is actually applied, set an explicit
  # factor (e.g. 0.5) and verify against the LR-schedule code.
  lr_decrease_factor: null
  params:
    betas: [0.9, 0.98]
    # !!float tags are required: PyYAML (YAML 1.1) reads bare "1e-9"
    # (no dot before the exponent) as a string, not a float.
    eps: !!float 1e-9
    weight_decay: !!float 0.0
    lr: !!float 1e-4
lr_scheduler:
  type: CyclicLR
  params:
    base_lr: !!float 1e-8
    max_lr: !!float 1e-6
    step_size_up: 5000
    step_size_down: 5000
    # Keep disabled for Adam: torch's CyclicLR momentum cycling expects an
    # optimizer with a `momentum` param, which Adam does not have.
    cycle_momentum: false

fbank: # this is used for wav2mel.py
  sample_rate: 22050
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  max_wav_value: 32768.0  # 2^15 — presumably the 16-bit PCM normalization divisor; confirm in wav2mel.py
  n_mels: 80
  fmin: 0.0
  fmax: 8000.0 # should be 11025 ? (Nyquist is 11025 Hz at 22050 Hz sample rate)
  mel_mean: -6.0304103  # presumably a dataset-level mel mean used for normalization — TODO confirm

encoder:
    encoder_type: 'FS2TransformerEncoder'
    conf:
      n_layers: 4  # number of transformer layers
      n_heads: 2  # attention heads per layer
      hidden_dim: 256  # must match decoder input_dim below
      dropout: 0.25
      d_inner: 1024  # feed-forward inner dimension
      max_len: 2048  # max sequence length, for position-embedding pre-computation

decoder:
    decoder_type: 'FS2TransformerDecoder'
    input_dim: 256 # should be the same as the output of encoder
    n_layers: 4
    n_heads: 2
    hidden_dim: 256
    d_inner: 1024
    dropout: 0.25
    max_len: 2048 # max len of seq, for position embedding pre-computation

# Reference signature of the Decoder class (its default dropout is 0.5; overridden to 0.25 above):
# Decoder(input_dim: int = 256, n_layers: int = 4, n_heads: int = 2,
#         hidden_dim: int = 256, d_inner: int = 1024, dropout: float = 0.5, max_len: int = 2048)
# Post-network applied after the decoder to refine the mel output.
postnet:
  postnet_type: 'PostUNet' # one of: 'PostUNet', 'PostNet1d'

speaker_embedding:
  enable: true
  # NOTE(review): vocab was left empty in the original (parses as null);
  # speaker lookup appears to rely on vocab_size instead — confirm against
  # the embedding code.
  vocab: null
  vocab_size: 218 # aishell3 has 218 speakers
  weight: 1.0 # you can play with the weight here
  dim: 256

# NOTE(review): key name is misspelled ("utterance") but kept as-is because
# the code presumably reads this exact key — rename in code and config together.
utterence_embedding:
  enable: false # not implemented
  type: 'lstm' # alternative: resnet
  feature_config:
    type: 'mel'
    n_mels: 80
    sampling_rate: 22050
    n_fft: 1024
    hop_length: 256
    win_length: 1024

hanzi_embedding:
  enable: true
  type: embedding
  vocab: './gp.vocab'
  dim: 256
  weight: 0.5 # you can play with the weight here
  max_seq_len: 100

pinyin_embedding:
  enable: true
  type: embedding
  vocab: './py.vocab'
  dim: 256
  weight: 1.0
  max_seq_len: 100

duration_predictor:
  input_dim: 256 # should be the same as encoder hidden_dim
  filter_size: 256
  kernel_size: 3
  dropout: 0.15 # important to set dropout here
  duration_mean: 21.517294924096635 # for aishell3

f0_predictor:
  enable: false # currently not supported
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  n_bins: 256

vocoder:
  type: VocGan # choose one of the sections below
  # NOTE(review): YAML does not expand '~' — the loading code must call
  # expanduser() on these paths; confirm. Paths are quoted because a leading
  # '~' is also YAML's null sigil, so quoting avoids any parser ambiguity.
  MelGAN:
    checkpoint: '~/checkpoints/melgan/melgan_ljspeech.pth'
    config: '~/checkpoints/melgan/default.yaml'
    device: cpu
  VocGan:
    checkpoint: '~/checkpoints/vctk_pretrained_model_3180.pt' # alt: '~/checkpoints/ljspeech_29de09d_4000.pt'
    denoise: true
    device: cpu
  HiFiGAN:
    checkpoint: '~/checkpoints/VCTK_V3/generator_v3' # you need to download the checkpoint and set the path here
    device: cpu
  Waveglow:
    checkpoint: '~/checkpoints/waveglow_256channels_universal_v5_state_dict.pt'
    sigma: 1.0
    denoiser_strength: 0.0 # try 0.1
    device: cpu # try cpu if out of memory



synthesis:
  normalize: true # normalize the sound volume