dataset:
  train:
    wav_scp: './train/wav.scp'
    mel_scp: './train/mel.scp'
    dur_scp: './train/dur.scp'
    emb_type1:
      _name: 'pinyin'
      scp: './train/py.scp'
      vocab: 'py.vocab'
    emb_type2:
      _name: 'graphic'
      scp: './train/gp.scp'
      vocab: 'gp.vocab'

    emb_type3:
      _name: 'speaker'
      scp: './train/spk.scp'
      vocab: # doesn't need a vocab
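    # Note (assumption): the *.scp files are taken to be Kaldi-style script files,
    # one "<utt_id> <path-or-value>" entry per line (e.g. wav.scp pointing at audio,
    # mel.scp at pre-computed mel features). The exact layout depends on this
    # repo's data-preparation scripts.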

training:
  batch_size: 16
  batch_split: 1
  epochs: 10000
  grad_clip_thresh: 1.0
  acc_step: 1
  checkpoint_path: "./checkpoints/"
  log_path: "./log/"
  checkpoint_step: 5000
  synth_step: 5000
  log_step: 20
  num_workers: 8

  evaluation_step: 1000
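  # With batch_size 16, batch_split 1 and acc_step 1, each optimizer step sees
  # 16 utterances. acc_step > 1 presumably accumulates gradients over that many
  # forward passes before stepping, and batch_split likely splits a batch into
  # smaller chunks, both trading speed for lower memory use.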

optimizer:
    type: Adam
    n_warm_up_step: 2000
    #lr_decrease_step: 10000
    #lr_decrease_factor:
    params:
      betas: [0.9,0.98]
      eps: !!float 1e-9
      weight_decay: !!float 0.0
      lr: !!float  1e-4
lr_scheduler:
    type: CyclicLR
    params:
      base_lr: !!float 1e-7
      max_lr: !!float 1e-4
      step_size_up: 5000
      step_size_down: 8000
      cycle_momentum: False
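# Illustrative only (assumed mapping, not part of the config): these params are
# expected to be forwarded to the matching PyTorch classes, roughly
#   torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98),
#                    eps=1e-9, weight_decay=0.0)
#   torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=1e-7, max_lr=1e-4,
#                                     step_size_up=5000, step_size_down=8000,
#                                     cycle_momentum=False)
# How n_warm_up_step interacts with the cyclic schedule depends on the training loop.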

vocoder:
  type: VocGan # choose one of the following
  MelGAN:
    checkpoint: ~/checkpoints/melgan/melgan_ljspeech.pth
    config: ~/checkpoints/melgan/default.yaml
    device: cpu
  VocGan:
    checkpoint: checkpoints #~/checkpoints/ljspeech_29de09d_4000.pt
    denoise: True
    device: cpu
  HiFiGAN:
    checkpoint: ~/checkpoints/VCTK_V3/generator_v3  # download the checkpoint and set its path here
    device: cpu
  Waveglow:
    checkpoint:  ~/checkpoints/waveglow_256channels_universal_v5_state_dict.pt
    sigma: 1.0
    denoiser_strength: 0.0 # try 0.1
    device: cpu # use cpu if the GPU runs out of memory
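  # Only the vocoder named in `type` above should be loaded at synthesis time;
  # the other entries are presumably kept here so that switching vocoders only
  # requires changing `type`.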

fbank:
  sample_rate: 16000
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  max_wav_value: 32768.0
  n_mels: 80
  fmin: 0.0
  fmax: 8000.0 # 8000 is the Nyquist limit at 16 kHz; 11025 would apply to 22.05 kHz audio
  mel_mean: -6.0304103
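  # At sample_rate 16000 with hop_length 256 each mel frame advances by 16 ms
  # (16000 / 256 = 62.5 frames per second); win_length 1024 is a 64 ms window.
  # mel_mean is presumably used to normalize the target mel spectrograms.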

encoder:
    encoder_type: 'FS2TransformerEncoder'
    conf:
      n_layers: 4
      n_heads: 2
      hidden_dim: 256
      dropout: 0.25
      d_inner: 1024
      max_len: 2048

decoder:
    decoder_type: 'FS2TransformerDecoder'
    input_dim: 256 # should be the same as the encoder output dimension (hidden_dim)
    n_layers: 4
    n_heads: 2
    hidden_dim: 256
    d_inner: 1024
    dropout: 0.25
    max_len: 2048 # max sequence length, used to pre-compute positional embeddings
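# Both encoder and decoder are 4-layer, 2-head Transformers with hidden_dim 256;
# d_inner is presumably the size of the position-wise feed-forward layer, and
# max_len bounds the pre-computed positional-embedding table.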

# (class) Decoder(input_dim: int = 256, n_layers: int = 4, n_heads: int = 2,
#   hidden_dim: int = 256, d_inner: int = 1024, dropout: float = 0.5, max_len: int = 2048)
postnet:
  postnet_type: 'PostUNet' # one of: 'PostUNet', 'PostNet1d'
speaker_embedding:
    enable: True
    vocab: #None
    vocab_size: 1
    weight: 1.0
    dim: 256
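    # vocab is left empty because speaker IDs are read directly from spk.scp in
    # the dataset section; vocab_size 1 presumably corresponds to single-speaker
    # training.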

utterence_embedding:
    enable: False
    type: 'lstm' # or 'resnet'
    feature_config:
      type: 'mel'
      n_mels: 80
      sampling_rate: 22050
      n_fft: 1024
      hop_length: 256
      win_length: 1024

    model_config:
      n_layers: 3
      bidirectional: True

hanzi_embedding:
    enable: True
    type: embedding
    vocab: './gp.vocab'
    dim: 256
    weight: 0.5
    max_seq_len: 100

pinyin_embedding:
    enable: True
    type: embedding
    vocab: './py.vocab'
    dim: 256
    weight: 1.0
    max_seq_len: 100
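    # hanzi_embedding (graphic, i.e. character tokens) and pinyin_embedding are
    # both 256-dim to match the encoder hidden_dim; `weight` is presumably the
    # scale applied when the two streams are combined at the encoder input.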

duration_predictor:
  input_dim: 256 # should be the same as the encoder hidden_dim
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  duration_mean: 21.517294924096635 # for AISHELL-3
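  # duration_mean is presumably the dataset-level average number of mel frames
  # per input token (computed on AISHELL-3), used to offset or denormalize the
  # predicted durations.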

f0_predictor:
  enable: False
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  n_bins: 256
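  # When enabled, the F0 predictor presumably quantizes predicted pitch into
  # n_bins buckets and looks up a pitch embedding, as in FastSpeech 2; it is
  # disabled here.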
synthesis:
  normalize: True # normalize the sound volume