Text-to-Speech
ESPnet
jp
audio
kazusam committed on
Commit
f25c520
·
1 Parent(s): 17fec5d

Upload 28 files

Browse files
Files changed (28) hide show
  1. .gitattributes +0 -2
  2. README.md +475 -0
  3. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/config.yaml +396 -0
  4. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_backward_time.png +0 -0
  5. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_fake_loss.png +0 -0
  6. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_forward_time.png +0 -0
  7. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_loss.png +0 -0
  8. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_optim_step_time.png +0 -0
  9. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_real_loss.png +0 -0
  10. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_train_time.png +0 -0
  11. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_adv_loss.png +0 -0
  12. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_backward_time.png +0 -0
  13. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_dur_loss.png +0 -0
  14. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_feat_match_loss.png +0 -0
  15. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_forward_time.png +0 -0
  16. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_kl_loss.png +0 -0
  17. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_loss.png +0 -0
  18. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_mel_loss.png +0 -0
  19. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_optim_step_time.png +0 -0
  20. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_train_time.png +0 -0
  21. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/gpu_max_cached_mem_GB.png +0 -0
  22. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/iter_time.png +0 -0
  23. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/optim0_lr0.png +0 -0
  24. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/optim1_lr0.png +0 -0
  25. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/train_time.png +0 -0
  26. exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/train.total_count.ave_3best.pth +3 -0
  27. exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/feats_stats.npz +3 -0
  28. meta.yaml +8 -0
.gitattributes CHANGED
@@ -2,7 +2,6 @@
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
  *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
@@ -22,7 +21,6 @@
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
 
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
  *.bz2 filter=lfs diff=lfs merge=lfs -text
 
5
  *.ftz filter=lfs diff=lfs merge=lfs -text
6
  *.gz filter=lfs diff=lfs merge=lfs -text
7
  *.h5 filter=lfs diff=lfs merge=lfs -text
 
21
  *.pt filter=lfs diff=lfs merge=lfs -text
22
  *.pth filter=lfs diff=lfs merge=lfs -text
23
  *.rar filter=lfs diff=lfs merge=lfs -text
 
24
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.tar.* filter=lfs diff=lfs merge=lfs -text
26
  *.tflite filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,478 @@
1
  ---
 
 
 
 
 
 
 
2
  license: cc-by-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: ja
7
+ datasets:
8
+ - amadeus
9
  license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `mio/amadeus`
15
+
16
+ This model was trained by mio using the [amadeus recipe](https://github.com/mio2333/espnet/tree/master/egs2/amadeus/tts1) in [ESPnet](https://github.com/espnet/espnet/).
17
+
18
+
19
+ ### Demo: How to use in ESPnet2
20
+
21
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
22
+ if you haven't done that already.
23
+
24
+ ```bash
25
+ cd espnet
26
+ git checkout d5b5ec7b2e77bd3e10707141818b7e6c57ac6b3f
27
+ pip install -e .
28
+ cd egs2/amadeus/tts1
29
+ ./run.sh --skip_data_prep false --skip_train true --download_model mio/amadeus
30
+ ```
31
+
32
+
33
+
34
+ ## TTS config
35
+
36
+ <details><summary>expand</summary>
37
+
38
+ ```
39
+ config: conf/tuning/finetune_vits.yaml
40
+ print_config: false
41
+ log_level: INFO
42
+ dry_run: false
43
+ iterator_type: sequence
44
+ output_dir: exp/tts_amadeus_vits_finetune_from_jsut_32_sentence
45
+ ngpu: 1
46
+ seed: 777
47
+ num_workers: 4
48
+ num_att_plot: 3
49
+ dist_backend: nccl
50
+ dist_init_method: env://
51
+ dist_world_size: null
52
+ dist_rank: null
53
+ local_rank: 0
54
+ dist_master_addr: null
55
+ dist_master_port: null
56
+ dist_launcher: null
57
+ multiprocessing_distributed: false
58
+ unused_parameters: true
59
+ sharded_ddp: false
60
+ cudnn_enabled: true
61
+ cudnn_benchmark: false
62
+ cudnn_deterministic: false
63
+ collect_stats: false
64
+ write_collected_feats: false
65
+ max_epoch: 2000
66
+ patience: null
67
+ val_scheduler_criterion:
68
+ - valid
69
+ - loss
70
+ early_stopping_criterion:
71
+ - valid
72
+ - loss
73
+ - min
74
+ best_model_criterion:
75
+ - - train
76
+ - total_count
77
+ - max
78
+ keep_nbest_models: 3
79
+ nbest_averaging_interval: 0
80
+ grad_clip: -1
81
+ grad_clip_type: 2.0
82
+ grad_noise: false
83
+ accum_grad: 1
84
+ no_forward_run: false
85
+ resume: true
86
+ train_dtype: float32
87
+ use_amp: false
88
+ log_interval: 50
89
+ use_matplotlib: true
90
+ use_tensorboard: true
91
+ create_graph_in_tensorboard: false
92
+ use_wandb: true
93
+ wandb_project: amadeus
94
+ wandb_id: null
95
+ wandb_entity: null
96
+ wandb_name: null
97
+ wandb_model_log_interval: -1
98
+ detect_anomaly: false
99
+ pretrain_path: null
100
+ init_param:
101
+ - downloads/f3698edf589206588f58f5ec837fa516/exp/tts_train_vits_raw_phn_jaconv_pyopenjtalk_accent_with_pause/train.total_count.ave_10best.pth:tts:tts
102
+ ignore_init_mismatch: false
103
+ freeze_param: []
104
+ num_iters_per_epoch: null
105
+ batch_size: 20
106
+ valid_batch_size: null
107
+ batch_bins: 5000000
108
+ valid_batch_bins: null
109
+ train_shape_file:
110
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/text_shape.phn
111
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/speech_shape
112
+ valid_shape_file:
113
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/text_shape.phn
114
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/speech_shape
115
+ batch_type: numel
116
+ valid_batch_type: null
117
+ fold_length:
118
+ - 150
119
+ - 204800
120
+ sort_in_batch: descending
121
+ sort_batch: descending
122
+ multiple_iterator: false
123
+ chunk_length: 500
124
+ chunk_shift_ratio: 0.5
125
+ num_cache_chunks: 1024
126
+ train_data_path_and_name_and_type:
127
+ - - dump/22k/raw/train/text
128
+ - text
129
+ - text
130
+ - - dump/22k/raw/train/wav.scp
131
+ - speech
132
+ - sound
133
+ valid_data_path_and_name_and_type:
134
+ - - dump/22k/raw/dev/text
135
+ - text
136
+ - text
137
+ - - dump/22k/raw/dev/wav.scp
138
+ - speech
139
+ - sound
140
+ allow_variable_data_keys: false
141
+ max_cache_size: 0.0
142
+ max_cache_fd: 32
143
+ valid_max_cache_size: null
144
+ optim: adamw
145
+ optim_conf:
146
+ lr: 0.0001
147
+ betas:
148
+ - 0.8
149
+ - 0.99
150
+ eps: 1.0e-09
151
+ weight_decay: 0.0
152
+ scheduler: exponentiallr
153
+ scheduler_conf:
154
+ gamma: 0.999875
155
+ optim2: adamw
156
+ optim2_conf:
157
+ lr: 0.0001
158
+ betas:
159
+ - 0.8
160
+ - 0.99
161
+ eps: 1.0e-09
162
+ weight_decay: 0.0
163
+ scheduler2: exponentiallr
164
+ scheduler2_conf:
165
+ gamma: 0.999875
166
+ generator_first: false
167
+ token_list:
168
+ - <blank>
169
+ - <unk>
170
+ - '1'
171
+ - '2'
172
+ - '0'
173
+ - '3'
174
+ - '4'
175
+ - '-1'
176
+ - '5'
177
+ - a
178
+ - o
179
+ - '-2'
180
+ - i
181
+ - '-3'
182
+ - u
183
+ - e
184
+ - k
185
+ - n
186
+ - t
187
+ - '6'
188
+ - r
189
+ - '-4'
190
+ - s
191
+ - N
192
+ - m
193
+ - pau
194
+ - '7'
195
+ - sh
196
+ - d
197
+ - g
198
+ - w
199
+ - '8'
200
+ - U
201
+ - '-5'
202
+ - I
203
+ - cl
204
+ - h
205
+ - y
206
+ - b
207
+ - '9'
208
+ - j
209
+ - ts
210
+ - ch
211
+ - '-6'
212
+ - z
213
+ - p
214
+ - '-7'
215
+ - f
216
+ - ky
217
+ - ry
218
+ - '-8'
219
+ - gy
220
+ - '-9'
221
+ - hy
222
+ - ny
223
+ - '-10'
224
+ - by
225
+ - my
226
+ - '-11'
227
+ - '-12'
228
+ - '-13'
229
+ - py
230
+ - '-14'
231
+ - '-15'
232
+ - v
233
+ - '10'
234
+ - '-16'
235
+ - '-17'
236
+ - '11'
237
+ - '-21'
238
+ - '-20'
239
+ - '12'
240
+ - '-19'
241
+ - '13'
242
+ - '-18'
243
+ - '14'
244
+ - dy
245
+ - '15'
246
+ - ty
247
+ - '-22'
248
+ - '16'
249
+ - '18'
250
+ - '19'
251
+ - '17'
252
+ - <sos/eos>
253
+ odim: null
254
+ model_conf: {}
255
+ use_preprocessor: true
256
+ token_type: phn
257
+ bpemodel: null
258
+ non_linguistic_symbols: null
259
+ cleaner: jaconv
260
+ g2p: pyopenjtalk_accent_with_pause
261
+ feats_extract: linear_spectrogram
262
+ feats_extract_conf:
263
+ n_fft: 1024
264
+ hop_length: 256
265
+ win_length: null
266
+ normalize: null
267
+ normalize_conf: {}
268
+ tts: vits
269
+ tts_conf:
270
+ generator_type: vits_generator
271
+ generator_params:
272
+ hidden_channels: 192
273
+ spks: -1
274
+ global_channels: -1
275
+ segment_size: 32
276
+ text_encoder_attention_heads: 2
277
+ text_encoder_ffn_expand: 4
278
+ text_encoder_blocks: 6
279
+ text_encoder_positionwise_layer_type: conv1d
280
+ text_encoder_positionwise_conv_kernel_size: 3
281
+ text_encoder_positional_encoding_layer_type: rel_pos
282
+ text_encoder_self_attention_layer_type: rel_selfattn
283
+ text_encoder_activation_type: swish
284
+ text_encoder_normalize_before: true
285
+ text_encoder_dropout_rate: 0.1
286
+ text_encoder_positional_dropout_rate: 0.0
287
+ text_encoder_attention_dropout_rate: 0.1
288
+ use_macaron_style_in_text_encoder: true
289
+ use_conformer_conv_in_text_encoder: false
290
+ text_encoder_conformer_kernel_size: -1
291
+ decoder_kernel_size: 7
292
+ decoder_channels: 512
293
+ decoder_upsample_scales:
294
+ - 8
295
+ - 8
296
+ - 2
297
+ - 2
298
+ decoder_upsample_kernel_sizes:
299
+ - 16
300
+ - 16
301
+ - 4
302
+ - 4
303
+ decoder_resblock_kernel_sizes:
304
+ - 3
305
+ - 7
306
+ - 11
307
+ decoder_resblock_dilations:
308
+ - - 1
309
+ - 3
310
+ - 5
311
+ - - 1
312
+ - 3
313
+ - 5
314
+ - - 1
315
+ - 3
316
+ - 5
317
+ use_weight_norm_in_decoder: true
318
+ posterior_encoder_kernel_size: 5
319
+ posterior_encoder_layers: 16
320
+ posterior_encoder_stacks: 1
321
+ posterior_encoder_base_dilation: 1
322
+ posterior_encoder_dropout_rate: 0.0
323
+ use_weight_norm_in_posterior_encoder: true
324
+ flow_flows: 4
325
+ flow_kernel_size: 5
326
+ flow_base_dilation: 1
327
+ flow_layers: 4
328
+ flow_dropout_rate: 0.0
329
+ use_weight_norm_in_flow: true
330
+ use_only_mean_in_flow: true
331
+ stochastic_duration_predictor_kernel_size: 3
332
+ stochastic_duration_predictor_dropout_rate: 0.5
333
+ stochastic_duration_predictor_flows: 4
334
+ stochastic_duration_predictor_dds_conv_layers: 3
335
+ vocabs: 85
336
+ aux_channels: 513
337
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
338
+ discriminator_params:
339
+ scales: 1
340
+ scale_downsample_pooling: AvgPool1d
341
+ scale_downsample_pooling_params:
342
+ kernel_size: 4
343
+ stride: 2
344
+ padding: 2
345
+ scale_discriminator_params:
346
+ in_channels: 1
347
+ out_channels: 1
348
+ kernel_sizes:
349
+ - 15
350
+ - 41
351
+ - 5
352
+ - 3
353
+ channels: 128
354
+ max_downsample_channels: 1024
355
+ max_groups: 16
356
+ bias: true
357
+ downsample_scales:
358
+ - 2
359
+ - 2
360
+ - 4
361
+ - 4
362
+ - 1
363
+ nonlinear_activation: LeakyReLU
364
+ nonlinear_activation_params:
365
+ negative_slope: 0.1
366
+ use_weight_norm: true
367
+ use_spectral_norm: false
368
+ follow_official_norm: false
369
+ periods:
370
+ - 2
371
+ - 3
372
+ - 5
373
+ - 7
374
+ - 11
375
+ period_discriminator_params:
376
+ in_channels: 1
377
+ out_channels: 1
378
+ kernel_sizes:
379
+ - 5
380
+ - 3
381
+ channels: 32
382
+ downsample_scales:
383
+ - 3
384
+ - 3
385
+ - 3
386
+ - 3
387
+ - 1
388
+ max_downsample_channels: 1024
389
+ bias: true
390
+ nonlinear_activation: LeakyReLU
391
+ nonlinear_activation_params:
392
+ negative_slope: 0.1
393
+ use_weight_norm: true
394
+ use_spectral_norm: false
395
+ generator_adv_loss_params:
396
+ average_by_discriminators: false
397
+ loss_type: mse
398
+ discriminator_adv_loss_params:
399
+ average_by_discriminators: false
400
+ loss_type: mse
401
+ feat_match_loss_params:
402
+ average_by_discriminators: false
403
+ average_by_layers: false
404
+ include_final_outputs: true
405
+ mel_loss_params:
406
+ fs: 22050
407
+ n_fft: 1024
408
+ hop_length: 256
409
+ win_length: null
410
+ window: hann
411
+ n_mels: 80
412
+ fmin: 0
413
+ fmax: null
414
+ log_base: null
415
+ lambda_adv: 1.0
416
+ lambda_mel: 45.0
417
+ lambda_feat_match: 2.0
418
+ lambda_dur: 1.0
419
+ lambda_kl: 1.0
420
+ sampling_rate: 22050
421
+ cache_generator_outputs: true
422
+ pitch_extract: null
423
+ pitch_extract_conf: {}
424
+ pitch_normalize: null
425
+ pitch_normalize_conf: {}
426
+ energy_extract: null
427
+ energy_extract_conf: {}
428
+ energy_normalize: null
429
+ energy_normalize_conf: {}
430
+ required:
431
+ - output_dir
432
+ - token_list
433
+ version: '202207'
434
+ distributed: false
435
+ ```
436
+
437
+ </details>
438
+
439
+
440
+
441
+ ### Citing ESPnet
442
+
443
+ ```bibtex
444
+ @inproceedings{watanabe2018espnet,
445
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
446
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
447
+ year={2018},
448
+ booktitle={Proceedings of Interspeech},
449
+ pages={2207--2211},
450
+ doi={10.21437/Interspeech.2018-1456},
451
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
452
+ }
453
+
454
+
455
+
456
+
457
+ @inproceedings{hayashi2020espnet,
458
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
459
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
460
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
461
+ pages={7654--7658},
462
+ year={2020},
463
+ organization={IEEE}
464
+ }
465
+ ```
466
+
467
+ or arXiv:
468
+
469
+ ```bibtex
470
+ @misc{watanabe2018espnet,
471
+ title={ESPnet: End-to-End Speech Processing Toolkit},
472
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
473
+ year={2018},
474
+ eprint={1804.00015},
475
+ archivePrefix={arXiv},
476
+ primaryClass={cs.CL}
477
+ }
478
+ ```
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/config.yaml ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/finetune_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_amadeus_vits_finetune_from_jsut_32_sentence
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 2000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - train
38
+ - total_count
39
+ - max
40
+ keep_nbest_models: 3
41
+ nbest_averaging_interval: 0
42
+ grad_clip: -1
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: 50
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: true
55
+ wandb_project: amadeus
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param:
63
+ - downloads/f3698edf589206588f58f5ec837fa516/exp/tts_train_vits_raw_phn_jaconv_pyopenjtalk_accent_with_pause/train.total_count.ave_10best.pth:tts:tts
64
+ ignore_init_mismatch: false
65
+ freeze_param: []
66
+ num_iters_per_epoch: null
67
+ batch_size: 20
68
+ valid_batch_size: null
69
+ batch_bins: 5000000
70
+ valid_batch_bins: null
71
+ train_shape_file:
72
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/text_shape.phn
73
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/speech_shape
74
+ valid_shape_file:
75
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/text_shape.phn
76
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/speech_shape
77
+ batch_type: numel
78
+ valid_batch_type: null
79
+ fold_length:
80
+ - 150
81
+ - 204800
82
+ sort_in_batch: descending
83
+ sort_batch: descending
84
+ multiple_iterator: false
85
+ chunk_length: 500
86
+ chunk_shift_ratio: 0.5
87
+ num_cache_chunks: 1024
88
+ train_data_path_and_name_and_type:
89
+ - - dump/22k/raw/train/text
90
+ - text
91
+ - text
92
+ - - dump/22k/raw/train/wav.scp
93
+ - speech
94
+ - sound
95
+ valid_data_path_and_name_and_type:
96
+ - - dump/22k/raw/dev/text
97
+ - text
98
+ - text
99
+ - - dump/22k/raw/dev/wav.scp
100
+ - speech
101
+ - sound
102
+ allow_variable_data_keys: false
103
+ max_cache_size: 0.0
104
+ max_cache_fd: 32
105
+ valid_max_cache_size: null
106
+ optim: adamw
107
+ optim_conf:
108
+ lr: 0.0001
109
+ betas:
110
+ - 0.8
111
+ - 0.99
112
+ eps: 1.0e-09
113
+ weight_decay: 0.0
114
+ scheduler: exponentiallr
115
+ scheduler_conf:
116
+ gamma: 0.999875
117
+ optim2: adamw
118
+ optim2_conf:
119
+ lr: 0.0001
120
+ betas:
121
+ - 0.8
122
+ - 0.99
123
+ eps: 1.0e-09
124
+ weight_decay: 0.0
125
+ scheduler2: exponentiallr
126
+ scheduler2_conf:
127
+ gamma: 0.999875
128
+ generator_first: false
129
+ token_list:
130
+ - <blank>
131
+ - <unk>
132
+ - '1'
133
+ - '2'
134
+ - '0'
135
+ - '3'
136
+ - '4'
137
+ - '-1'
138
+ - '5'
139
+ - a
140
+ - o
141
+ - '-2'
142
+ - i
143
+ - '-3'
144
+ - u
145
+ - e
146
+ - k
147
+ - n
148
+ - t
149
+ - '6'
150
+ - r
151
+ - '-4'
152
+ - s
153
+ - N
154
+ - m
155
+ - pau
156
+ - '7'
157
+ - sh
158
+ - d
159
+ - g
160
+ - w
161
+ - '8'
162
+ - U
163
+ - '-5'
164
+ - I
165
+ - cl
166
+ - h
167
+ - y
168
+ - b
169
+ - '9'
170
+ - j
171
+ - ts
172
+ - ch
173
+ - '-6'
174
+ - z
175
+ - p
176
+ - '-7'
177
+ - f
178
+ - ky
179
+ - ry
180
+ - '-8'
181
+ - gy
182
+ - '-9'
183
+ - hy
184
+ - ny
185
+ - '-10'
186
+ - by
187
+ - my
188
+ - '-11'
189
+ - '-12'
190
+ - '-13'
191
+ - py
192
+ - '-14'
193
+ - '-15'
194
+ - v
195
+ - '10'
196
+ - '-16'
197
+ - '-17'
198
+ - '11'
199
+ - '-21'
200
+ - '-20'
201
+ - '12'
202
+ - '-19'
203
+ - '13'
204
+ - '-18'
205
+ - '14'
206
+ - dy
207
+ - '15'
208
+ - ty
209
+ - '-22'
210
+ - '16'
211
+ - '18'
212
+ - '19'
213
+ - '17'
214
+ - <sos/eos>
215
+ odim: null
216
+ model_conf: {}
217
+ use_preprocessor: true
218
+ token_type: phn
219
+ bpemodel: null
220
+ non_linguistic_symbols: null
221
+ cleaner: jaconv
222
+ g2p: pyopenjtalk_accent_with_pause
223
+ feats_extract: linear_spectrogram
224
+ feats_extract_conf:
225
+ n_fft: 1024
226
+ hop_length: 256
227
+ win_length: null
228
+ normalize: null
229
+ normalize_conf: {}
230
+ tts: vits
231
+ tts_conf:
232
+ generator_type: vits_generator
233
+ generator_params:
234
+ hidden_channels: 192
235
+ spks: -1
236
+ global_channels: -1
237
+ segment_size: 32
238
+ text_encoder_attention_heads: 2
239
+ text_encoder_ffn_expand: 4
240
+ text_encoder_blocks: 6
241
+ text_encoder_positionwise_layer_type: conv1d
242
+ text_encoder_positionwise_conv_kernel_size: 3
243
+ text_encoder_positional_encoding_layer_type: rel_pos
244
+ text_encoder_self_attention_layer_type: rel_selfattn
245
+ text_encoder_activation_type: swish
246
+ text_encoder_normalize_before: true
247
+ text_encoder_dropout_rate: 0.1
248
+ text_encoder_positional_dropout_rate: 0.0
249
+ text_encoder_attention_dropout_rate: 0.1
250
+ use_macaron_style_in_text_encoder: true
251
+ use_conformer_conv_in_text_encoder: false
252
+ text_encoder_conformer_kernel_size: -1
253
+ decoder_kernel_size: 7
254
+ decoder_channels: 512
255
+ decoder_upsample_scales:
256
+ - 8
257
+ - 8
258
+ - 2
259
+ - 2
260
+ decoder_upsample_kernel_sizes:
261
+ - 16
262
+ - 16
263
+ - 4
264
+ - 4
265
+ decoder_resblock_kernel_sizes:
266
+ - 3
267
+ - 7
268
+ - 11
269
+ decoder_resblock_dilations:
270
+ - - 1
271
+ - 3
272
+ - 5
273
+ - - 1
274
+ - 3
275
+ - 5
276
+ - - 1
277
+ - 3
278
+ - 5
279
+ use_weight_norm_in_decoder: true
280
+ posterior_encoder_kernel_size: 5
281
+ posterior_encoder_layers: 16
282
+ posterior_encoder_stacks: 1
283
+ posterior_encoder_base_dilation: 1
284
+ posterior_encoder_dropout_rate: 0.0
285
+ use_weight_norm_in_posterior_encoder: true
286
+ flow_flows: 4
287
+ flow_kernel_size: 5
288
+ flow_base_dilation: 1
289
+ flow_layers: 4
290
+ flow_dropout_rate: 0.0
291
+ use_weight_norm_in_flow: true
292
+ use_only_mean_in_flow: true
293
+ stochastic_duration_predictor_kernel_size: 3
294
+ stochastic_duration_predictor_dropout_rate: 0.5
295
+ stochastic_duration_predictor_flows: 4
296
+ stochastic_duration_predictor_dds_conv_layers: 3
297
+ vocabs: 85
298
+ aux_channels: 513
299
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
300
+ discriminator_params:
301
+ scales: 1
302
+ scale_downsample_pooling: AvgPool1d
303
+ scale_downsample_pooling_params:
304
+ kernel_size: 4
305
+ stride: 2
306
+ padding: 2
307
+ scale_discriminator_params:
308
+ in_channels: 1
309
+ out_channels: 1
310
+ kernel_sizes:
311
+ - 15
312
+ - 41
313
+ - 5
314
+ - 3
315
+ channels: 128
316
+ max_downsample_channels: 1024
317
+ max_groups: 16
318
+ bias: true
319
+ downsample_scales:
320
+ - 2
321
+ - 2
322
+ - 4
323
+ - 4
324
+ - 1
325
+ nonlinear_activation: LeakyReLU
326
+ nonlinear_activation_params:
327
+ negative_slope: 0.1
328
+ use_weight_norm: true
329
+ use_spectral_norm: false
330
+ follow_official_norm: false
331
+ periods:
332
+ - 2
333
+ - 3
334
+ - 5
335
+ - 7
336
+ - 11
337
+ period_discriminator_params:
338
+ in_channels: 1
339
+ out_channels: 1
340
+ kernel_sizes:
341
+ - 5
342
+ - 3
343
+ channels: 32
344
+ downsample_scales:
345
+ - 3
346
+ - 3
347
+ - 3
348
+ - 3
349
+ - 1
350
+ max_downsample_channels: 1024
351
+ bias: true
352
+ nonlinear_activation: LeakyReLU
353
+ nonlinear_activation_params:
354
+ negative_slope: 0.1
355
+ use_weight_norm: true
356
+ use_spectral_norm: false
357
+ generator_adv_loss_params:
358
+ average_by_discriminators: false
359
+ loss_type: mse
360
+ discriminator_adv_loss_params:
361
+ average_by_discriminators: false
362
+ loss_type: mse
363
+ feat_match_loss_params:
364
+ average_by_discriminators: false
365
+ average_by_layers: false
366
+ include_final_outputs: true
367
+ mel_loss_params:
368
+ fs: 22050
369
+ n_fft: 1024
370
+ hop_length: 256
371
+ win_length: null
372
+ window: hann
373
+ n_mels: 80
374
+ fmin: 0
375
+ fmax: null
376
+ log_base: null
377
+ lambda_adv: 1.0
378
+ lambda_mel: 45.0
379
+ lambda_feat_match: 2.0
380
+ lambda_dur: 1.0
381
+ lambda_kl: 1.0
382
+ sampling_rate: 22050
383
+ cache_generator_outputs: true
384
+ pitch_extract: null
385
+ pitch_extract_conf: {}
386
+ pitch_normalize: null
387
+ pitch_normalize_conf: {}
388
+ energy_extract: null
389
+ energy_extract_conf: {}
390
+ energy_normalize: null
391
+ energy_normalize_conf: {}
392
+ required:
393
+ - output_dir
394
+ - token_list
395
+ version: '202207'
396
+ distributed: false
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_backward_time.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_fake_loss.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_forward_time.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_loss.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_optim_step_time.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_real_loss.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/discriminator_train_time.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_adv_loss.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_backward_time.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_dur_loss.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_feat_match_loss.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_forward_time.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_kl_loss.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_loss.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_mel_loss.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_optim_step_time.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/generator_train_time.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/iter_time.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/optim0_lr0.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/optim1_lr0.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/images/train_time.png ADDED
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/train.total_count.ave_3best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47f585212ab86e97fd712b3cd0e8f9cf4ff75a37ecd3f01343b4db7022873c25
3
+ size 372564559
exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:943c319e34197e5c4b875da0c6ef76872fac941532c8341f81440a6cc2050f78
3
+ size 4866
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202207'
2
+ files:
3
+ model_file: exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/train.total_count.ave_3best.pth
4
+ python: "3.8.13 (default, Mar 28 2022, 11:38:47) \n[GCC 7.5.0]"
5
+ timestamp: 1662182871.367335
6
+ torch: 1.8.1
7
+ yaml_files:
8
+ train_config: exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/config.yaml