ESPnet
Chinese
audio
singing-voice-synthesis
ftshijt committed on
Commit
fa28f04
·
1 Parent(s): f066c15

Update model

Browse files
Files changed (30) hide show
  1. README.md +549 -1
  2. exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz +3 -0
  3. exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz +3 -0
  4. exp/svs_visinger_normal/500epoch.pth +3 -0
  5. exp/svs_visinger_normal/config.yaml +468 -0
  6. exp/svs_visinger_normal/images/discriminator_backward_time.png +0 -0
  7. exp/svs_visinger_normal/images/discriminator_fake_loss.png +0 -0
  8. exp/svs_visinger_normal/images/discriminator_forward_time.png +0 -0
  9. exp/svs_visinger_normal/images/discriminator_loss.png +0 -0
  10. exp/svs_visinger_normal/images/discriminator_optim_step_time.png +0 -0
  11. exp/svs_visinger_normal/images/discriminator_real_loss.png +0 -0
  12. exp/svs_visinger_normal/images/discriminator_train_time.png +0 -0
  13. exp/svs_visinger_normal/images/generator_adv_loss.png +0 -0
  14. exp/svs_visinger_normal/images/generator_backward_time.png +0 -0
  15. exp/svs_visinger_normal/images/generator_feat_match_loss.png +0 -0
  16. exp/svs_visinger_normal/images/generator_forward_time.png +0 -0
  17. exp/svs_visinger_normal/images/generator_kl_loss.png +0 -0
  18. exp/svs_visinger_normal/images/generator_loss.png +0 -0
  19. exp/svs_visinger_normal/images/generator_mel_loss.png +0 -0
  20. exp/svs_visinger_normal/images/generator_optim_step_time.png +0 -0
  21. exp/svs_visinger_normal/images/generator_phn_dur_loss.png +0 -0
  22. exp/svs_visinger_normal/images/generator_pitch_loss.png +0 -0
  23. exp/svs_visinger_normal/images/generator_score_dur_loss.png +0 -0
  24. exp/svs_visinger_normal/images/generator_train_time.png +0 -0
  25. exp/svs_visinger_normal/images/gpu_max_cached_mem_GB.png +0 -0
  26. exp/svs_visinger_normal/images/iter_time.png +0 -0
  27. exp/svs_visinger_normal/images/optim0_lr0.png +0 -0
  28. exp/svs_visinger_normal/images/optim1_lr0.png +0 -0
  29. exp/svs_visinger_normal/images/train_time.png +0 -0
  30. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,551 @@
1
  ---
2
- license: apache-2.0
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - singing-voice-synthesis
6
+ language: zh
7
+ datasets:
8
+ - opencpop
9
+ license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 SVS model
13
+
14
+ ### `espnet/opencpop_visinger`
15
+
16
+ This model was trained by ftshijt using the opencpop recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 5c4d7cf7feba8461de2e1080bf82182f0efaef38
26
+ pip install -e .
27
+ cd egs2/opencpop/svs1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/opencpop_visinger
29
+ ```
30
+
31
+
32
+
33
+ ## SVS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/transfer_visinger.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: sequence
44
+ valid_iterator_type: null
45
+ output_dir: exp/svs_visinger_normal
46
+ ngpu: 1
47
+ seed: 777
48
+ num_workers: 0
49
+ num_att_plot: 3
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: null
53
+ dist_rank: null
54
+ local_rank: 0
55
+ dist_master_addr: null
56
+ dist_master_port: null
57
+ dist_launcher: null
58
+ multiprocessing_distributed: false
59
+ unused_parameters: true
60
+ sharded_ddp: false
61
+ cudnn_enabled: true
62
+ cudnn_benchmark: false
63
+ cudnn_deterministic: false
64
+ collect_stats: false
65
+ write_collected_feats: false
66
+ max_epoch: 500
67
+ patience: null
68
+ val_scheduler_criterion:
69
+ - valid
70
+ - loss
71
+ early_stopping_criterion:
72
+ - valid
73
+ - loss
74
+ - min
75
+ best_model_criterion:
76
+ - - train
77
+ - total_count
78
+ - max
79
+ keep_nbest_models: 10
80
+ nbest_averaging_interval: 0
81
+ grad_clip: -1
82
+ grad_clip_type: 2.0
83
+ grad_noise: false
84
+ accum_grad: 1
85
+ no_forward_run: false
86
+ resume: true
87
+ train_dtype: float32
88
+ use_amp: false
89
+ log_interval: 50
90
+ use_matplotlib: true
91
+ use_tensorboard: true
92
+ create_graph_in_tensorboard: false
93
+ use_wandb: false
94
+ wandb_project: null
95
+ wandb_id: null
96
+ wandb_entity: null
97
+ wandb_name: null
98
+ wandb_model_log_interval: -1
99
+ detect_anomaly: false
100
+ use_lora: false
101
+ save_lora_only: true
102
+ lora_conf: {}
103
+ pretrain_path: null
104
+ init_param: []
105
+ ignore_init_mismatch: false
106
+ freeze_param: []
107
+ num_iters_per_epoch: 1000
108
+ batch_size: 8
109
+ valid_batch_size: null
110
+ batch_bins: 1000000
111
+ valid_batch_bins: null
112
+ train_shape_file:
113
+ - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn
114
+ - exp/svs_stats_raw_phn_None_zh/train/singing_shape
115
+ valid_shape_file:
116
+ - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn
117
+ - exp/svs_stats_raw_phn_None_zh/valid/singing_shape
118
+ batch_type: sorted
119
+ valid_batch_type: null
120
+ fold_length:
121
+ - 150
122
+ - 409600
123
+ sort_in_batch: descending
124
+ shuffle_within_batch: false
125
+ sort_batch: descending
126
+ multiple_iterator: false
127
+ chunk_length: 500
128
+ chunk_shift_ratio: 0.5
129
+ num_cache_chunks: 1024
130
+ chunk_excluded_key_prefixes: []
131
+ chunk_default_fs: null
132
+ train_data_path_and_name_and_type:
133
+ - - dump/raw/tr_no_dev/text
134
+ - text
135
+ - text
136
+ - - dump/raw/tr_no_dev/wav.scp
137
+ - singing
138
+ - sound
139
+ - - dump/raw/tr_no_dev/label
140
+ - label
141
+ - duration
142
+ - - dump/raw/tr_no_dev/score.scp
143
+ - score
144
+ - score
145
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/pitch.scp
146
+ - pitch
147
+ - npy
148
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/feats.scp
149
+ - feats
150
+ - npy
151
+ valid_data_path_and_name_and_type:
152
+ - - dump/raw/dev/text
153
+ - text
154
+ - text
155
+ - - dump/raw/dev/wav.scp
156
+ - singing
157
+ - sound
158
+ - - dump/raw/dev/label
159
+ - label
160
+ - duration
161
+ - - dump/raw/dev/score.scp
162
+ - score
163
+ - score
164
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/pitch.scp
165
+ - pitch
166
+ - npy
167
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/feats.scp
168
+ - feats
169
+ - npy
170
+ allow_variable_data_keys: false
171
+ max_cache_size: 0.0
172
+ max_cache_fd: 32
173
+ allow_multi_rates: false
174
+ valid_max_cache_size: null
175
+ exclude_weight_decay: false
176
+ exclude_weight_decay_conf: {}
177
+ optim: adamw
178
+ optim_conf:
179
+ lr: 0.0002
180
+ betas:
181
+ - 0.8
182
+ - 0.99
183
+ eps: 1.0e-09
184
+ weight_decay: 0.0
185
+ scheduler: exponentiallr
186
+ scheduler_conf:
187
+ gamma: 0.998
188
+ optim2: adamw
189
+ optim2_conf:
190
+ lr: 0.0002
191
+ betas:
192
+ - 0.8
193
+ - 0.99
194
+ eps: 1.0e-09
195
+ weight_decay: 0.0
196
+ scheduler2: exponentiallr
197
+ scheduler2_conf:
198
+ gamma: 0.998
199
+ generator_first: true
200
+ token_list:
201
+ - <blank>
202
+ - <unk>
203
+ - SP
204
+ - i
205
+ - AP
206
+ - e
207
+ - y
208
+ - d
209
+ - w
210
+ - sh
211
+ - ai
212
+ - n
213
+ - x
214
+ - j
215
+ - ian
216
+ - u
217
+ - l
218
+ - h
219
+ - b
220
+ - o
221
+ - zh
222
+ - an
223
+ - ou
224
+ - m
225
+ - q
226
+ - z
227
+ - en
228
+ - g
229
+ - ing
230
+ - ei
231
+ - ao
232
+ - ang
233
+ - uo
234
+ - eng
235
+ - t
236
+ - a
237
+ - ong
238
+ - ui
239
+ - k
240
+ - f
241
+ - r
242
+ - iang
243
+ - ch
244
+ - v
245
+ - in
246
+ - iao
247
+ - ie
248
+ - iu
249
+ - c
250
+ - s
251
+ - van
252
+ - p
253
+ - ve
254
+ - uan
255
+ - uang
256
+ - ia
257
+ - ua
258
+ - uai
259
+ - un
260
+ - er
261
+ - vn
262
+ - iong
263
+ - <sos/eos>
264
+ odim: null
265
+ model_conf: {}
266
+ use_preprocessor: true
267
+ token_type: phn
268
+ bpemodel: null
269
+ non_linguistic_symbols: null
270
+ cleaner: null
271
+ g2p: null
272
+ fs: 44100
273
+ score_feats_extract: syllable_score_feats
274
+ score_feats_extract_conf:
275
+ fs: 44100
276
+ n_fft: 2048
277
+ win_length: 2048
278
+ hop_length: 512
279
+ feats_extract: fbank
280
+ feats_extract_conf:
281
+ n_fft: 2048
282
+ hop_length: 512
283
+ win_length: 2048
284
+ fs: 44100
285
+ fmin: 0
286
+ fmax: 22050
287
+ n_mels: 80
288
+ normalize: global_mvn
289
+ normalize_conf:
290
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz
291
+ svs: vits
292
+ svs_conf:
293
+ generator_type: visinger
294
+ vocoder_generator_type: hifigan
295
+ generator_params:
296
+ hidden_channels: 192
297
+ spks: -1
298
+ global_channels: 256
299
+ segment_size: 20
300
+ text_encoder_attention_heads: 2
301
+ text_encoder_ffn_expand: 4
302
+ text_encoder_blocks: 6
303
+ text_encoder_positionwise_layer_type: conv1d
304
+ text_encoder_positionwise_conv_kernel_size: 3
305
+ text_encoder_positional_encoding_layer_type: rel_pos
306
+ text_encoder_self_attention_layer_type: rel_selfattn
307
+ text_encoder_activation_type: swish
308
+ text_encoder_normalize_before: true
309
+ text_encoder_dropout_rate: 0.1
310
+ text_encoder_positional_dropout_rate: 0.0
311
+ text_encoder_attention_dropout_rate: 0.1
312
+ use_macaron_style_in_text_encoder: true
313
+ use_conformer_conv_in_text_encoder: false
314
+ text_encoder_conformer_kernel_size: -1
315
+ decoder_kernel_size: 7
316
+ decoder_channels: 512
317
+ decoder_upsample_scales:
318
+ - 8
319
+ - 8
320
+ - 4
321
+ - 2
322
+ decoder_upsample_kernel_sizes:
323
+ - 16
324
+ - 16
325
+ - 8
326
+ - 4
327
+ decoder_resblock_kernel_sizes:
328
+ - 3
329
+ - 7
330
+ - 11
331
+ decoder_resblock_dilations:
332
+ - - 1
333
+ - 3
334
+ - 5
335
+ - - 1
336
+ - 3
337
+ - 5
338
+ - - 1
339
+ - 3
340
+ - 5
341
+ use_weight_norm_in_decoder: true
342
+ posterior_encoder_kernel_size: 3
343
+ posterior_encoder_layers: 8
344
+ posterior_encoder_stacks: 1
345
+ posterior_encoder_base_dilation: 1
346
+ posterior_encoder_dropout_rate: 0.0
347
+ use_weight_norm_in_posterior_encoder: true
348
+ flow_flows: -1
349
+ flow_kernel_size: 5
350
+ flow_base_dilation: 1
351
+ flow_layers: 4
352
+ flow_dropout_rate: 0.0
353
+ use_weight_norm_in_flow: true
354
+ use_only_mean_in_flow: true
355
+ use_phoneme_predictor: false
356
+ vocabs: 63
357
+ aux_channels: 80
358
+ generator_type: visinger
359
+ vocoder_generator_type: hifigan
360
+ fs: 44100
361
+ hop_length: 512
362
+ win_length: 2048
363
+ n_fft: 2048
364
+ discriminator_type: visinger2
365
+ discriminator_params:
366
+ scales: 1
367
+ scale_downsample_pooling: AvgPool1d
368
+ scale_downsample_pooling_params:
369
+ kernel_size: 4
370
+ stride: 2
371
+ padding: 2
372
+ scale_discriminator_params:
373
+ in_channels: 1
374
+ out_channels: 1
375
+ kernel_sizes:
376
+ - 15
377
+ - 41
378
+ - 5
379
+ - 3
380
+ channels: 128
381
+ max_downsample_channels: 1024
382
+ max_groups: 256
383
+ bias: true
384
+ downsample_scales:
385
+ - 4
386
+ - 4
387
+ - 4
388
+ - 4
389
+ nonlinear_activation: LeakyReLU
390
+ nonlinear_activation_params:
391
+ negative_slope: 0.1
392
+ use_weight_norm: true
393
+ use_spectral_norm: false
394
+ follow_official_norm: false
395
+ periods:
396
+ - 2
397
+ - 3
398
+ - 5
399
+ - 7
400
+ - 11
401
+ period_discriminator_params:
402
+ in_channels: 1
403
+ out_channels: 1
404
+ kernel_sizes:
405
+ - 5
406
+ - 3
407
+ channels: 32
408
+ downsample_scales:
409
+ - 3
410
+ - 3
411
+ - 3
412
+ - 3
413
+ - 1
414
+ max_downsample_channels: 1024
415
+ bias: true
416
+ nonlinear_activation: LeakyReLU
417
+ nonlinear_activation_params:
418
+ negative_slope: 0.1
419
+ use_weight_norm: true
420
+ use_spectral_norm: false
421
+ multi_freq_disc_params:
422
+ hidden_channels:
423
+ - 256
424
+ - 256
425
+ - 256
426
+ - 256
427
+ - 256
428
+ domain: double
429
+ mel_scale: true
430
+ divisors:
431
+ - 32
432
+ - 16
433
+ - 8
434
+ - 4
435
+ - 2
436
+ - 1
437
+ - 1
438
+ strides:
439
+ - 1
440
+ - 2
441
+ - 1
442
+ - 2
443
+ - 1
444
+ - 2
445
+ - 1
446
+ sample_rate: 44100
447
+ hop_lengths:
448
+ - 110
449
+ - 220
450
+ - 330
451
+ - 441
452
+ - 551
453
+ - 661
454
+ generator_adv_loss_params:
455
+ average_by_discriminators: false
456
+ loss_type: mse
457
+ discriminator_adv_loss_params:
458
+ average_by_discriminators: false
459
+ loss_type: mse
460
+ feat_match_loss_params:
461
+ average_by_discriminators: false
462
+ average_by_layers: false
463
+ include_final_outputs: true
464
+ mel_loss_params:
465
+ fs: 44100
466
+ n_fft: 2048
467
+ hop_length: 512
468
+ win_length: 2048
469
+ window: hann
470
+ n_mels: 80
471
+ fmin: 0
472
+ fmax: 22050
473
+ log_base: null
474
+ lambda_adv: 1.0
475
+ lambda_mel: 45.0
476
+ lambda_feat_match: 2.0
477
+ lambda_dur: 0.1
478
+ lambda_pitch: 10.0
479
+ lambda_phoneme: 1.0
480
+ lambda_kl: 1.0
481
+ sampling_rate: 44100
482
+ cache_generator_outputs: true
483
+ pitch_extract: dio
484
+ pitch_extract_conf:
485
+ use_token_averaged_f0: false
486
+ use_log_f0: false
487
+ fs: 44100
488
+ n_fft: 2048
489
+ hop_length: 512
490
+ f0max: 800
491
+ f0min: 80
492
+ pitch_normalize: null
493
+ pitch_normalize_conf:
494
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz
495
+ ying_extract: null
496
+ ying_extract_conf: {}
497
+ energy_extract: null
498
+ energy_extract_conf: {}
499
+ energy_normalize: null
500
+ energy_normalize_conf: {}
501
+ required:
502
+ - output_dir
503
+ - token_list
504
+ version: '202310'
505
+ distributed: false
506
+ ```
507
+
508
+ </details>
509
+
510
+
511
+
512
+ ### Citing ESPnet
513
+
514
+ ```bibtex
515
+ @inproceedings{watanabe2018espnet,
516
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
517
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
518
+ year={2018},
519
+ booktitle={Proceedings of Interspeech},
520
+ pages={2207--2211},
521
+ doi={10.21437/Interspeech.2018-1456},
522
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
523
+ }
524
+
525
+
526
+
527
+
528
+
529
+
530
+ @inproceedings{shi22d_interspeech,
531
+ author={Jiatong Shi and Shuai Guo and Tao Qian and Tomoki Hayashi and Yuning Wu and Fangzheng Xu and Xuankai Chang and Huazhe Li and Peter Wu and Shinji Watanabe and Qin Jin},
532
+ title={{Muskits: an End-to-end Music Processing Toolkit for Singing Voice Synthesis}},
533
+ year=2022,
534
+ booktitle={Proc. Interspeech 2022},
535
+ pages={4277--4281},
536
+ doi={10.21437/Interspeech.2022-10039}
537
+ }
538
+ ```
539
+
540
+ or arXiv:
541
+
542
+ ```bibtex
543
+ @misc{watanabe2018espnet,
544
+ title={ESPnet: End-to-End Speech Processing Toolkit},
545
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
546
+ year={2018},
547
+ eprint={1804.00015},
548
+ archivePrefix={arXiv},
549
+ primaryClass={cs.CL}
550
+ }
551
+ ```
exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23f6d25618b7284ac21e4e958c0eefb8a323535273a53892077dc1d1a58e6fdd
3
+ size 1402
exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7aa72becc4e249311822747d67725f972281198f0368361a8d6528e9d308cac
3
+ size 770
exp/svs_visinger_normal/500epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1177d882c10c3509917b4eb40c3d031443b0a137d2af9b280906ed38b3a0e2a
3
+ size 430950107
exp/svs_visinger_normal/config.yaml ADDED
@@ -0,0 +1,468 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/transfer_visinger.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/svs_visinger_normal
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 0
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: false
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 500
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - train
40
+ - total_count
41
+ - max
42
+ keep_nbest_models: 10
43
+ nbest_averaging_interval: 0
44
+ grad_clip: -1
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: 50
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ use_lora: false
64
+ save_lora_only: true
65
+ lora_conf: {}
66
+ pretrain_path: null
67
+ init_param: []
68
+ ignore_init_mismatch: false
69
+ freeze_param: []
70
+ num_iters_per_epoch: 1000
71
+ batch_size: 8
72
+ valid_batch_size: null
73
+ batch_bins: 1000000
74
+ valid_batch_bins: null
75
+ train_shape_file:
76
+ - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn
77
+ - exp/svs_stats_raw_phn_None_zh/train/singing_shape
78
+ valid_shape_file:
79
+ - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn
80
+ - exp/svs_stats_raw_phn_None_zh/valid/singing_shape
81
+ batch_type: sorted
82
+ valid_batch_type: null
83
+ fold_length:
84
+ - 150
85
+ - 409600
86
+ sort_in_batch: descending
87
+ shuffle_within_batch: false
88
+ sort_batch: descending
89
+ multiple_iterator: false
90
+ chunk_length: 500
91
+ chunk_shift_ratio: 0.5
92
+ num_cache_chunks: 1024
93
+ chunk_excluded_key_prefixes: []
94
+ chunk_default_fs: null
95
+ train_data_path_and_name_and_type:
96
+ - - dump/raw/tr_no_dev/text
97
+ - text
98
+ - text
99
+ - - dump/raw/tr_no_dev/wav.scp
100
+ - singing
101
+ - sound
102
+ - - dump/raw/tr_no_dev/label
103
+ - label
104
+ - duration
105
+ - - dump/raw/tr_no_dev/score.scp
106
+ - score
107
+ - score
108
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/pitch.scp
109
+ - pitch
110
+ - npy
111
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/feats.scp
112
+ - feats
113
+ - npy
114
+ valid_data_path_and_name_and_type:
115
+ - - dump/raw/dev/text
116
+ - text
117
+ - text
118
+ - - dump/raw/dev/wav.scp
119
+ - singing
120
+ - sound
121
+ - - dump/raw/dev/label
122
+ - label
123
+ - duration
124
+ - - dump/raw/dev/score.scp
125
+ - score
126
+ - score
127
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/pitch.scp
128
+ - pitch
129
+ - npy
130
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/feats.scp
131
+ - feats
132
+ - npy
133
+ allow_variable_data_keys: false
134
+ max_cache_size: 0.0
135
+ max_cache_fd: 32
136
+ allow_multi_rates: false
137
+ valid_max_cache_size: null
138
+ exclude_weight_decay: false
139
+ exclude_weight_decay_conf: {}
140
+ optim: adamw
141
+ optim_conf:
142
+ lr: 0.0002
143
+ betas:
144
+ - 0.8
145
+ - 0.99
146
+ eps: 1.0e-09
147
+ weight_decay: 0.0
148
+ scheduler: exponentiallr
149
+ scheduler_conf:
150
+ gamma: 0.998
151
+ optim2: adamw
152
+ optim2_conf:
153
+ lr: 0.0002
154
+ betas:
155
+ - 0.8
156
+ - 0.99
157
+ eps: 1.0e-09
158
+ weight_decay: 0.0
159
+ scheduler2: exponentiallr
160
+ scheduler2_conf:
161
+ gamma: 0.998
162
+ generator_first: true
163
+ token_list:
164
+ - <blank>
165
+ - <unk>
166
+ - SP
167
+ - i
168
+ - AP
169
+ - e
170
+ - y
171
+ - d
172
+ - w
173
+ - sh
174
+ - ai
175
+ - n
176
+ - x
177
+ - j
178
+ - ian
179
+ - u
180
+ - l
181
+ - h
182
+ - b
183
+ - o
184
+ - zh
185
+ - an
186
+ - ou
187
+ - m
188
+ - q
189
+ - z
190
+ - en
191
+ - g
192
+ - ing
193
+ - ei
194
+ - ao
195
+ - ang
196
+ - uo
197
+ - eng
198
+ - t
199
+ - a
200
+ - ong
201
+ - ui
202
+ - k
203
+ - f
204
+ - r
205
+ - iang
206
+ - ch
207
+ - v
208
+ - in
209
+ - iao
210
+ - ie
211
+ - iu
212
+ - c
213
+ - s
214
+ - van
215
+ - p
216
+ - ve
217
+ - uan
218
+ - uang
219
+ - ia
220
+ - ua
221
+ - uai
222
+ - un
223
+ - er
224
+ - vn
225
+ - iong
226
+ - <sos/eos>
227
+ odim: null
228
+ model_conf: {}
229
+ use_preprocessor: true
230
+ token_type: phn
231
+ bpemodel: null
232
+ non_linguistic_symbols: null
233
+ cleaner: null
234
+ g2p: null
235
+ fs: 44100
236
+ score_feats_extract: syllable_score_feats
237
+ score_feats_extract_conf:
238
+ fs: 44100
239
+ n_fft: 2048
240
+ win_length: 2048
241
+ hop_length: 512
242
+ feats_extract: fbank
243
+ feats_extract_conf:
244
+ n_fft: 2048
245
+ hop_length: 512
246
+ win_length: 2048
247
+ fs: 44100
248
+ fmin: 0
249
+ fmax: 22050
250
+ n_mels: 80
251
+ normalize: global_mvn
252
+ normalize_conf:
253
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz
254
+ svs: vits
255
+ svs_conf:
256
+ generator_type: visinger
257
+ vocoder_generator_type: hifigan
258
+ generator_params:
259
+ hidden_channels: 192
260
+ spks: -1
261
+ global_channels: 256
262
+ segment_size: 20
263
+ text_encoder_attention_heads: 2
264
+ text_encoder_ffn_expand: 4
265
+ text_encoder_blocks: 6
266
+ text_encoder_positionwise_layer_type: conv1d
267
+ text_encoder_positionwise_conv_kernel_size: 3
268
+ text_encoder_positional_encoding_layer_type: rel_pos
269
+ text_encoder_self_attention_layer_type: rel_selfattn
270
+ text_encoder_activation_type: swish
271
+ text_encoder_normalize_before: true
272
+ text_encoder_dropout_rate: 0.1
273
+ text_encoder_positional_dropout_rate: 0.0
274
+ text_encoder_attention_dropout_rate: 0.1
275
+ use_macaron_style_in_text_encoder: true
276
+ use_conformer_conv_in_text_encoder: false
277
+ text_encoder_conformer_kernel_size: -1
278
+ decoder_kernel_size: 7
279
+ decoder_channels: 512
280
+ decoder_upsample_scales:
281
+ - 8
282
+ - 8
283
+ - 4
284
+ - 2
285
+ decoder_upsample_kernel_sizes:
286
+ - 16
287
+ - 16
288
+ - 8
289
+ - 4
290
+ decoder_resblock_kernel_sizes:
291
+ - 3
292
+ - 7
293
+ - 11
294
+ decoder_resblock_dilations:
295
+ - - 1
296
+ - 3
297
+ - 5
298
+ - - 1
299
+ - 3
300
+ - 5
301
+ - - 1
302
+ - 3
303
+ - 5
304
+ use_weight_norm_in_decoder: true
305
+ posterior_encoder_kernel_size: 3
306
+ posterior_encoder_layers: 8
307
+ posterior_encoder_stacks: 1
308
+ posterior_encoder_base_dilation: 1
309
+ posterior_encoder_dropout_rate: 0.0
310
+ use_weight_norm_in_posterior_encoder: true
311
+ flow_flows: -1
312
+ flow_kernel_size: 5
313
+ flow_base_dilation: 1
314
+ flow_layers: 4
315
+ flow_dropout_rate: 0.0
316
+ use_weight_norm_in_flow: true
317
+ use_only_mean_in_flow: true
318
+ use_phoneme_predictor: false
319
+ vocabs: 63
320
+ aux_channels: 80
321
+ generator_type: visinger
322
+ vocoder_generator_type: hifigan
323
+ fs: 44100
324
+ hop_length: 512
325
+ win_length: 2048
326
+ n_fft: 2048
327
+ discriminator_type: visinger2
328
+ discriminator_params:
329
+ scales: 1
330
+ scale_downsample_pooling: AvgPool1d
331
+ scale_downsample_pooling_params:
332
+ kernel_size: 4
333
+ stride: 2
334
+ padding: 2
335
+ scale_discriminator_params:
336
+ in_channels: 1
337
+ out_channels: 1
338
+ kernel_sizes:
339
+ - 15
340
+ - 41
341
+ - 5
342
+ - 3
343
+ channels: 128
344
+ max_downsample_channels: 1024
345
+ max_groups: 256
346
+ bias: true
347
+ downsample_scales:
348
+ - 4
349
+ - 4
350
+ - 4
351
+ - 4
352
+ nonlinear_activation: LeakyReLU
353
+ nonlinear_activation_params:
354
+ negative_slope: 0.1
355
+ use_weight_norm: true
356
+ use_spectral_norm: false
357
+ follow_official_norm: false
358
+ periods:
359
+ - 2
360
+ - 3
361
+ - 5
362
+ - 7
363
+ - 11
364
+ period_discriminator_params:
365
+ in_channels: 1
366
+ out_channels: 1
367
+ kernel_sizes:
368
+ - 5
369
+ - 3
370
+ channels: 32
371
+ downsample_scales:
372
+ - 3
373
+ - 3
374
+ - 3
375
+ - 3
376
+ - 1
377
+ max_downsample_channels: 1024
378
+ bias: true
379
+ nonlinear_activation: LeakyReLU
380
+ nonlinear_activation_params:
381
+ negative_slope: 0.1
382
+ use_weight_norm: true
383
+ use_spectral_norm: false
384
+ multi_freq_disc_params:
385
+ hidden_channels:
386
+ - 256
387
+ - 256
388
+ - 256
389
+ - 256
390
+ - 256
391
+ domain: double
392
+ mel_scale: true
393
+ divisors:
394
+ - 32
395
+ - 16
396
+ - 8
397
+ - 4
398
+ - 2
399
+ - 1
400
+ - 1
401
+ strides:
402
+ - 1
403
+ - 2
404
+ - 1
405
+ - 2
406
+ - 1
407
+ - 2
408
+ - 1
409
+ sample_rate: 44100
410
+ hop_lengths:
411
+ - 110
412
+ - 220
413
+ - 330
414
+ - 441
415
+ - 551
416
+ - 661
417
+ generator_adv_loss_params:
418
+ average_by_discriminators: false
419
+ loss_type: mse
420
+ discriminator_adv_loss_params:
421
+ average_by_discriminators: false
422
+ loss_type: mse
423
+ feat_match_loss_params:
424
+ average_by_discriminators: false
425
+ average_by_layers: false
426
+ include_final_outputs: true
427
+ mel_loss_params:
428
+ fs: 44100
429
+ n_fft: 2048
430
+ hop_length: 512
431
+ win_length: 2048
432
+ window: hann
433
+ n_mels: 80
434
+ fmin: 0
435
+ fmax: 22050
436
+ log_base: null
437
+ lambda_adv: 1.0
438
+ lambda_mel: 45.0
439
+ lambda_feat_match: 2.0
440
+ lambda_dur: 0.1
441
+ lambda_pitch: 10.0
442
+ lambda_phoneme: 1.0
443
+ lambda_kl: 1.0
444
+ sampling_rate: 44100
445
+ cache_generator_outputs: true
446
+ pitch_extract: dio
447
+ pitch_extract_conf:
448
+ use_token_averaged_f0: false
449
+ use_log_f0: false
450
+ fs: 44100
451
+ n_fft: 2048
452
+ hop_length: 512
453
+ f0max: 800
454
+ f0min: 80
455
+ pitch_normalize: null
456
+ pitch_normalize_conf:
457
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz
458
+ ying_extract: null
459
+ ying_extract_conf: {}
460
+ energy_extract: null
461
+ energy_extract_conf: {}
462
+ energy_normalize: null
463
+ energy_normalize_conf: {}
464
+ required:
465
+ - output_dir
466
+ - token_list
467
+ version: '202310'
468
+ distributed: false
exp/svs_visinger_normal/images/discriminator_backward_time.png ADDED
exp/svs_visinger_normal/images/discriminator_fake_loss.png ADDED
exp/svs_visinger_normal/images/discriminator_forward_time.png ADDED
exp/svs_visinger_normal/images/discriminator_loss.png ADDED
exp/svs_visinger_normal/images/discriminator_optim_step_time.png ADDED
exp/svs_visinger_normal/images/discriminator_real_loss.png ADDED
exp/svs_visinger_normal/images/discriminator_train_time.png ADDED
exp/svs_visinger_normal/images/generator_adv_loss.png ADDED
exp/svs_visinger_normal/images/generator_backward_time.png ADDED
exp/svs_visinger_normal/images/generator_feat_match_loss.png ADDED
exp/svs_visinger_normal/images/generator_forward_time.png ADDED
exp/svs_visinger_normal/images/generator_kl_loss.png ADDED
exp/svs_visinger_normal/images/generator_loss.png ADDED
exp/svs_visinger_normal/images/generator_mel_loss.png ADDED
exp/svs_visinger_normal/images/generator_optim_step_time.png ADDED
exp/svs_visinger_normal/images/generator_phn_dur_loss.png ADDED
exp/svs_visinger_normal/images/generator_pitch_loss.png ADDED
exp/svs_visinger_normal/images/generator_score_dur_loss.png ADDED
exp/svs_visinger_normal/images/generator_train_time.png ADDED
exp/svs_visinger_normal/images/gpu_max_cached_mem_GB.png ADDED
exp/svs_visinger_normal/images/iter_time.png ADDED
exp/svs_visinger_normal/images/optim0_lr0.png ADDED
exp/svs_visinger_normal/images/optim1_lr0.png ADDED
exp/svs_visinger_normal/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202310'
2
+ files:
3
+ model_file: exp/svs_visinger_normal/500epoch.pth
4
+ python: "3.9.16 (main, Mar 8 2023, 14:00:05) \n[GCC 11.2.0]"
5
+ timestamp: 1701751078.94085
6
+ torch: 1.13.1+cu117
7
+ yaml_files:
8
+ train_config: exp/svs_visinger_normal/config.yaml