ftshijt committed on
Commit
bc22771
·
1 Parent(s): ce1cea9

Update model

Browse files
Files changed (30) hide show
  1. README.md +550 -1
  2. exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz +3 -0
  3. exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz +3 -0
  4. exp/svs_visinger_transfer/500epoch.pth +3 -0
  5. exp/svs_visinger_transfer/config.yaml +469 -0
  6. exp/svs_visinger_transfer/images/discriminator_backward_time.png +0 -0
  7. exp/svs_visinger_transfer/images/discriminator_fake_loss.png +0 -0
  8. exp/svs_visinger_transfer/images/discriminator_forward_time.png +0 -0
  9. exp/svs_visinger_transfer/images/discriminator_loss.png +0 -0
  10. exp/svs_visinger_transfer/images/discriminator_optim_step_time.png +0 -0
  11. exp/svs_visinger_transfer/images/discriminator_real_loss.png +0 -0
  12. exp/svs_visinger_transfer/images/discriminator_train_time.png +0 -0
  13. exp/svs_visinger_transfer/images/generator_adv_loss.png +0 -0
  14. exp/svs_visinger_transfer/images/generator_backward_time.png +0 -0
  15. exp/svs_visinger_transfer/images/generator_feat_match_loss.png +0 -0
  16. exp/svs_visinger_transfer/images/generator_forward_time.png +0 -0
  17. exp/svs_visinger_transfer/images/generator_kl_loss.png +0 -0
  18. exp/svs_visinger_transfer/images/generator_loss.png +0 -0
  19. exp/svs_visinger_transfer/images/generator_mel_loss.png +0 -0
  20. exp/svs_visinger_transfer/images/generator_optim_step_time.png +0 -0
  21. exp/svs_visinger_transfer/images/generator_phn_dur_loss.png +0 -0
  22. exp/svs_visinger_transfer/images/generator_pitch_loss.png +0 -0
  23. exp/svs_visinger_transfer/images/generator_score_dur_loss.png +0 -0
  24. exp/svs_visinger_transfer/images/generator_train_time.png +0 -0
  25. exp/svs_visinger_transfer/images/gpu_max_cached_mem_GB.png +0 -0
  26. exp/svs_visinger_transfer/images/iter_time.png +0 -0
  27. exp/svs_visinger_transfer/images/optim0_lr0.png +0 -0
  28. exp/svs_visinger_transfer/images/optim1_lr0.png +0 -0
  29. exp/svs_visinger_transfer/images/train_time.png +0 -0
  30. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,552 @@
1
  ---
2
- license: apache-2.0
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - singing-voice-synthesis
6
+ language: zh
7
+ datasets:
8
+ - opencpop
9
+ license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 SVS model
13
+
14
+ ### `espnet/opencpop_visinger_transfer_acesinger`
15
+
16
+ This model was trained by ftshijt using the opencpop recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 5c4d7cf7feba8461de2e1080bf82182f0efaef38
26
+ pip install -e .
27
+ cd egs2/opencpop/svs1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/opencpop_visinger_transfer_acesinger
29
+ ```
30
+
31
+
32
+
33
+ ## SVS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/transfer_visinger.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: sequence
44
+ valid_iterator_type: null
45
+ output_dir: exp/svs_visinger_transfer
46
+ ngpu: 1
47
+ seed: 777
48
+ num_workers: 0
49
+ num_att_plot: 3
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: null
53
+ dist_rank: null
54
+ local_rank: 0
55
+ dist_master_addr: null
56
+ dist_master_port: null
57
+ dist_launcher: null
58
+ multiprocessing_distributed: false
59
+ unused_parameters: true
60
+ sharded_ddp: false
61
+ cudnn_enabled: true
62
+ cudnn_benchmark: false
63
+ cudnn_deterministic: false
64
+ collect_stats: false
65
+ write_collected_feats: false
66
+ max_epoch: 500
67
+ patience: null
68
+ val_scheduler_criterion:
69
+ - valid
70
+ - loss
71
+ early_stopping_criterion:
72
+ - valid
73
+ - loss
74
+ - min
75
+ best_model_criterion:
76
+ - - train
77
+ - total_count
78
+ - max
79
+ keep_nbest_models: 10
80
+ nbest_averaging_interval: 0
81
+ grad_clip: -1
82
+ grad_clip_type: 2.0
83
+ grad_noise: false
84
+ accum_grad: 1
85
+ no_forward_run: false
86
+ resume: true
87
+ train_dtype: float32
88
+ use_amp: false
89
+ log_interval: 50
90
+ use_matplotlib: true
91
+ use_tensorboard: true
92
+ create_graph_in_tensorboard: false
93
+ use_wandb: false
94
+ wandb_project: null
95
+ wandb_id: null
96
+ wandb_entity: null
97
+ wandb_name: null
98
+ wandb_model_log_interval: -1
99
+ detect_anomaly: false
100
+ use_lora: false
101
+ save_lora_only: true
102
+ lora_conf: {}
103
+ pretrain_path: null
104
+ init_param:
105
+ - /ocean/projects/cis210027p/jiatong/svs/espnet/egs2/acesinger/svs1/exp/svs_train_raw_phn_None_zh/train.total_count.best.pth
106
+ ignore_init_mismatch: true
107
+ freeze_param: []
108
+ num_iters_per_epoch: 1000
109
+ batch_size: 8
110
+ valid_batch_size: null
111
+ batch_bins: 1000000
112
+ valid_batch_bins: null
113
+ train_shape_file:
114
+ - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn
115
+ - exp/svs_stats_raw_phn_None_zh/train/singing_shape
116
+ valid_shape_file:
117
+ - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn
118
+ - exp/svs_stats_raw_phn_None_zh/valid/singing_shape
119
+ batch_type: sorted
120
+ valid_batch_type: null
121
+ fold_length:
122
+ - 150
123
+ - 409600
124
+ sort_in_batch: descending
125
+ shuffle_within_batch: false
126
+ sort_batch: descending
127
+ multiple_iterator: false
128
+ chunk_length: 500
129
+ chunk_shift_ratio: 0.5
130
+ num_cache_chunks: 1024
131
+ chunk_excluded_key_prefixes: []
132
+ chunk_default_fs: null
133
+ train_data_path_and_name_and_type:
134
+ - - dump/raw/tr_no_dev/text
135
+ - text
136
+ - text
137
+ - - dump/raw/tr_no_dev/wav.scp
138
+ - singing
139
+ - sound
140
+ - - dump/raw/tr_no_dev/label
141
+ - label
142
+ - duration
143
+ - - dump/raw/tr_no_dev/score.scp
144
+ - score
145
+ - score
146
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/pitch.scp
147
+ - pitch
148
+ - npy
149
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/feats.scp
150
+ - feats
151
+ - npy
152
+ valid_data_path_and_name_and_type:
153
+ - - dump/raw/dev/text
154
+ - text
155
+ - text
156
+ - - dump/raw/dev/wav.scp
157
+ - singing
158
+ - sound
159
+ - - dump/raw/dev/label
160
+ - label
161
+ - duration
162
+ - - dump/raw/dev/score.scp
163
+ - score
164
+ - score
165
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/pitch.scp
166
+ - pitch
167
+ - npy
168
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/feats.scp
169
+ - feats
170
+ - npy
171
+ allow_variable_data_keys: false
172
+ max_cache_size: 0.0
173
+ max_cache_fd: 32
174
+ allow_multi_rates: false
175
+ valid_max_cache_size: null
176
+ exclude_weight_decay: false
177
+ exclude_weight_decay_conf: {}
178
+ optim: adamw
179
+ optim_conf:
180
+ lr: 0.0002
181
+ betas:
182
+ - 0.8
183
+ - 0.99
184
+ eps: 1.0e-09
185
+ weight_decay: 0.0
186
+ scheduler: exponentiallr
187
+ scheduler_conf:
188
+ gamma: 0.998
189
+ optim2: adamw
190
+ optim2_conf:
191
+ lr: 0.0002
192
+ betas:
193
+ - 0.8
194
+ - 0.99
195
+ eps: 1.0e-09
196
+ weight_decay: 0.0
197
+ scheduler2: exponentiallr
198
+ scheduler2_conf:
199
+ gamma: 0.998
200
+ generator_first: true
201
+ token_list:
202
+ - <blank>
203
+ - <unk>
204
+ - SP
205
+ - i
206
+ - AP
207
+ - e
208
+ - y
209
+ - d
210
+ - w
211
+ - sh
212
+ - ai
213
+ - n
214
+ - x
215
+ - j
216
+ - ian
217
+ - u
218
+ - l
219
+ - h
220
+ - b
221
+ - o
222
+ - zh
223
+ - an
224
+ - ou
225
+ - m
226
+ - q
227
+ - z
228
+ - en
229
+ - g
230
+ - ing
231
+ - ei
232
+ - ao
233
+ - ang
234
+ - uo
235
+ - eng
236
+ - t
237
+ - a
238
+ - ong
239
+ - ui
240
+ - k
241
+ - f
242
+ - r
243
+ - iang
244
+ - ch
245
+ - v
246
+ - in
247
+ - iao
248
+ - ie
249
+ - iu
250
+ - c
251
+ - s
252
+ - van
253
+ - p
254
+ - ve
255
+ - uan
256
+ - uang
257
+ - ia
258
+ - ua
259
+ - uai
260
+ - un
261
+ - er
262
+ - vn
263
+ - iong
264
+ - <sos/eos>
265
+ odim: null
266
+ model_conf: {}
267
+ use_preprocessor: true
268
+ token_type: phn
269
+ bpemodel: null
270
+ non_linguistic_symbols: null
271
+ cleaner: null
272
+ g2p: null
273
+ fs: 44100
274
+ score_feats_extract: syllable_score_feats
275
+ score_feats_extract_conf:
276
+ fs: 44100
277
+ n_fft: 2048
278
+ win_length: 2048
279
+ hop_length: 512
280
+ feats_extract: fbank
281
+ feats_extract_conf:
282
+ n_fft: 2048
283
+ hop_length: 512
284
+ win_length: 2048
285
+ fs: 44100
286
+ fmin: 0
287
+ fmax: 22050
288
+ n_mels: 80
289
+ normalize: global_mvn
290
+ normalize_conf:
291
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz
292
+ svs: vits
293
+ svs_conf:
294
+ generator_type: visinger
295
+ vocoder_generator_type: hifigan
296
+ generator_params:
297
+ hidden_channels: 192
298
+ spks: -1
299
+ global_channels: 256
300
+ segment_size: 20
301
+ text_encoder_attention_heads: 2
302
+ text_encoder_ffn_expand: 4
303
+ text_encoder_blocks: 6
304
+ text_encoder_positionwise_layer_type: conv1d
305
+ text_encoder_positionwise_conv_kernel_size: 3
306
+ text_encoder_positional_encoding_layer_type: rel_pos
307
+ text_encoder_self_attention_layer_type: rel_selfattn
308
+ text_encoder_activation_type: swish
309
+ text_encoder_normalize_before: true
310
+ text_encoder_dropout_rate: 0.1
311
+ text_encoder_positional_dropout_rate: 0.0
312
+ text_encoder_attention_dropout_rate: 0.1
313
+ use_macaron_style_in_text_encoder: true
314
+ use_conformer_conv_in_text_encoder: false
315
+ text_encoder_conformer_kernel_size: -1
316
+ decoder_kernel_size: 7
317
+ decoder_channels: 512
318
+ decoder_upsample_scales:
319
+ - 8
320
+ - 8
321
+ - 4
322
+ - 2
323
+ decoder_upsample_kernel_sizes:
324
+ - 16
325
+ - 16
326
+ - 8
327
+ - 4
328
+ decoder_resblock_kernel_sizes:
329
+ - 3
330
+ - 7
331
+ - 11
332
+ decoder_resblock_dilations:
333
+ - - 1
334
+ - 3
335
+ - 5
336
+ - - 1
337
+ - 3
338
+ - 5
339
+ - - 1
340
+ - 3
341
+ - 5
342
+ use_weight_norm_in_decoder: true
343
+ posterior_encoder_kernel_size: 3
344
+ posterior_encoder_layers: 8
345
+ posterior_encoder_stacks: 1
346
+ posterior_encoder_base_dilation: 1
347
+ posterior_encoder_dropout_rate: 0.0
348
+ use_weight_norm_in_posterior_encoder: true
349
+ flow_flows: -1
350
+ flow_kernel_size: 5
351
+ flow_base_dilation: 1
352
+ flow_layers: 4
353
+ flow_dropout_rate: 0.0
354
+ use_weight_norm_in_flow: true
355
+ use_only_mean_in_flow: true
356
+ use_phoneme_predictor: false
357
+ vocabs: 63
358
+ aux_channels: 80
359
+ generator_type: visinger
360
+ vocoder_generator_type: hifigan
361
+ fs: 44100
362
+ hop_length: 512
363
+ win_length: 2048
364
+ n_fft: 2048
365
+ discriminator_type: visinger2
366
+ discriminator_params:
367
+ scales: 1
368
+ scale_downsample_pooling: AvgPool1d
369
+ scale_downsample_pooling_params:
370
+ kernel_size: 4
371
+ stride: 2
372
+ padding: 2
373
+ scale_discriminator_params:
374
+ in_channels: 1
375
+ out_channels: 1
376
+ kernel_sizes:
377
+ - 15
378
+ - 41
379
+ - 5
380
+ - 3
381
+ channels: 128
382
+ max_downsample_channels: 1024
383
+ max_groups: 256
384
+ bias: true
385
+ downsample_scales:
386
+ - 4
387
+ - 4
388
+ - 4
389
+ - 4
390
+ nonlinear_activation: LeakyReLU
391
+ nonlinear_activation_params:
392
+ negative_slope: 0.1
393
+ use_weight_norm: true
394
+ use_spectral_norm: false
395
+ follow_official_norm: false
396
+ periods:
397
+ - 2
398
+ - 3
399
+ - 5
400
+ - 7
401
+ - 11
402
+ period_discriminator_params:
403
+ in_channels: 1
404
+ out_channels: 1
405
+ kernel_sizes:
406
+ - 5
407
+ - 3
408
+ channels: 32
409
+ downsample_scales:
410
+ - 3
411
+ - 3
412
+ - 3
413
+ - 3
414
+ - 1
415
+ max_downsample_channels: 1024
416
+ bias: true
417
+ nonlinear_activation: LeakyReLU
418
+ nonlinear_activation_params:
419
+ negative_slope: 0.1
420
+ use_weight_norm: true
421
+ use_spectral_norm: false
422
+ multi_freq_disc_params:
423
+ hidden_channels:
424
+ - 256
425
+ - 256
426
+ - 256
427
+ - 256
428
+ - 256
429
+ domain: double
430
+ mel_scale: true
431
+ divisors:
432
+ - 32
433
+ - 16
434
+ - 8
435
+ - 4
436
+ - 2
437
+ - 1
438
+ - 1
439
+ strides:
440
+ - 1
441
+ - 2
442
+ - 1
443
+ - 2
444
+ - 1
445
+ - 2
446
+ - 1
447
+ sample_rate: 44100
448
+ hop_lengths:
449
+ - 110
450
+ - 220
451
+ - 330
452
+ - 441
453
+ - 551
454
+ - 661
455
+ generator_adv_loss_params:
456
+ average_by_discriminators: false
457
+ loss_type: mse
458
+ discriminator_adv_loss_params:
459
+ average_by_discriminators: false
460
+ loss_type: mse
461
+ feat_match_loss_params:
462
+ average_by_discriminators: false
463
+ average_by_layers: false
464
+ include_final_outputs: true
465
+ mel_loss_params:
466
+ fs: 44100
467
+ n_fft: 2048
468
+ hop_length: 512
469
+ win_length: 2048
470
+ window: hann
471
+ n_mels: 80
472
+ fmin: 0
473
+ fmax: 22050
474
+ log_base: null
475
+ lambda_adv: 1.0
476
+ lambda_mel: 45.0
477
+ lambda_feat_match: 2.0
478
+ lambda_dur: 0.1
479
+ lambda_pitch: 10.0
480
+ lambda_phoneme: 1.0
481
+ lambda_kl: 1.0
482
+ sampling_rate: 44100
483
+ cache_generator_outputs: true
484
+ pitch_extract: dio
485
+ pitch_extract_conf:
486
+ use_token_averaged_f0: false
487
+ use_log_f0: false
488
+ fs: 44100
489
+ n_fft: 2048
490
+ hop_length: 512
491
+ f0max: 800
492
+ f0min: 80
493
+ pitch_normalize: null
494
+ pitch_normalize_conf:
495
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz
496
+ ying_extract: null
497
+ ying_extract_conf: {}
498
+ energy_extract: null
499
+ energy_extract_conf: {}
500
+ energy_normalize: null
501
+ energy_normalize_conf: {}
502
+ required:
503
+ - output_dir
504
+ - token_list
505
+ version: '202310'
506
+ distributed: false
507
+ ```
508
+
509
+ </details>
510
+
511
+
512
+
513
+ ### Citing ESPnet
514
+
515
+ ```bibtex
516
+ @inproceedings{watanabe2018espnet,
517
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
518
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
519
+ year={2018},
520
+ booktitle={Proceedings of Interspeech},
521
+ pages={2207--2211},
522
+ doi={10.21437/Interspeech.2018-1456},
523
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
524
+ }
525
+
526
+
527
+
528
+
529
+
530
+
531
+ @inproceedings{shi22d_interspeech,
532
+ author={Jiatong Shi and Shuai Guo and Tao Qian and Tomoki Hayashi and Yuning Wu and Fangzheng Xu and Xuankai Chang and Huazhe Li and Peter Wu and Shinji Watanabe and Qin Jin},
533
+ title={{Muskits: an End-to-end Music Processing Toolkit for Singing Voice Synthesis}},
534
+ year=2022,
535
+ booktitle={Proc. Interspeech 2022},
536
+ pages={4277--4281},
537
+ doi={10.21437/Interspeech.2022-10039}
538
+ }
539
+ ```
540
+
541
+ or arXiv:
542
+
543
+ ```bibtex
544
+ @misc{watanabe2018espnet,
545
+ title={ESPnet: End-to-End Speech Processing Toolkit},
546
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
547
+ year={2018},
548
+ eprint={1804.00015},
549
+ archivePrefix={arXiv},
550
+ primaryClass={cs.CL}
551
+ }
552
+ ```
exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0af3f18f8b910e800d5af007e547f233de55fea0081348958d2edb4c9681bed
3
+ size 1402
exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bfc536c55a496e833af3b67fb52de8ec9e1d9a57bd7d86344513799e725816a
3
+ size 770
exp/svs_visinger_transfer/500epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94545d9c51104c32a75e16195da40d4bfce49cade59c2d9ceb7435f3acb5c5ff
3
+ size 430950107
exp/svs_visinger_transfer/config.yaml ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/transfer_visinger.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/svs_visinger_transfer
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 0
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: false
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 500
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - train
40
+ - total_count
41
+ - max
42
+ keep_nbest_models: 10
43
+ nbest_averaging_interval: 0
44
+ grad_clip: -1
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: 50
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ use_lora: false
64
+ save_lora_only: true
65
+ lora_conf: {}
66
+ pretrain_path: null
67
+ init_param:
68
+ - /ocean/projects/cis210027p/jiatong/svs/espnet/egs2/acesinger/svs1/exp/svs_train_raw_phn_None_zh/train.total_count.best.pth
69
+ ignore_init_mismatch: true
70
+ freeze_param: []
71
+ num_iters_per_epoch: 1000
72
+ batch_size: 8
73
+ valid_batch_size: null
74
+ batch_bins: 1000000
75
+ valid_batch_bins: null
76
+ train_shape_file:
77
+ - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn
78
+ - exp/svs_stats_raw_phn_None_zh/train/singing_shape
79
+ valid_shape_file:
80
+ - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn
81
+ - exp/svs_stats_raw_phn_None_zh/valid/singing_shape
82
+ batch_type: sorted
83
+ valid_batch_type: null
84
+ fold_length:
85
+ - 150
86
+ - 409600
87
+ sort_in_batch: descending
88
+ shuffle_within_batch: false
89
+ sort_batch: descending
90
+ multiple_iterator: false
91
+ chunk_length: 500
92
+ chunk_shift_ratio: 0.5
93
+ num_cache_chunks: 1024
94
+ chunk_excluded_key_prefixes: []
95
+ chunk_default_fs: null
96
+ train_data_path_and_name_and_type:
97
+ - - dump/raw/tr_no_dev/text
98
+ - text
99
+ - text
100
+ - - dump/raw/tr_no_dev/wav.scp
101
+ - singing
102
+ - sound
103
+ - - dump/raw/tr_no_dev/label
104
+ - label
105
+ - duration
106
+ - - dump/raw/tr_no_dev/score.scp
107
+ - score
108
+ - score
109
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/pitch.scp
110
+ - pitch
111
+ - npy
112
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/feats.scp
113
+ - feats
114
+ - npy
115
+ valid_data_path_and_name_and_type:
116
+ - - dump/raw/dev/text
117
+ - text
118
+ - text
119
+ - - dump/raw/dev/wav.scp
120
+ - singing
121
+ - sound
122
+ - - dump/raw/dev/label
123
+ - label
124
+ - duration
125
+ - - dump/raw/dev/score.scp
126
+ - score
127
+ - score
128
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/pitch.scp
129
+ - pitch
130
+ - npy
131
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/feats.scp
132
+ - feats
133
+ - npy
134
+ allow_variable_data_keys: false
135
+ max_cache_size: 0.0
136
+ max_cache_fd: 32
137
+ allow_multi_rates: false
138
+ valid_max_cache_size: null
139
+ exclude_weight_decay: false
140
+ exclude_weight_decay_conf: {}
141
+ optim: adamw
142
+ optim_conf:
143
+ lr: 0.0002
144
+ betas:
145
+ - 0.8
146
+ - 0.99
147
+ eps: 1.0e-09
148
+ weight_decay: 0.0
149
+ scheduler: exponentiallr
150
+ scheduler_conf:
151
+ gamma: 0.998
152
+ optim2: adamw
153
+ optim2_conf:
154
+ lr: 0.0002
155
+ betas:
156
+ - 0.8
157
+ - 0.99
158
+ eps: 1.0e-09
159
+ weight_decay: 0.0
160
+ scheduler2: exponentiallr
161
+ scheduler2_conf:
162
+ gamma: 0.998
163
+ generator_first: true
164
+ token_list:
165
+ - <blank>
166
+ - <unk>
167
+ - SP
168
+ - i
169
+ - AP
170
+ - e
171
+ - y
172
+ - d
173
+ - w
174
+ - sh
175
+ - ai
176
+ - n
177
+ - x
178
+ - j
179
+ - ian
180
+ - u
181
+ - l
182
+ - h
183
+ - b
184
+ - o
185
+ - zh
186
+ - an
187
+ - ou
188
+ - m
189
+ - q
190
+ - z
191
+ - en
192
+ - g
193
+ - ing
194
+ - ei
195
+ - ao
196
+ - ang
197
+ - uo
198
+ - eng
199
+ - t
200
+ - a
201
+ - ong
202
+ - ui
203
+ - k
204
+ - f
205
+ - r
206
+ - iang
207
+ - ch
208
+ - v
209
+ - in
210
+ - iao
211
+ - ie
212
+ - iu
213
+ - c
214
+ - s
215
+ - van
216
+ - p
217
+ - ve
218
+ - uan
219
+ - uang
220
+ - ia
221
+ - ua
222
+ - uai
223
+ - un
224
+ - er
225
+ - vn
226
+ - iong
227
+ - <sos/eos>
228
+ odim: null
229
+ model_conf: {}
230
+ use_preprocessor: true
231
+ token_type: phn
232
+ bpemodel: null
233
+ non_linguistic_symbols: null
234
+ cleaner: null
235
+ g2p: null
236
+ fs: 44100
237
+ score_feats_extract: syllable_score_feats
238
+ score_feats_extract_conf:
239
+ fs: 44100
240
+ n_fft: 2048
241
+ win_length: 2048
242
+ hop_length: 512
243
+ feats_extract: fbank
244
+ feats_extract_conf:
245
+ n_fft: 2048
246
+ hop_length: 512
247
+ win_length: 2048
248
+ fs: 44100
249
+ fmin: 0
250
+ fmax: 22050
251
+ n_mels: 80
252
+ normalize: global_mvn
253
+ normalize_conf:
254
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz
255
+ svs: vits
256
+ svs_conf:
257
+ generator_type: visinger
258
+ vocoder_generator_type: hifigan
259
+ generator_params:
260
+ hidden_channels: 192
261
+ spks: -1
262
+ global_channels: 256
263
+ segment_size: 20
264
+ text_encoder_attention_heads: 2
265
+ text_encoder_ffn_expand: 4
266
+ text_encoder_blocks: 6
267
+ text_encoder_positionwise_layer_type: conv1d
268
+ text_encoder_positionwise_conv_kernel_size: 3
269
+ text_encoder_positional_encoding_layer_type: rel_pos
270
+ text_encoder_self_attention_layer_type: rel_selfattn
271
+ text_encoder_activation_type: swish
272
+ text_encoder_normalize_before: true
273
+ text_encoder_dropout_rate: 0.1
274
+ text_encoder_positional_dropout_rate: 0.0
275
+ text_encoder_attention_dropout_rate: 0.1
276
+ use_macaron_style_in_text_encoder: true
277
+ use_conformer_conv_in_text_encoder: false
278
+ text_encoder_conformer_kernel_size: -1
279
+ decoder_kernel_size: 7
280
+ decoder_channels: 512
281
+ decoder_upsample_scales:
282
+ - 8
283
+ - 8
284
+ - 4
285
+ - 2
286
+ decoder_upsample_kernel_sizes:
287
+ - 16
288
+ - 16
289
+ - 8
290
+ - 4
291
+ decoder_resblock_kernel_sizes:
292
+ - 3
293
+ - 7
294
+ - 11
295
+ decoder_resblock_dilations:
296
+ - - 1
297
+ - 3
298
+ - 5
299
+ - - 1
300
+ - 3
301
+ - 5
302
+ - - 1
303
+ - 3
304
+ - 5
305
+ use_weight_norm_in_decoder: true
306
+ posterior_encoder_kernel_size: 3
307
+ posterior_encoder_layers: 8
308
+ posterior_encoder_stacks: 1
309
+ posterior_encoder_base_dilation: 1
310
+ posterior_encoder_dropout_rate: 0.0
311
+ use_weight_norm_in_posterior_encoder: true
312
+ flow_flows: -1
313
+ flow_kernel_size: 5
314
+ flow_base_dilation: 1
315
+ flow_layers: 4
316
+ flow_dropout_rate: 0.0
317
+ use_weight_norm_in_flow: true
318
+ use_only_mean_in_flow: true
319
+ use_phoneme_predictor: false
320
+ vocabs: 63
321
+ aux_channels: 80
322
+ generator_type: visinger
323
+ vocoder_generator_type: hifigan
324
+ fs: 44100
325
+ hop_length: 512
326
+ win_length: 2048
327
+ n_fft: 2048
328
+ discriminator_type: visinger2
329
+ discriminator_params:
330
+ scales: 1
331
+ scale_downsample_pooling: AvgPool1d
332
+ scale_downsample_pooling_params:
333
+ kernel_size: 4
334
+ stride: 2
335
+ padding: 2
336
+ scale_discriminator_params:
337
+ in_channels: 1
338
+ out_channels: 1
339
+ kernel_sizes:
340
+ - 15
341
+ - 41
342
+ - 5
343
+ - 3
344
+ channels: 128
345
+ max_downsample_channels: 1024
346
+ max_groups: 256
347
+ bias: true
348
+ downsample_scales:
349
+ - 4
350
+ - 4
351
+ - 4
352
+ - 4
353
+ nonlinear_activation: LeakyReLU
354
+ nonlinear_activation_params:
355
+ negative_slope: 0.1
356
+ use_weight_norm: true
357
+ use_spectral_norm: false
358
+ follow_official_norm: false
359
+ periods:
360
+ - 2
361
+ - 3
362
+ - 5
363
+ - 7
364
+ - 11
365
+ period_discriminator_params:
366
+ in_channels: 1
367
+ out_channels: 1
368
+ kernel_sizes:
369
+ - 5
370
+ - 3
371
+ channels: 32
372
+ downsample_scales:
373
+ - 3
374
+ - 3
375
+ - 3
376
+ - 3
377
+ - 1
378
+ max_downsample_channels: 1024
379
+ bias: true
380
+ nonlinear_activation: LeakyReLU
381
+ nonlinear_activation_params:
382
+ negative_slope: 0.1
383
+ use_weight_norm: true
384
+ use_spectral_norm: false
385
+ multi_freq_disc_params:
386
+ hidden_channels:
387
+ - 256
388
+ - 256
389
+ - 256
390
+ - 256
391
+ - 256
392
+ domain: double
393
+ mel_scale: true
394
+ divisors:
395
+ - 32
396
+ - 16
397
+ - 8
398
+ - 4
399
+ - 2
400
+ - 1
401
+ - 1
402
+ strides:
403
+ - 1
404
+ - 2
405
+ - 1
406
+ - 2
407
+ - 1
408
+ - 2
409
+ - 1
410
+ sample_rate: 44100
411
+ hop_lengths:
412
+ - 110
413
+ - 220
414
+ - 330
415
+ - 441
416
+ - 551
417
+ - 661
418
+ generator_adv_loss_params:
419
+ average_by_discriminators: false
420
+ loss_type: mse
421
+ discriminator_adv_loss_params:
422
+ average_by_discriminators: false
423
+ loss_type: mse
424
+ feat_match_loss_params:
425
+ average_by_discriminators: false
426
+ average_by_layers: false
427
+ include_final_outputs: true
428
+ mel_loss_params:
429
+ fs: 44100
430
+ n_fft: 2048
431
+ hop_length: 512
432
+ win_length: 2048
433
+ window: hann
434
+ n_mels: 80
435
+ fmin: 0
436
+ fmax: 22050
437
+ log_base: null
438
+ lambda_adv: 1.0
439
+ lambda_mel: 45.0
440
+ lambda_feat_match: 2.0
441
+ lambda_dur: 0.1
442
+ lambda_pitch: 10.0
443
+ lambda_phoneme: 1.0
444
+ lambda_kl: 1.0
445
+ sampling_rate: 44100
446
+ cache_generator_outputs: true
447
+ pitch_extract: dio
448
+ pitch_extract_conf:
449
+ use_token_averaged_f0: false
450
+ use_log_f0: false
451
+ fs: 44100
452
+ n_fft: 2048
453
+ hop_length: 512
454
+ f0max: 800
455
+ f0min: 80
456
+ pitch_normalize: null
457
+ pitch_normalize_conf:
458
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz
459
+ ying_extract: null
460
+ ying_extract_conf: {}
461
+ energy_extract: null
462
+ energy_extract_conf: {}
463
+ energy_normalize: null
464
+ energy_normalize_conf: {}
465
+ required:
466
+ - output_dir
467
+ - token_list
468
+ version: '202310'
469
+ distributed: false
exp/svs_visinger_transfer/images/discriminator_backward_time.png ADDED
exp/svs_visinger_transfer/images/discriminator_fake_loss.png ADDED
exp/svs_visinger_transfer/images/discriminator_forward_time.png ADDED
exp/svs_visinger_transfer/images/discriminator_loss.png ADDED
exp/svs_visinger_transfer/images/discriminator_optim_step_time.png ADDED
exp/svs_visinger_transfer/images/discriminator_real_loss.png ADDED
exp/svs_visinger_transfer/images/discriminator_train_time.png ADDED
exp/svs_visinger_transfer/images/generator_adv_loss.png ADDED
exp/svs_visinger_transfer/images/generator_backward_time.png ADDED
exp/svs_visinger_transfer/images/generator_feat_match_loss.png ADDED
exp/svs_visinger_transfer/images/generator_forward_time.png ADDED
exp/svs_visinger_transfer/images/generator_kl_loss.png ADDED
exp/svs_visinger_transfer/images/generator_loss.png ADDED
exp/svs_visinger_transfer/images/generator_mel_loss.png ADDED
exp/svs_visinger_transfer/images/generator_optim_step_time.png ADDED
exp/svs_visinger_transfer/images/generator_phn_dur_loss.png ADDED
exp/svs_visinger_transfer/images/generator_pitch_loss.png ADDED
exp/svs_visinger_transfer/images/generator_score_dur_loss.png ADDED
exp/svs_visinger_transfer/images/generator_train_time.png ADDED
exp/svs_visinger_transfer/images/gpu_max_cached_mem_GB.png ADDED
exp/svs_visinger_transfer/images/iter_time.png ADDED
exp/svs_visinger_transfer/images/optim0_lr0.png ADDED
exp/svs_visinger_transfer/images/optim1_lr0.png ADDED
exp/svs_visinger_transfer/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202310'
2
+ files:
3
+ model_file: exp/svs_visinger_transfer/500epoch.pth
4
+ python: "3.9.16 (main, Mar 8 2023, 14:00:05) \n[GCC 11.2.0]"
5
+ timestamp: 1702801693.375856
6
+ torch: 1.13.1+cu117
7
+ yaml_files:
8
+ train_config: exp/svs_visinger_transfer/config.yaml