# Configuration for tasks.tts.fs2.FastSpeech2Task.
# NOTE(review): keys not set here are presumably inherited from
# `base_config` -- confirm against the config loader.
base_config: configs/tts/base.yaml
task_cls: tasks.tts.fs2.FastSpeech2Task

# model
hidden_size: 256
dropout: 0.1
encoder_type: fft  # fft|tacotron|tacotron2|conformer
encoder_K: 8  # for tacotron encoder
decoder_type: fft  # fft|rnn|conv|conformer
use_pos_embed: true

# duration
# NOTE(review): -1 looks like a sentinel value; its exact meaning is not
# visible in this file -- confirm against the predictor code.
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 2
dur_predictor_kernel: 3
dur_predictor_layers: 2
predictor_dropout: 0.5

# pitch and energy
use_pitch_embed: true
pitch_type: ph  # frame|ph|cwt
use_uv: true
cwt_hidden_size: 128
cwt_layers: 2
cwt_loss: l1
cwt_add_f0_loss: false
cwt_std_scale: 0.8
pitch_ar: false
# pitch_embed_type: 0q
pitch_loss: 'l1'  # l1|l2|ssim
pitch_norm: log
use_energy_embed: false

# reference encoder and speaker embedding
use_spk_id: false
use_split_spk_id: false
use_spk_embed: false
use_var_enc: false
lambda_commit: 0.25
ref_norm_layer: bn
# Each entry below is "conv_hidden_size,conv_stride,conv_kernel_size".
# conv_hidden_size=0: use hidden_size.
# Entries are quoted so every loader reads them as strings.
pitch_enc_hidden_stride_kernel:
  - '0,2,5'
  - '0,2,5'
  - '0,2,5'
dur_enc_hidden_stride_kernel:
  - '0,2,3'
  - '0,2,3'
  - '0,1,3'

# mel
# Quoted because the value contains ':' and '|'.
mel_loss: 'l1:0.5|ssim:0.5'  # l1|l2|gdl|ssim or l1:0.5|ssim:0.5

# loss lambda
lambda_f0: 1.0
lambda_uv: 1.0
lambda_energy: 0.1
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1

# train and eval
pretrain_fs_ckpt: ''
warmup_updates: 2000
max_tokens: 32000
max_sentences: 100000
max_eval_sentences: 1
max_updates: 120000
num_valid_plots: 5
num_test_samples: 0
test_ids: []
use_gt_dur: false
use_gt_f0: false

# exp
dur_loss: mse  # huber|mol
norm_type: gn