---
# Training / synthesis configuration.
# NOTE(review): the nesting below was reconstructed from a copy whose line
# breaks and indentation had been lost; confirm it against the code that
# reads this file before relying on it.

# Input file lists (*.scp) and vocabularies for the training split.
dataset:
  train:
    wav_scp: './train/wav.scp'
    mel_scp: './train/mel.scp'
    dur_scp: './train/dur.scp'
    emb_type1:
      _name: 'pinyin'
      scp: './train/py.scp'
      vocab: 'py.vocab'
    emb_type2:
      _name: 'graphic'
      scp: './train/gp.scp'
      vocab: 'gp.vocab'
    emb_type3:
      _name: 'speaker'
      scp: './train/spk.scp'
      vocab: null  # doesn't need vocab

# Training-loop settings: batching, output locations and step intervals.
training:
  batch_size: 16
  batch_split: 1
  epochs: 10000
  grad_clip_thresh: 1.0
  acc_step: 1
  checkpoint_path: "./checkpoints/"
  log_path: "./log/"
  checkpoint_step: 5000
  synth_step: 5000
  log_step: 20
  num_workers: 8
  evaluation_step: 1000

optimizer:
  type: Adam
  n_warm_up_step: 2000
  # lr_decrease_step: 10000
  # lr_decrease_factor:
  params:
    betas: [0.9, 0.98]
    # The explicit !!float tags are kept on purpose: YAML 1.1 loaders such as
    # PyYAML resolve exponent literals without a decimal point (1e-9) as
    # strings, not floats.
    eps: !!float 1e-9
    weight_decay: !!float 0.0
    lr: !!float 1e-4

lr_scheduler:
  type: CyclicLR
  params:
    base_lr: !!float 1e-7
    max_lr: !!float 1e-4
    step_size_up: 5000
    step_size_down: 8000
    cycle_momentum: false

# Vocoder selection. `type` names one of the sections below.
vocoder:
  type: VocGan  # choose one of the following
  MelGAN:
    checkpoint: ~/checkpoints/melgan/melgan_ljspeech.pth
    config: ~/checkpoints/melgan/default.yaml
    device: cpu
  VocGan:
    checkpoint: checkpoints  # ~/checkpoints/ljspeech_29de09d_4000.pt
    denoise: true
    device: cpu
  HiFiGAN:
    # you need to download checkpoint and set the params here
    checkpoint: ~/checkpoints/VCTK_V3/generator_v3
    device: cpu
  Waveglow:
    checkpoint: ~/checkpoints/waveglow_256channels_universal_v5_state_dict.pt
    sigma: 1.0
    denoiser_strength: 0.0  # try 0.1
    device: cpu  # try cpu if out of memory

# Mel filter-bank feature extraction parameters.
fbank:
  sample_rate: 16000
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  max_wav_value: 32768.0
  n_mels: 80
  fmin: 0.0
  # 8000.0 is the Nyquist frequency for sample_rate 16000.
  # NOTE(review): the original note here read "should be 11025", which is the
  # Nyquist frequency for 22050 Hz audio - confirm which rate is intended.
  fmax: 8000.0
  mel_mean: -6.0304103

encoder:
  encoder_type: 'FS2TransformerEncoder'
  conf:
    n_layers: 4
    n_heads: 2
    hidden_dim: 256
    dropout: 0.25
    d_inner: 1024
    max_len: 2048

decoder:
  decoder_type: 'FS2TransformerDecoder'
  input_dim: 256  # should be the same as the output of encoder
  n_layers: 4
  n_heads: 2
  hidden_dim: 256
  d_inner: 1024
  dropout: 0.25
  max_len: 2048  # max len of seq, for position embedding pre-computation
  # Pasted class signature, kept for reference (its dropout default is 0.5,
  # while this config sets 0.25):
  # (class) Decoder(input_dim: int = 256, n_layers: int = 4, n_heads: int = 2,
  # hidden_dim: int = 256, d_inner: int = 1024, dropout: float = 0.5,
  # max_len: int = 2048

postnet:
  postnet_type: 'PostUNet'  # 'PostUNet', 'PostNet1d'

speaker_embedding:
  enable: true
  vocab: null  # None
  vocab_size: 1
  # NOTE(review): `weight` is treated as an active key (as in the other
  # embedding sections); confirm it was not meant to be commented out.
  weight: 1.0
  dim: 256

utterence_embedding:
  enable: false
  type: 'lstm'  # resnet
  feature_config:
    type: 'mel'
    n_mels: 80
    # NOTE(review): 22050 differs from fbank.sample_rate (16000) - confirm
    # before enabling this section.
    sampling_rate: 22050
    n_fft: 1024
    hop_length: 256
    win_length: 1024
  model_config:
    n_layers: 3
    bidirectional: true

hanzi_embedding:
  enable: true
  type: embedding
  vocab: './gp.vocab'
  dim: 256
  weight: 0.5
  max_seq_len: 100

pinyin_embedding:
  enable: true
  type: embedding
  vocab: './py.vocab'
  dim: 256
  weight: 1.0
  max_seq_len: 100

duration_predictor:
  input_dim: 256  # should be the same as encoder hidden_dim
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  duration_mean: 21.517294924096635  # for aishell3

f0_predictor:
  enable: false
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  n_bins: 256

synthesis:
  normalize: true  # normalize the sound volume