File size: 3,994 Bytes
14d1720
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168

dataset:
  train:
    wav_scp: './train/wav.scp'
    mel_scp: './train/mel.scp'
    dur_scp: './train/dur.scp'
    # Each emb_typeN entry declares one embedding input stream (name, scp
    # index file, and an optional vocab file).
    emb_type1:
      _name: 'pinyin'
      scp: './train/py.scp'
      vocab: 'py.vocab'
    emb_type2:
      _name: 'graphic'
      scp: './train/gp.scp'
      vocab: 'gp.vocab'
    emb_type3:
      _name: 'speaker'
      scp: './train/spk.scp'
      vocab: null # doesn't need a vocab
    # NOTE: you can add more embeddings here without changing the code.
  # NOTE: eval is not used for now, i.e., just training, no evaluation.
  # You can use synthesize.py to check the training goes well, for now.
  eval: null



training:
  # Presumably batch_split divides batch_size into micro-batches
  # (1024 / 64 = 16 samples per forward pass) — TODO confirm in the trainer.
  batch_size: 1024
  batch_split: 64

  epochs: 100000
  grad_clip_thresh: 1.0
  acc_step: 1
  checkpoint_path: "./checkpoints/"
  log_path: "./log/"
  checkpoint_step: 5000  # save a checkpoint every N steps
  synth_step: 5000  # synthesize a check sample every N steps
  log_step: 20  # log every N steps
  num_workers: 8  # dataloader worker processes

  # NOTE(review): the eval dataset above is empty/unused, so this interval
  # may have no effect — verify against the training loop.
  evaluation_step: 1000

optimizer:
  # NOTE: if you use SGD, `params` must change too, as SGD takes different
  # constructor arguments.
  type: Adam
  n_warm_up_step: 2000
  lr_decrease_step: 10000
  # NOTE(review): this was left empty in the original file and therefore
  # parses as null — if step decay is actually applied, set an explicit
  # factor (e.g. 0.5) and verify against the LR-schedule code.
  lr_decrease_factor: null
  params:
    betas: [0.9, 0.98]
    # !!float tags are required: PyYAML (YAML 1.1) reads bare "1e-9"
    # (no dot before the exponent) as a string, not a float.
    eps: !!float 1e-9
    weight_decay: !!float 0.0
    lr: !!float 1e-4
lr_scheduler:
  type: CyclicLR
  params:
    base_lr: !!float 1e-8
    max_lr: !!float 1e-6
    step_size_up: 5000
    step_size_down: 5000
    # Keep disabled for Adam: torch's CyclicLR momentum cycling expects an
    # optimizer with a `momentum` param, which Adam does not have.
    cycle_momentum: false

fbank: # this is used for wav2mel.py
  sample_rate: 22050
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  max_wav_value: 32768.0  # 2^15 — presumably the 16-bit PCM normalization divisor; confirm in wav2mel.py
  n_mels: 80
  fmin: 0.0
  fmax: 8000.0 # should be 11025 ? (Nyquist is 11025 Hz at 22050 Hz sample rate)
  mel_mean: -6.0304103  # presumably a dataset-level mel mean used for normalization — TODO confirm

encoder:
    encoder_type: 'FS2TransformerEncoder'
    conf:
      n_layers: 4  # number of transformer layers
      n_heads: 2  # attention heads per layer
      hidden_dim: 256  # must match decoder input_dim below
      dropout: 0.25
      d_inner: 1024  # feed-forward inner dimension
      max_len: 2048  # max sequence length, for position-embedding pre-computation

decoder:
    decoder_type: 'FS2TransformerDecoder'
    input_dim: 256 # should be the same as the output of encoder
    n_layers: 4
    n_heads: 2
    hidden_dim: 256
    d_inner: 1024
    dropout: 0.25
    max_len: 2048 # max len of seq, for position embedding pre-computation

# Reference signature of the Decoder class (its default dropout is 0.5; overridden to 0.25 above):
# Decoder(input_dim: int = 256, n_layers: int = 4, n_heads: int = 2,
#         hidden_dim: int = 256, d_inner: int = 1024, dropout: float = 0.5, max_len: int = 2048)
# Post-network applied after the decoder to refine the mel output.
postnet:
  postnet_type: 'PostUNet' # one of: 'PostUNet', 'PostNet1d'

speaker_embedding:
  enable: true
  # NOTE(review): vocab was left empty in the original (parses as null);
  # speaker lookup appears to rely on vocab_size instead — confirm against
  # the embedding code.
  vocab: null
  vocab_size: 218 # aishell3 has 218 speakers
  weight: 1.0 # you can play with the weight here
  dim: 256

# NOTE(review): key name is misspelled ("utterance") but kept as-is because
# the code presumably reads this exact key — rename in code and config together.
utterence_embedding:
  enable: false # not implemented
  type: 'lstm' # alternative: resnet
  feature_config:
    type: 'mel'
    n_mels: 80
    sampling_rate: 22050
    n_fft: 1024
    hop_length: 256
    win_length: 1024

hanzi_embedding:
  enable: true
  type: embedding
  vocab: './gp.vocab'
  dim: 256
  weight: 0.5 # you can play with the weight here
  max_seq_len: 100

pinyin_embedding:
  enable: true
  type: embedding
  vocab: './py.vocab'
  dim: 256
  weight: 1.0
  max_seq_len: 100

duration_predictor:
  input_dim: 256 # should be the same as encoder hidden_dim
  filter_size: 256
  kernel_size: 3
  dropout: 0.15 # important to set dropout here
  duration_mean: 21.517294924096635 # for aishell3

f0_predictor:
  enable: false # currently not supported
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  n_bins: 256

vocoder:
  type: VocGan # choose one of the sections below
  # NOTE(review): YAML does not expand '~' — the loading code must call
  # expanduser() on these paths; confirm. Paths are quoted because a leading
  # '~' is also YAML's null sigil, so quoting avoids any parser ambiguity.
  MelGAN:
    checkpoint: '~/checkpoints/melgan/melgan_ljspeech.pth'
    config: '~/checkpoints/melgan/default.yaml'
    device: cpu
  VocGan:
    checkpoint: '~/checkpoints/vctk_pretrained_model_3180.pt' # alt: '~/checkpoints/ljspeech_29de09d_4000.pt'
    denoise: true
    device: cpu
  HiFiGAN:
    checkpoint: '~/checkpoints/VCTK_V3/generator_v3' # you need to download the checkpoint and set the path here
    device: cpu
  Waveglow:
    checkpoint: '~/checkpoints/waveglow_256channels_universal_v5_state_dict.pt'
    sigma: 1.0
    denoiser_strength: 0.0 # try 0.1
    device: cpu # try cpu if out of memory



synthesis:
  normalize: true # normalize the sound volume