dataset:
  train:
    wav_scp: './train/wav.scp'
    mel_scp: './train/mel.scp'
    dur_scp: './train/dur.scp'
    emb_type1:
      _name: 'pinyin'
      scp: './train/py.scp'
      vocab: 'py.vocab'
    emb_type2:
      _name: 'graphic'
      scp: './train/gp.scp'
      vocab: 'gp.vocab'

    emb_type3:
      _name: 'speaker'
      scp: './train/spk.scp'
      vocab: # doesn't need a vocab
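    # Note (assumption): the *.scp files are taken to be Kaldi-style script files,
    # one "<utt_id> <path-or-value>" entry per line (e.g. wav.scp pointing at audio,
    # mel.scp at pre-computed mel features). The exact layout depends on this
    # repo's data-preparation scripts.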

training:
  batch_size: 16
  batch_split: 1
  epochs: 10000
  grad_clip_thresh: 1.0
  acc_step: 1
  checkpoint_path: "./checkpoints/"
  log_path: "./log/"
  checkpoint_step: 5000
  synth_step: 5000
  log_step: 20
  num_workers: 8

  evaluation_step: 1000
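  # With batch_size 16, batch_split 1 and acc_step 1, each optimizer step sees
  # 16 utterances. acc_step > 1 presumably accumulates gradients over that many
  # forward passes before stepping, and batch_split likely splits a batch into
  # smaller chunks, both trading speed for lower memory use.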

optimizer:
    type: Adam
    n_warm_up_step: 2000
    #lr_decrease_step: 10000
    #lr_decrease_factor:
    params:
      betas: [0.9,0.98]
      eps: !!float 1e-9
      weight_decay: !!float 0.0
      lr: !!float  1e-4
lr_scheduler:
    type: CyclicLR
    params:
      base_lr: !!float 1e-7
      max_lr: !!float 1e-4
      step_size_up: 5000
      step_size_down: 8000
      cycle_momentum: False
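# Illustrative only (assumed mapping, not part of the config): these params are
# expected to be forwarded to the matching PyTorch classes, roughly
#   torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98),
#                    eps=1e-9, weight_decay=0.0)
#   torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=1e-7, max_lr=1e-4,
#                                     step_size_up=5000, step_size_down=8000,
#                                     cycle_momentum=False)
# How n_warm_up_step interacts with the cyclic schedule depends on the training loop.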

vocoder:
  type: VocGan # choose one of the following
  MelGAN:
    checkpoint: ~/checkpoints/melgan/melgan_ljspeech.pth
    config: ~/checkpoints/melgan/default.yaml
    device: cpu
  VocGan:
    checkpoint: checkpoints #~/checkpoints/ljspeech_29de09d_4000.pt
    denoise: True
    device: cpu
  HiFiGAN:
    checkpoint: ~/checkpoints/VCTK_V3/generator_v3  # download the checkpoint and set its path here
    device: cpu
  Waveglow:
    checkpoint:  ~/checkpoints/waveglow_256channels_universal_v5_state_dict.pt
    sigma: 1.0
    denoiser_strength: 0.0 # try 0.1
    device: cpu # use cpu if the GPU runs out of memory
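  # Only the vocoder named in `type` above should be loaded at synthesis time;
  # the other entries are presumably kept here so that switching vocoders only
  # requires changing `type`.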

fbank:
  sample_rate: 16000
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  max_wav_value: 32768.0
  n_mels: 80
  fmin: 0.0
  fmax: 8000.0 # 8000 is the Nyquist limit at 16 kHz; 11025 would apply to 22.05 kHz audio
  mel_mean: -6.0304103
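  # At sample_rate 16000 with hop_length 256 each mel frame advances by 16 ms
  # (16000 / 256 = 62.5 frames per second); win_length 1024 is a 64 ms window.
  # mel_mean is presumably used to normalize the target mel spectrograms.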

encoder:
    encoder_type: 'FS2TransformerEncoder'
    conf:
      n_layers: 4
      n_heads: 2
      hidden_dim: 256
      dropout: 0.25
      d_inner: 1024
      max_len: 2048

decoder:
    decoder_type: 'FS2TransformerDecoder'
    input_dim: 256 # should be the same as the encoder output dimension (hidden_dim)
    n_layers: 4
    n_heads: 2
    hidden_dim: 256
    d_inner: 1024
    dropout: 0.25
    max_len: 2048 # max sequence length, used to pre-compute positional embeddings
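# Both encoder and decoder are 4-layer, 2-head Transformers with hidden_dim 256;
# d_inner is presumably the size of the position-wise feed-forward layer, and
# max_len bounds the pre-computed positional-embedding table.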

# (class) Decoder(input_dim: int = 256, n_layers: int = 4, n_heads: int = 2,
#   hidden_dim: int = 256, d_inner: int = 1024, dropout: float = 0.5, max_len: int = 2048)
postnet:
  postnet_type: 'PostUNet' # one of: 'PostUNet', 'PostNet1d'
speaker_embedding:
    enable: True
    vocab: #None
    vocab_size: 1
    weight: 1.0
    dim: 256
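    # vocab is left empty because speaker IDs are read directly from spk.scp in
    # the dataset section; vocab_size 1 presumably corresponds to single-speaker
    # training.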

utterence_embedding:
    enable: False
    type: 'lstm' # or 'resnet'
    feature_config:
      type: 'mel'
      n_mels: 80
      sampling_rate: 22050
      n_fft: 1024
      hop_length: 256
      win_length: 1024

    model_config:
      n_layers: 3
      bidirectional: True

hanzi_embedding:
    enable: True
    type: embedding
    vocab: './gp.vocab'
    dim: 256
    weight: 0.5
    max_seq_len: 100

pinyin_embedding:
    enable: True
    type: embedding
    vocab: './py.vocab'
    dim: 256
    weight: 1.0
    max_seq_len: 100
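    # hanzi_embedding (graphic, i.e. character tokens) and pinyin_embedding are
    # both 256-dim to match the encoder hidden_dim; `weight` is presumably the
    # scale applied when the two streams are combined at the encoder input.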

duration_predictor:
  input_dim: 256 # should be the same as the encoder hidden_dim
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  duration_mean: 21.517294924096635 # for AISHELL-3
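  # duration_mean is presumably the dataset-level average number of mel frames
  # per input token (computed on AISHELL-3), used to offset or denormalize the
  # predicted durations.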

f0_predictor:
  enable: False
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  n_bins: 256
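  # When enabled, the F0 predictor presumably quantizes predicted pitch into
  # n_bins buckets and looks up a pitch embedding, as in FastSpeech 2; it is
  # disabled here.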
synthesis:
  normalize: True # normalize the sound volume