Spaces:
Sleeping
Sleeping
dataset: | |
train: | |
wav_scp: './train/wav.scp' | |
mel_scp: './train/mel.scp' | |
dur_scp: './train/dur.scp' | |
emb_type1: | |
_name: 'pinyin' | |
scp: './train/py.scp' | |
vocab: 'py.vocab' | |
emb_type2: | |
_name: 'graphic' | |
scp: './train/gp.scp' | |
vocab: 'gp.vocab' | |
emb_type3: | |
_name: 'speaker' | |
scp: './train/spk.scp' | |
vocab: # doesn't need vocab | |
training: | |
batch_size: 16 | |
batch_split: 1 | |
epochs: 10000 | |
grad_clip_thresh: 1.0 | |
acc_step: 1 | |
checkpoint_path: "./checkpoints/" | |
log_path: "./log/" | |
checkpoint_step: 5000 | |
synth_step: 5000 | |
log_step: 20 | |
num_workers: 8 | |
evaluation_step: 1000 | |
optimizer: | |
type: Adam | |
n_warm_up_step: 2000 | |
#lr_decrease_step: 10000 | |
#lr_decrease_factor: | |
params: | |
betas: [0.9,0.98] | |
eps: !!float 1e-9 | |
weight_decay: !!float 0.0 | |
lr: !!float 1e-4 | |
lr_scheduler: | |
type: CyclicLR | |
params: | |
base_lr: !!float 1e-7 | |
max_lr: !!float 1e-4 | |
step_size_up: 5000 | |
step_size_down: 8000 | |
cycle_momentum: False | |
vocoder: | |
type: VocGan # choose one of the following | |
MelGAN: | |
checkpoint: ~/checkpoints/melgan/melgan_ljspeech.pth | |
config: ~/checkpoints/melgan/default.yaml | |
device: cpu | |
VocGan: | |
checkpoint: checkpoints #~/checkpoints/ljspeech_29de09d_4000.pt | |
denoise: True | |
device: cpu | |
HiFiGAN: | |
checkpoint: ~/checkpoints/VCTK_V3/generator_v3 # you need to download checkpoint and set the params here | |
device: cpu | |
Waveglow: | |
checkpoint: ~/checkpoints/waveglow_256channels_universal_v5_state_dict.pt | |
sigma: 1.0 | |
denoiser_strength: 0.0 # try 0.1 | |
device: cpu #try cpu if out of memory | |
fbank: | |
sample_rate: 16000 | |
n_fft: 1024 | |
hop_length: 256 | |
win_length: 1024 | |
max_wav_value: 32768.0 | |
n_mels: 80 | |
fmin: 0.0 | |
fmax: 8000.0 # Nyquist limit for sample_rate 16000; 11025 would apply only at 22050 Hz | |
mel_mean: -6.0304103 | |
encoder: | |
encoder_type: 'FS2TransformerEncoder' | |
conf: | |
n_layers: 4 | |
n_heads: 2 | |
hidden_dim: 256 | |
dropout: 0.25 | |
d_inner: 1024 | |
max_len: 2048 | |
decoder: | |
decoder_type: 'FS2TransformerDecoder' | |
input_dim: 256 # should be the same as the output of encoder | |
n_layers: 4 | |
n_heads: 2 | |
hidden_dim: 256 | |
d_inner: 1024 | |
dropout: 0.25 | |
max_len: 2048 # max len of seq, for position embedding pre-computation | |
#(class) Decoder(input_dim: int = 256, n_layers: int = 4, n_heads: int = 2, | |
#hidden_dim: int = 256, d_inner: int = 1024, dropout: float = 0.5, max_len: int = 2048) | |
postnet: | |
postnet_type: 'PostUNet' # 'PostUNet', 'PostNet1d' | |
speaker_embedding: | |
enable: True | |
vocab: #None | |
vocab_size: 1 # | |
weight: 1.0 | |
dim: 256 | |
utterence_embedding: | |
enable: False | |
type: 'lstm' # resnet | |
feature_config: | |
type: 'mel' | |
n_mels: 80 | |
sampling_rate: 22050 | |
n_fft: 1024 | |
hop_length: 256 | |
win_length: 1024 | |
model_config: | |
n_layers: 3 | |
bidirectional: True | |
hanzi_embedding: | |
enable: True | |
type: embedding | |
vocab: './gp.vocab' | |
dim: 256 | |
weight: 0.5 | |
max_seq_len: 100 | |
pinyin_embedding: | |
enable: True | |
type: embedding | |
vocab: './py.vocab' | |
dim: 256 | |
weight: 1.0 | |
max_seq_len: 100 | |
duration_predictor: | |
input_dim: 256 # should be the same as encoder hidden_dim | |
filter_size: 256 | |
kernel_size: 3 | |
dropout: 0.5 | |
duration_mean: 21.517294924096635 #for aishell3 | |
f0_predictor: | |
enable: False | |
filter_size: 256 | |
kernel_size: 3 | |
dropout: 0.5 | |
n_bins: 256 | |
synthesis: | |
normalize: True # normalize the sound volume | |