dataset:
  train:
    wav_scp: './train/wav.scp'
    mel_scp: './train/mel.scp'
    dur_scp: './train/dur.scp'
    emb_type1:
      _name: 'pinyin'
      scp: './train/py.scp'
      vocab: 'py.vocab'
    emb_type2:
      _name: 'graphic'
      scp: './train/gp.scp'
      vocab: 'gp.vocab'
    emb_type3:
      _name: 'speaker'
      scp: './train/spk.scp'
      vocab: ~ # doesn't need a vocab
    # NOTE: you can add more embeddings here without changing the code, as sketched below.
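    # Illustrative only: a fourth embedding could look like this; the 'tone'
    # name and its scp/vocab paths are hypothetical placeholders.
    # emb_type4:
    #   _name: 'tone'
    #   scp: './train/tone.scp'
    #   vocab: 'tone.vocab'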
  eval:
    # NOTE: evaluation is not used for now, i.e., just training, no evaluation.
    # For the time being, you can use synthesize.py to check that training is going well.
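    # Illustrative only: if evaluation were enabled, this block would likely
    # mirror the train block; the './eval/*' paths below are hypothetical.
    # wav_scp: './eval/wav.scp'
    # mel_scp: './eval/mel.scp'
    # dur_scp: './eval/dur.scp'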
training:
  batch_size: 1024
  batch_split: 64
  epochs: 100000
  grad_clip_thresh: 1.0
  acc_step: 1
  checkpoint_path: "./checkpoints/"
  log_path: "./log/"
  checkpoint_step: 5000
  synth_step: 5000
  log_step: 20
  num_workers: 8
  evaluation_step: 1000
  optimizer: # NOTE: if you use SGD instead, params must change too, since SGD takes different arguments (see the commented example below).
    type: Adam
    n_warm_up_step: 2000
    lr_decrease_step: 10000
    lr_decrease_factor:
    params:
      betas: [0.9, 0.98]
      eps: !!float 1e-9
      weight_decay: !!float 0.0
      lr: !!float 1e-4
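    # Illustrative only: if `type` were switched to SGD, `params` would hold
    # torch.optim.SGD arguments instead; the values below are placeholders.
    # params:
    #   lr: !!float 1e-3
    #   momentum: 0.9
    #   weight_decay: !!float 0.0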
  lr_scheduler:
    type: CyclicLR
    params:
      base_lr: !!float 1e-8
      max_lr: !!float 1e-6
      step_size_up: 5000
      step_size_down: 5000
      cycle_momentum: False
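  # Illustrative only: a step-decay alternative such as StepLR would look like
  # this, assuming the training script can map the type name to the scheduler
  # class (hypothetical here).
  # lr_scheduler:
  #   type: StepLR
  #   params:
  #     step_size: 10000
  #     gamma: 0.5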
fbank: # this is used by wav2mel.py
  sample_rate: 22050
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  max_wav_value: 32768.0
  n_mels: 80
  fmin: 0.0
  fmax: 8000.0 # Nyquist at 22050 Hz would be 11025.0; 8000.0 is a common choice for mel features
  mel_mean: -6.0304103
encoder:
  encoder_type: 'FS2TransformerEncoder'
  conf:
    n_layers: 4
    n_heads: 2
    hidden_dim: 256
    dropout: 0.25
    d_inner: 1024
    max_len: 2048
decoder:
  decoder_type: 'FS2TransformerDecoder'
  input_dim: 256 # should be the same as the encoder output dim
  n_layers: 4
  n_heads: 2
  hidden_dim: 256
  d_inner: 1024
  dropout: 0.25
  max_len: 2048 # max sequence length, for position-embedding pre-computation
  # (class) Decoder(input_dim: int = 256, n_layers: int = 4, n_heads: int = 2,
  #                 hidden_dim: int = 256, d_inner: int = 1024, dropout: float = 0.5, max_len: int = 2048)
postnet:
  postnet_type: 'PostUNet' # options: 'PostUNet', 'PostNet1d'
speaker_embedding:
  enable: True
  vocab: # None
  vocab_size: 218 # aishell3 has 218 speakers
  weight: 1.0 # you can play with the weight here
  dim: 256
utterence_embedding:
  enable: False # not implemented
  type: 'lstm' # or 'resnet'
  feature_config:
    type: 'mel'
    n_mels: 80
    sampling_rate: 22050
    n_fft: 1024
    hop_length: 256
    win_length: 1024
hanzi_embedding:
  enable: True
  type: embedding
  vocab: './gp.vocab'
  dim: 256
  weight: 0.5 # you can play with the weight here
  max_seq_len: 100
pinyin_embedding:
  enable: True
  type: embedding
  vocab: './py.vocab'
  dim: 256
  weight: 1.0
  max_seq_len: 100
duration_predictor:
  input_dim: 256 # should be the same as the encoder hidden_dim
  filter_size: 256
  kernel_size: 3
  dropout: 0.15 # important to set dropout here
  duration_mean: 21.517294924096635 # for aishell3
f0_predictor:
  enable: False # currently not supported
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  n_bins: 256
vocoder:
  type: VocGan # choose one of the following
  MelGAN:
    checkpoint: ~/checkpoints/melgan/melgan_ljspeech.pth
    config: ~/checkpoints/melgan/default.yaml
    device: cpu
  VocGan:
    checkpoint: ~/checkpoints/vctk_pretrained_model_3180.pt # or ~/checkpoints/ljspeech_29de09d_4000.pt
    denoise: True
    device: cpu
  HiFiGAN:
    checkpoint: ~/checkpoints/VCTK_V3/generator_v3 # download the checkpoint and set the path here
    device: cpu
  Waveglow:
    checkpoint: ~/checkpoints/waveglow_256channels_universal_v5_state_dict.pt
    sigma: 1.0
    denoiser_strength: 0.0 # try 0.1
    device: cpu # try cpu if out of memory
synthesis:
  normalize: True # normalize the sound volume