dataset:
  train:
    wav_scp: './train/wav.scp'
    mel_scp: './train/mel.scp'
    dur_scp: './train/dur.scp'
    emb_type1:
      _name: 'pinyin'
      scp: './train/py.scp'
      vocab: 'py.vocab'
    emb_type2:
      _name: 'graphic'
      scp: './train/gp.scp'
      vocab: 'gp.vocab'
    emb_type3:
      _name: 'speaker'
      scp: './train/spk.scp'
      vocab: # doesn't need a vocab
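# Note (assumption): the .scp files above are expected to be Kaldi-style script files,
# one "<utt-id> <path-or-value>" pair per line; each emb_type's _name presumably links it
# to the matching *_embedding section further down in this config.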
training:
  batch_size: 16
  batch_split: 1
  epochs: 10000
  grad_clip_thresh: 1.0
  acc_step: 1
  checkpoint_path: "./checkpoints/"
  log_path: "./log/"
  checkpoint_step: 5000
  synth_step: 5000
  log_step: 20
  num_workers: 8
  evaluation_step: 1000
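# Assumption: batch_split and acc_step control batch splitting and gradient accumulation;
# with both set to 1 the effective batch size is simply batch_size (16).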
optimizer:
  type: Adam
  n_warm_up_step: 2000
  #lr_decrease_step: 10000
  #lr_decrease_factor:
  params:
    betas: [0.9, 0.98]
    eps: !!float 1e-9
    weight_decay: !!float 0.0
    lr: !!float 1e-4
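# Assumption: n_warm_up_step ramps the learning rate for the first 2000 steps before the
# lr_scheduler below takes over; the commented-out lr_decrease_* keys look like a retired
# step-decay alternative.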
lr_scheduler:
  type: CyclicLR
  params:
    base_lr: !!float 1e-7
    max_lr: !!float 1e-4
    step_size_up: 5000
    step_size_down: 8000
    cycle_momentum: False
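# With Adam, cycle_momentum must stay False: PyTorch's CyclicLR would otherwise try to cycle
# a 'momentum' parameter that Adam does not expose. One full cycle spans
# step_size_up + step_size_down = 13000 steps, swinging the lr between 1e-7 and 1e-4.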
vocoder:
  type: VocGan # choose one of the following
  MelGAN:
    checkpoint: ~/checkpoints/melgan/melgan_ljspeech.pth
    config: ~/checkpoints/melgan/default.yaml
    device: cpu
  VocGan:
    checkpoint: checkpoints #~/checkpoints/ljspeech_29de09d_4000.pt
    denoise: True
    device: cpu
  HiFiGAN:
    checkpoint: ~/checkpoints/VCTK_V3/generator_v3 # you need to download the checkpoint and set the params here
    device: cpu
  Waveglow:
    checkpoint: ~/checkpoints/waveglow_256channels_universal_v5_state_dict.pt
    sigma: 1.0
    denoiser_strength: 0.0 # try 0.1
    device: cpu # try cpu if out of memory
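# Presumably only the sub-block matching `type` above is loaded at synthesis time
# ("choose one of the following"); the other checkpoints then do not need to exist locally.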
fbank:
  sample_rate: 16000
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  max_wav_value: 32768.0
  n_mels: 80
  fmin: 0.0
  fmax: 8000.0 # should be 11025
  mel_mean: -6.0304103
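# Note: at sample_rate 16000 the Nyquist frequency is 8000 Hz, so fmax 8000.0 is the usable
# upper bound here. mel_mean is presumably used to normalise (or pad) the log-mel targets.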
encoder:
  encoder_type: 'FS2TransformerEncoder'
  conf:
    n_layers: 4
    n_heads: 2
    hidden_dim: 256
    dropout: 0.25
    d_inner: 1024
    max_len: 2048
decoder:
  decoder_type: 'FS2TransformerDecoder'
  input_dim: 256 # should be the same as the output dim of the encoder
  n_layers: 4
  n_heads: 2
  hidden_dim: 256
  d_inner: 1024
  dropout: 0.25
  max_len: 2048 # max len of seq, for position-embedding pre-computation
  #(class) Decoder(input_dim: int = 256, n_layers: int = 4, n_heads: int = 2,
  #                hidden_dim: int = 256, d_inner: int = 1024, dropout: float = 0.5, max_len: int = 2048)
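# As noted above, the decoder's input_dim is kept equal to the encoder hidden_dim (256),
# presumably because the length-regulated encoder states are fed straight into the decoder,
# as in FastSpeech 2.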
postnet:
  postnet_type: 'PostUNet' # 'PostUNet', 'PostNet1d'
speaker_embedding:
  enable: True
  vocab: # None
  vocab_size: 1
  weight: 1.0
  dim: 256
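# Assumption: with vocab left empty, speaker ids are read directly from ./train/spk.scp;
# `weight` presumably scales this embedding before it is combined with the encoder output,
# and `dim` matches the encoder hidden_dim (256) so the two can be summed.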
utterence_embedding:
  enable: False
  type: 'lstm' # or 'resnet'
  feature_config:
    type: 'mel'
    n_mels: 80
    sampling_rate: 22050
    n_fft: 1024
    hop_length: 256
    win_length: 1024
  model_config:
    n_layers: 3
    bidirectional: True
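# Note: this (disabled) utterance-level reference encoder uses sampling_rate 22050 in its
# feature_config, while fbank above uses 16000; presumably the mismatch only matters if
# this branch is enabled.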
hanzi_embedding:
  enable: True
  type: embedding
  vocab: './gp.vocab'
  dim: 256
  weight: 0.5
  max_seq_len: 100
pinyin_embedding:
  enable: True
  type: embedding
  vocab: './py.vocab'
  dim: 256
  weight: 1.0
  max_seq_len: 100
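# Assumption: the hanzi (graphic) and pinyin embeddings share dim 256 so they can be
# combined at the encoder input, with `weight` acting as a per-stream mixing coefficient
# (0.5 for hanzi, 1.0 for pinyin).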
duration_predictor:
  input_dim: 256 # should be the same as the encoder hidden_dim
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  duration_mean: 21.517294924096635 # for aishell3
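# Assumption: duration_mean is the average per-token duration (in frames) measured on
# AISHELL-3, presumably used to centre or initialise the predictor's targets.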
f0_predictor:
  enable: False
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  n_bins: 256
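# Disabled here; when enabled it presumably quantises predicted F0 into n_bins buckets and
# adds a pitch embedding, following the FastSpeech 2 variance-adaptor recipe.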
synthesis:
  normalize: True # normalize the sound volume