---
# Audio, language, model and statistics settings.
#
# NOTE(review): the nesting in this file was inferred from the alphabetical
# key order inside each mapping. The placement of `punct_emb_dim` and
# `resnet` (under `model`) and of `stats` (top level) is consistent with
# that order but not proven by it -- confirm against the consuming code.

audio:
  # NOTE(review): YAML 1.1 loaders (e.g. PyYAML) read `1e-10` as a string
  # because it has no decimal point; YAML 1.2 loaders read it as a float.
  # Value kept as-is -- confirm how the consumer interprets it.
  eps: 1e-10
  fft_size: 2048
  filter_length: 1200
  hop_size: 300
  log_base: 10.0
  mel_fmax: 7600
  mel_fmin: 80
  num_mels: 80
  sampling_rate: 24000
  win_length: 1200
  window: hann

lang:
  - de
  - en

model:
  decoder:
    conv_filter_size: 1024
    conv_kernel_size:
      - 9
      - 1
    dropout: 0.2
    kind: styletts
    n_head: 2
    n_layers: 6
    scln: true
  emb_dim: 512
  emb_reduction: 1
  encoder:
    fs2_dropout: 0.2
    fs2_head: 2
    fs2_layer: 4
    ve_n_bins: 256
    vp_dropout: 0.5
    vp_filter_size: 256
    vp_kernel_size: 3
  max_seq_len: 1500
  # NOTE(review): presumably belongs to `model`; could also be top-level.
  punct_emb_dim: 16
  # NOTE(review): presumably belongs to `model`; could also be top-level.
  resnet:
    encoder_type: ASP
    layers:
      - 3
      - 4
      - 6
      - 3
    num_filters:
      - 32
      - 64
      - 128
      - 256

# NOTE(review): placed at top level; may instead be a child of `model`.
stats:
  energy_max: 493.5418701171875
  energy_min: -2.139316514718655
  pitch_max: 883.609245028834
  pitch_min: 57.520125416548694