# Source: Comparative-Analysis-of-Speech-Synthesis-Models/TensorFlowTTS/tensorflow_tts/configs/fastspeech.py
# -*- coding: utf-8 -*-
# Copyright 2020 Minh Nguyen (@dathudeptrai)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""FastSpeech Config object."""
import collections | |
from tensorflow_tts.configs import BaseConfig | |
from tensorflow_tts.processor.ljspeech import LJSPEECH_SYMBOLS as lj_symbols | |
from tensorflow_tts.processor.kss import KSS_SYMBOLS as kss_symbols | |
from tensorflow_tts.processor.baker import BAKER_SYMBOLS as bk_symbols | |
from tensorflow_tts.processor.libritts import LIBRITTS_SYMBOLS as lbri_symbols | |
from tensorflow_tts.processor.jsut import JSUT_SYMBOLS as jsut_symbols | |
# Immutable record bundling the settings of one self-attention stack.
# FastSpeechConfig builds two of these: one for the encoder and one for
# the decoder. Field order matters for positional construction, so it is
# kept exactly as listed below.
SelfAttentionParams = collections.namedtuple(
    "SelfAttentionParams",
    # namedtuple accepts a whitespace-separated string of field names.
    (
        "n_speakers "
        "hidden_size num_hidden_layers "
        "num_attention_heads attention_head_size "
        "intermediate_size intermediate_kernel_size "
        "hidden_act "
        "output_attentions output_hidden_states "
        "initializer_range "
        "hidden_dropout_prob attention_probs_dropout_prob "
        "layer_norm_eps "
        "max_position_embeddings"
    ),
)
class FastSpeechConfig(BaseConfig):
    """Hyper-parameter container for the FastSpeech model.

    The constructor copies its arguments onto the instance and packs the
    encoder and decoder self-attention settings into two
    ``SelfAttentionParams`` tuples.
    """

    def __init__(
        self,
        dataset="ljspeech",
        vocab_size=len(lj_symbols),
        n_speakers=1,
        encoder_hidden_size=384,
        encoder_num_hidden_layers=4,
        encoder_num_attention_heads=2,
        encoder_attention_head_size=192,
        encoder_intermediate_size=1024,
        encoder_intermediate_kernel_size=3,
        encoder_hidden_act="mish",
        decoder_hidden_size=384,
        decoder_num_hidden_layers=4,
        decoder_num_attention_heads=2,
        decoder_attention_head_size=192,
        decoder_intermediate_size=1024,
        decoder_intermediate_kernel_size=3,
        decoder_hidden_act="mish",
        output_attentions=True,
        output_hidden_states=True,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        max_position_embeddings=2048,
        num_duration_conv_layers=2,
        duration_predictor_filters=256,
        duration_predictor_kernel_sizes=3,
        num_mels=80,
        duration_predictor_dropout_probs=0.1,
        n_conv_postnet=5,
        postnet_conv_filters=512,
        postnet_conv_kernel_sizes=5,
        postnet_dropout_rate=0.1,
        **kwargs
    ):
        """Store the FastSpeech hyper-parameters on the instance.

        Args:
            dataset: Name of the dataset; one of "ljspeech", "kss",
                "baker", "libritts" or "jsut".
            vocab_size: Vocabulary size. Only honoured when ``dataset`` is
                "ljspeech"; for every other dataset the size is the length
                of that dataset's symbol table.
            n_speakers: Stored on the instance and in both attention
                parameter tuples.
            encoder_*: Values placed in ``encoder_self_attention_params``.
            decoder_*: Values placed in ``decoder_self_attention_params``.
            output_attentions, output_hidden_states, hidden_dropout_prob,
            attention_probs_dropout_prob: Shared by both attention
                parameter tuples; not stored as separate attributes.
            initializer_range, layer_norm_eps, max_position_embeddings:
                Stored on the instance and shared by both tuples.
            num_duration_conv_layers, duration_predictor_filters,
            duration_predictor_kernel_sizes,
            duration_predictor_dropout_probs: Duration-predictor settings,
                stored as attributes of the same name.
            num_mels: Stored as ``self.num_mels``.
            n_conv_postnet, postnet_conv_filters,
            postnet_conv_kernel_sizes, postnet_dropout_rate: Postnet
                settings, stored as attributes of the same name.
            **kwargs: Accepted but not used by this constructor.

        Raises:
            ValueError: If ``dataset`` is not one of the supported names.
        """
        # Vocabulary size: "ljspeech" trusts the caller-supplied value,
        # every other dataset derives it from its own symbol table.
        symbol_tables = (
            ("kss", kss_symbols),
            ("baker", bk_symbols),
            ("libritts", lbri_symbols),
            ("jsut", jsut_symbols),
        )
        if dataset == "ljspeech":
            self.vocab_size = vocab_size
        else:
            for known_name, symbols in symbol_tables:
                if dataset == known_name:
                    self.vocab_size = len(symbols)
                    break
            else:
                # No table matched: reject before setting anything else.
                raise ValueError("No such dataset: {}".format(dataset))

        # Model-wide settings.
        self.n_speakers = n_speakers
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.max_position_embeddings = max_position_embeddings

        # Options that are identical for the encoder and the decoder.
        shared_attention_settings = dict(
            n_speakers=n_speakers,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            initializer_range=initializer_range,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            layer_norm_eps=layer_norm_eps,
            max_position_embeddings=max_position_embeddings,
        )

        # Encoder self-attention stack.
        self.encoder_self_attention_params = SelfAttentionParams(
            hidden_size=encoder_hidden_size,
            num_hidden_layers=encoder_num_hidden_layers,
            num_attention_heads=encoder_num_attention_heads,
            attention_head_size=encoder_attention_head_size,
            intermediate_size=encoder_intermediate_size,
            intermediate_kernel_size=encoder_intermediate_kernel_size,
            hidden_act=encoder_hidden_act,
            **shared_attention_settings
        )

        # Decoder self-attention stack.
        self.decoder_self_attention_params = SelfAttentionParams(
            hidden_size=decoder_hidden_size,
            num_hidden_layers=decoder_num_hidden_layers,
            num_attention_heads=decoder_num_attention_heads,
            attention_head_size=decoder_attention_head_size,
            intermediate_size=decoder_intermediate_size,
            intermediate_kernel_size=decoder_intermediate_kernel_size,
            hidden_act=decoder_hidden_act,
            **shared_attention_settings
        )

        # Duration predictor.
        self.num_duration_conv_layers = num_duration_conv_layers
        self.duration_predictor_filters = duration_predictor_filters
        self.duration_predictor_kernel_sizes = duration_predictor_kernel_sizes
        self.duration_predictor_dropout_probs = duration_predictor_dropout_probs

        self.num_mels = num_mels

        # Postnet.
        self.n_conv_postnet = n_conv_postnet
        self.postnet_conv_filters = postnet_conv_filters
        self.postnet_conv_kernel_sizes = postnet_conv_kernel_sizes
        self.postnet_dropout_rate = postnet_dropout_rate