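# Hyperparameters (HyperPyYAML) for a SpeechBrain grapheme-to-phoneme model.
# NOTE(review): the original header lines were lost in extraction; this short
# summary is reconstructed from the keys defined in the rest of the file.
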
# Random seed. `__set_seed` passes it as the single positional argument to
# torch.manual_seed when the file is loaded.
seed: 1234
__set_seed: !apply:torch.manual_seed
    - !ref <seed>

# Tokenization settings.
# `char_*` keys feed the grapheme tokenizer, `phn_*` keys the phoneme one.
# The two *_tokenize switches also select the vocabulary sizes
# (`enc_num_embeddings` / `output_neurons`) through `choice`.

# Grapheme side
char_tokenize: False
char_token_type: unigram
char_token_output: 512
char_token_wordwise: True

# Phoneme side
phn_tokenize: False
phn_token_type: unigram
phn_token_output: 512
phn_token_wordwise: True

# Shared by both SentencePiece tokenizer definitions
character_coverage: 1.0

# Vocabulary sizes used when tokenization is disabled
# (the `False` branches of `output_neurons` and `enc_num_embeddings`).
phonemes_count: 43
graphemes_count: 31
phonemes_enable_space: True

# Loss-related weights.
# `ctc_weight` is also read by the `scorer` definition (its `ctc` weight) and
# `ctc_window_size` by `ctc_scorer`.
ctc_weight: 0.5
ctc_window_size: 0
homograph_loss_weight: 2.0

# Size of the phoneme (output) vocabulary:
#   tokenized     -> phn_token_output + 1
#   not tokenized -> phonemes_count
output_neurons: !apply:speechbrain.utils.hparams.choice
    value: !ref <phn_tokenize>
    choices:
        True: !ref <phn_token_output> + 1
        False: !ref <phonemes_count>

# Size of the grapheme (input) vocabulary, selected the same way
# from char_tokenize.
enc_num_embeddings: !apply:speechbrain.utils.hparams.choice
    value: !ref <char_tokenize>
    choices:
        True: !ref <char_token_output> + 1
        False: !ref <graphemes_count>

# Encoder / decoder sizes consumed by the module definitions below
# (`enc`, `dec`, `lin`, `ctc_lin`, `emb`, `encoder_emb`).

# Encoder
enc_dropout: 0.5
enc_neurons: 512
enc_num_layers: 4

# Decoder
dec_dropout: 0.5
dec_neurons: 512
dec_att_neurons: 256
dec_num_layers: 4

# Shared embedding width for graphemes and phonemes
embedding_dim: 512

grapheme_sequence_mode: bos
phoneme_sequence_mode: bos

# Special token indices.
# blank_index and unk_index hold the same value (2).
bos_index: 0
eos_index: 1
blank_index: 2
unk_index: 2
token_space_index: 512

# Sizes for the `lm_model` (RNNLM) definition.
# lm_output_neurons is a literal, not a reference to output_neurons.
lm_emb_dim: 256
lm_rnn_size: 512
lm_layers: 2
lm_output_neurons: 43

# Beam-search settings read by the beam searcher and scorer definitions.
beam_search_min_decode_ratio: 0
beam_search_max_decode_ratio: 1.0
beam_search_beam_size: 16
beam_search_beam_size_valid: 16
beam_search_eos_threshold: 10.0
beam_search_using_max_attn_shift: false
beam_search_max_attn_shift: 10
beam_search_coverage_penalty: 5.0
# NOTE(review): the next two keys and beam_search_temperature_lm are not
# referenced anywhere else in this file as shown - confirm whether they are
# consumed by external code.
beam_search_lm_weight: 0.5
beam_search_ctc_weight_decode: 0.4
beam_search_temperature: 1.25
beam_search_temperature_lm: 1.0

# Word-embedding settings.
# use_word_emb is forwarded to `enc_input_dim`, `model` and the
# word_emb_pipeline step of `encode_pipeline`.
use_word_emb: true
word_emb_model: bert-base-uncased
word_emb_dim: 768
word_emb_enc_dim: 256
word_emb_norm_type: batch

# Grapheme inventory: the 26 upper-case letters, the apostrophe and the space.
graphemes: [
    A, B, C, D, E, F, G, H, I, J, K, L, M,
    N, O, P, Q, R, S, T, U, V, W, X, Y, Z,
    "'", ' ',
]

# Phoneme inventory (39 symbols) followed by the space token.
phonemes: [
    AA, AE, AH, AO, AW, AY, B, CH, D, DH,
    EH, ER, EY, F, G, HH, IH, IY, JH, K,
    L, M, N, NG, OW, OY, P, R, S, SH,
    T, TH, UH, UW, V, W, Y, Z, ZH,
    ' ',
]

# Encoder input width, computed by g2p.model.input_dim from the embedding
# sizes and the use_word_emb switch.
enc_input_dim: !apply:speechbrain.lobes.models.g2p.model.input_dim
    use_word_emb: !ref <use_word_emb>
    word_emb_enc_dim: !ref <word_emb_enc_dim>
    embedding_dim: !ref <embedding_dim>

# Map built from the phoneme inventory, plus its flipped counterpart.
phn_char_map: !apply:speechbrain.lobes.models.g2p.dataio.build_token_char_map
    tokens: !ref <phonemes>

char_phn_map: !apply:speechbrain.lobes.models.g2p.dataio.flip_map
    map_dict: !ref <phn_char_map>

# Bidirectional LSTM encoder.
enc: !new:speechbrain.nnet.RNN.LSTM
    input_shape:
        - null
        - null
        - !ref <enc_input_dim>
    bidirectional: True
    hidden_size: !ref <enc_neurons>
    num_layers: !ref <enc_num_layers>
    dropout: !ref <enc_dropout>

# Output projection applied to decoder states (no bias term).
lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>
    bias: false

# CTC projection; its input is twice enc_neurons because `enc` is
# bidirectional.
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref 2 * <enc_neurons>
    n_neurons: !ref <output_neurons>

# Grapheme (encoder-side) embedding table.
encoder_emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <enc_num_embeddings>
    embedding_dim: !ref <embedding_dim>

# Phoneme (decoder-side) embedding table.
emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <embedding_dim>

# Attentional GRU decoder with content-based attention.
# enc_dim is twice enc_neurons to match the bidirectional encoder output.
dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <enc_neurons> * 2
    input_size: !ref <embedding_dim>
    rnn_type: gru
    attn_type: content
    dropout: !ref <dec_dropout>
    hidden_size: !ref <dec_neurons>
    attn_dim: !ref <dec_att_neurons>
    num_layers: !ref <dec_num_layers>

# Encoder that projects word embeddings from word_emb_dim to word_emb_enc_dim.
# norm_type now follows the `word_emb_norm_type` hyperparameter instead of
# repeating its value ("batch") as a literal.
word_emb_enc: !new:speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder
    word_emb_dim: !ref <word_emb_dim>
    word_emb_enc_dim: !ref <word_emb_enc_dim>
    norm_type: !ref <word_emb_norm_type>

# Lazily initialised transformer word-embedding model.
# The model name now follows `word_emb_model` ("bert-base-uncased") so the
# two cannot drift apart.
word_emb: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
    init: !name:speechbrain.wordemb.transformer.TransformerWordEmbeddings
        model: !ref <word_emb_model>

# Softmax activation configured with apply_log enabled.
log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: true

# The sequence-to-sequence G2P model, assembled from the modules defined above.
model: !new:speechbrain.lobes.models.g2p.model.AttentionSeq2Seq
    # Encoder side
    encoder_emb: !ref <encoder_emb>
    enc: !ref <enc>
    # Decoder side
    emb: !ref <emb>
    dec: !ref <dec>
    lin: !ref <lin>
    out: !ref <log_softmax>
    # Optional word-embedding branch
    use_word_emb: !ref <use_word_emb>
    word_emb_enc: !ref <word_emb_enc>

# Name -> module mapping exposing the components defined in this file.
modules:
    model: !ref <model>
    enc: !ref <enc>
    encoder_emb: !ref <encoder_emb>
    emb: !ref <emb>
    dec: !ref <dec>
    lin: !ref <lin>
    ctc_lin: !ref <ctc_lin>
    out: !ref <log_softmax>
    word_emb: !ref <word_emb>
    word_emb_enc: !ref <word_emb_enc>

# RNN language model.
# NOTE(review): lm_model is not referenced by `scorer`, `modules` or
# `pretrainer` in this file as shown - confirm whether it is still needed.
lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
    embedding_dim: !ref <lm_emb_dim>
    rnn_layers: !ref <lm_layers>
    rnn_neurons: !ref <lm_rnn_size>
    output_neurons: !ref <lm_output_neurons>
    return_hidden: True

# CTC scorer built on the `ctc_lin` projection.
ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    ctc_fc: !ref <ctc_lin>
    ctc_window_size: !ref <ctc_window_size>

# Coverage scorer over the output vocabulary.
coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
    vocab_size: !ref <output_neurons>

# Combines both scorers with per-scorer weights.
# NOTE(review): the `ctc` weight reads <ctc_weight> (0.5) while a separate
# `beam_search_ctc_weight_decode` (0.4) is defined but unused here - confirm
# which value is intended at decode time.
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers:
        - !ref <coverage_scorer>
        - !ref <ctc_scorer>
    weights:
        coverage: !ref <beam_search_coverage_penalty>
        ctc: !ref <ctc_weight>

# Beam searcher referenced by `decode_pipeline`.
beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
    # Modules
    embedding: !ref <emb>
    decoder: !ref <dec>
    linear: !ref <lin>
    scorer: !ref <scorer>
    # Special tokens
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    # Search settings
    min_decode_ratio: !ref <beam_search_min_decode_ratio>
    max_decode_ratio: !ref <beam_search_max_decode_ratio>
    beam_size: !ref <beam_search_beam_size>
    eos_threshold: !ref <beam_search_eos_threshold>
    using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
    max_attn_shift: !ref <beam_search_max_attn_shift>
    temperature: !ref <beam_search_temperature>

# Second beam searcher with the same modules and settings as `beam_searcher`,
# except for the beam width.
# beam_size now reads the dedicated `beam_search_beam_size_valid` key, which
# was defined but never referenced. Both keys are 16, so behaviour is
# unchanged until one of them is edited.
beam_searcher_valid: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <emb>
    decoder: !ref <dec>
    linear: !ref <lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <beam_search_min_decode_ratio>
    max_decode_ratio: !ref <beam_search_max_decode_ratio>
    beam_size: !ref <beam_search_beam_size_valid>
    eos_threshold: !ref <beam_search_eos_threshold>
    using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
    max_attn_shift: !ref <beam_search_max_attn_shift>
    temperature: !ref <beam_search_temperature>
    scorer: !ref <scorer>

homograph_extractor: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceExtractor

# Keys of the model output.
# NOTE(review): char_lens and encoder_out match the inputs of the first
# decode_pipeline step - confirm this list is what feeds that pipeline.
model_output_keys: [p_seq, char_lens, encoder_out]

# Text encoders for graphemes and phonemes; `encode_pipeline.init` configures
# them with enable_eos_bos.
grapheme_encoder: &id027 !new:speechbrain.dataio.encoder.TextEncoder
phoneme_encoder: &id024 !new:speechbrain.dataio.encoder.TextEncoder

# Lazily initialised SentencePiece tokenizer for graphemes.
grapheme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
    init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
        # Model location and type
        model_dir: grapheme_tokenizer
        model_type: !ref <char_token_type>
        vocab_size: !ref <char_token_output>
        character_coverage: !ref <character_coverage>
        # Special tokens
        bos_id: !ref <bos_index>
        eos_id: !ref <eos_index>
        unk_id: !ref <unk_index>
        # Annotation source
        annotation_train: null
        annotation_read: char
        annotation_format: json
        text_file: grapheme_annotations.txt

# Lazily initialised SentencePiece tokenizer for phonemes.
phoneme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
    init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
        # Model location and type
        model_dir: phoneme_tokenizer
        model_type: !ref <phn_token_type>
        vocab_size: !ref <phn_token_output>
        character_coverage: !ref <character_coverage>
        # Special tokens
        bos_id: !ref <bos_index>
        eos_id: !ref <eos_index>
        unk_id: !ref <unk_index>
        # Annotation source
        annotation_train: null
        annotation_read: phn
        annotation_format: json
        text_file: null

# Decoder used when phoneme tokenization is enabled: detokenizes through
# the phoneme tokenizer and the char -> phoneme map.
out_phoneme_decoder_tok: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
    tokenizer: !ref <phoneme_tokenizer>
    char_map: !ref <char_phn_map>
    token_space_index: !ref <token_space_index>
    wordwise: !ref <phn_token_wordwise>

# Decoder used when phoneme tokenization is disabled: decodes with the
# phoneme text encoder.
out_phoneme_decoder_raw: !name:speechbrain.lobes.models.g2p.dataio.text_decode
    encoder: !ref <phoneme_encoder>

# Picks one of the two decoders above.
# The selector now follows `phn_tokenize` (currently False, so the raw decoder
# is still chosen) instead of a hard-coded `false`. This keeps the decoder
# consistent with `output_neurons`, which is selected by the same switch.
out_phoneme_decoder: !apply:speechbrain.utils.hparams.choice
    value: !ref <phn_tokenize>
    choices:
        True: !ref <out_phoneme_decoder_tok>
        False: !ref <out_phoneme_decoder_raw>

# Pipeline that turns raw text into model inputs.
# NOTE(review): the source lost its indentation. The nesting below assumes
# that `takes` / `provides` belong to the step, and that every other key under
# a `func: !name:` entry is a keyword argument bound to that function -
# confirm against the pipeline loader.
encode_pipeline:
    batch: false
    use_padded_data: true
    output_keys: [grapheme_list, grapheme_encoded_list, grapheme_encoded, word_emb]
    init:
        # Configure both text encoders with their token inventories and the
        # BOS/EOS indices.
        - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
            encoder: !ref <grapheme_encoder>
            tokens: !ref <graphemes>
            bos_index: !ref <bos_index>
            eos_index: !ref <eos_index>
        - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
            encoder: !ref <phoneme_encoder>
            tokens: !ref <phonemes>
            bos_index: !ref <bos_index>
            eos_index: !ref <eos_index>
    steps:
        # txt -> txt_cleaned
        - func: !name:speechbrain.lobes.models.g2p.dataio.clean_pipeline
            graphemes: !ref <graphemes>
          takes: txt
          provides: txt_cleaned
        # txt_cleaned -> grapheme list and encodings
        - func: !name:speechbrain.lobes.models.g2p.dataio.grapheme_pipeline
            grapheme_encoder: !ref <grapheme_encoder>
          takes: txt_cleaned
          provides: [grapheme_list, grapheme_encoded_list, grapheme_encoded_raw]
        # grapheme_encoded_list -> BOS/EOS variants with lengths
        - func: !name:speechbrain.lobes.models.g2p.dataio.add_bos_eos
            encoder: !ref <grapheme_encoder>
          takes: grapheme_encoded_list
          provides: [grapheme_encoded, grapheme_len, grapheme_encoded_eos, grapheme_len_eos]
        # txt + encoded graphemes -> word embeddings
        - func: !name:speechbrain.lobes.models.g2p.dataio.word_emb_pipeline
            word_emb: !ref <word_emb>
            grapheme_encoder: !ref <grapheme_encoder>
            use_word_emb: !ref <use_word_emb>
          takes: [txt, grapheme_encoded, grapheme_len]
          provides: word_emb

# Pipeline that turns model outputs into phoneme sequences.
# NOTE(review): the source lost its indentation. The nesting below assumes
# that `takes` / `provides` belong to the step, and that the remaining keys
# under `func:` are arguments of the tagged callable - confirm against the
# pipeline loader.
decode_pipeline:
    batch: true
    output_keys:
        - phonemes
    steps:
        # Beam search over the encoder output.
        - func: !name:speechbrain.lobes.models.g2p.dataio.beam_search_pipeline
            beam_searcher: !ref <beam_searcher>
          takes:
              - char_lens
              - encoder_out
          provides:
              - hyps
              - scores
        # Convert hypotheses to phonemes. The selector now follows
        # `phn_tokenize` (currently False, so phoneme_decoder_pipeline is
        # still chosen) instead of a hard-coded `false`, keeping the decoder
        # consistent with `output_neurons`.
        - func: !apply:speechbrain.utils.hparams.choice
            value: !ref <phn_tokenize>
            choices:
                True: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
                    tokenizer: !ref <phoneme_tokenizer>
                    char_map: !ref <char_phn_map>
                    token_space_index: !ref <token_space_index>
                    wordwise: !ref <phn_token_wordwise>
                False: !name:speechbrain.lobes.models.g2p.dataio.phoneme_decoder_pipeline
                    phoneme_encoder: !ref <phoneme_encoder>
          takes:
              - hyps
          provides:
              - phonemes

# Pretrainer that loads parameters for the seq2seq model and the CTC
# projection.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        ctc_lin: !ref <ctc_lin>