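# ############################################################################
# Hyperparameters: attention-based sequence-to-sequence grapheme-to-phoneme
# (G2P) conversion with SpeechBrain, using joint CTC training, an RNN
# language model, and BERT word embeddings as auxiliary encoder input.
# ############################################################################

# The seed is set at the top of the file so every object instantiated below
# is created with the same random state.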
seed: 1234
__set_seed: !apply:torch.manual_seed [1234]
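
# Character/phoneme tokenization (SentencePiece). Both flags are disabled
# here, so the model operates directly on the raw grapheme and phoneme
# inventories defined below rather than on learned subword tokens.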
char_tokenize: false
char_token_type: unigram
char_token_output: 512
char_token_wordwise: true
phn_tokenize: false
phn_token_type: unigram
phn_token_output: 512
phn_token_wordwise: true
character_coverage: 1.0
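
# Symbol inventory sizes. The counts include the three special tokens defined
# further down (bos=0, eos=1, blank/unk=2): 40 phoneme symbols + 3 = 43, and
# 28 grapheme symbols + 3 = 31.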
phonemes_count: 43
graphemes_count: 31
phonemes_enable_space: true
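
# Training schedule. Training proceeds in three curriculum stages (lexicon,
# sentence, and homograph disambiguation), each with its own epoch budget,
# early-stopping limits, and batch size. CTC loss is applied jointly during
# the first *_ctc_epochs of each stage.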
lexicon_epochs: 50
lexicon_ctc_epochs: 10
lexicon_limit_to_stop: 50
lexicon_limit_warmup: 50

sentence_epochs: 13
sentence_ctc_epochs: 10
sentence_limit_to_stop: 3
sentence_limit_warmup: 3

homograph_epochs: 50
homograph_ctc_epochs: 10
homograph_limit_to_stop: 5
homograph_limit_warmup: 10

lexicon_batch_size: 1024
sentence_batch_size: 32
homograph_batch_size: 32

ctc_weight: 0.5
ctc_window_size: 0
homograph_loss_weight: 2.0
lr: 0.002
save_for_pretrained: true
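
# Model hyperparameters. Output-layer and encoder-embedding sizes are picked
# with a flag-based choice: 513 (512 subword tokens plus the space token at
# index 512) when tokenization is enabled, otherwise the raw phoneme (43) and
# grapheme (31) inventory sizes. Both flags are false in this configuration.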
output_neurons: &id004 !apply:speechbrain.utils.hparams.choice
  value: false
  choices:
    true: 513
    false: 43

enc_num_embeddings: &id005 !apply:speechbrain.utils.hparams.choice
  value: false
  choices:
    true: 513
    false: 31

enc_dropout: 0.5
enc_neurons: 512
enc_num_layers: 4
dec_dropout: 0.5
dec_neurons: 512
dec_att_neurons: 256
dec_num_layers: 4
embedding_dim: 512
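
# Sequence construction and special tokens. Grapheme and phoneme sequences
# are produced in BOS mode; the indexes below are shared by the encoders,
# the tokenizers, and CTC (blank and unk share index 2).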
grapheme_sequence_mode: bos
phoneme_sequence_mode: bos

bos_index: 0
eos_index: 1
blank_index: 2
unk_index: 2
token_space_index: 512
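
# RNN language model hyperparameters; the matching decoding weight is set
# under the beam search parameters below (beam_search_lm_weight).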
lm_emb_dim: 256
lm_rnn_size: 512
lm_layers: 2
lm_output_neurons: 43
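
# Beam search decoding parameters, shared by the training-time and validation
# searchers defined later in this file.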
beam_search_min_decode_ratio: 0
beam_search_max_decode_ratio: 1.0
beam_search_beam_size: 16
beam_search_beam_size_valid: 16
beam_search_eos_threshold: 10.0
beam_search_using_max_attn_shift: false
beam_search_max_attn_shift: 10
beam_search_coverage_penalty: 5.0
beam_search_lm_weight: 0.5
beam_search_ctc_weight_decode: 0.4
beam_search_temperature: 1.25
beam_search_temperature_lm: 1.0
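
# Word embeddings: pretrained BERT vectors (768-dim) are projected down to
# 256 dimensions and fed to the encoder as additional per-word context.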
use_word_emb: true
word_emb_model: bert-base-uncased
word_emb_dim: 768
word_emb_enc_dim: 256
word_emb_norm_type: batch
graphemes: &id028
- A
- B
- C
- D
- E
- F
- G
- H
- I
- J
- K
- L
- M
- N
- O
- P
- Q
- R
- S
- T
- U
- V
- W
- X
- Y
- Z
- "'"
- ' '
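
# Phoneme inventory: the 39 ARPABET phonemes without stress markers, plus the
# space character.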
phonemes: &id001
- AA
- AE
- AH
- AO
- AW
- AY
- B
- CH
- D
- DH
- EH
- ER
- EY
- F
- G
- HH
- IH
- IY
- JH
- K
- L
- M
- N
- NG
- OW
- OY
- P
- R
- S
- SH
- T
- TH
- UH
- UW
- V
- W
- Y
- Z
- ZH
- ' '
enc_input_dim: &id003 !apply:speechbrain.lobes.models.g2p.model.input_dim
  use_word_emb: true
  word_emb_enc_dim: 256
  embedding_dim: 512
phn_char_map: &id002 !apply:speechbrain.lobes.models.g2p.dataio.build_token_char_map
  tokens: *id001

char_phn_map: &id023 !apply:speechbrain.lobes.models.g2p.dataio.flip_map
  map_dict: *id002
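
# Network components. The 4-layer bidirectional LSTM encoder produces
# 1024-dim states (2 x 512), which is why dec.enc_dim and ctc_lin.input_size
# are 1024; the decoder is a content-based attentional GRU.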
enc: &id006 !new:speechbrain.nnet.RNN.LSTM
  input_shape: [null, null, *id003]
  bidirectional: true
  hidden_size: 512
  num_layers: 4
  dropout: 0.5

lin: &id010 !new:speechbrain.nnet.linear.Linear
  input_size: 512
  n_neurons: *id004
  bias: false

ctc_lin: &id013 !new:speechbrain.nnet.linear.Linear
  input_size: 1024
  n_neurons: *id004

encoder_emb: &id007 !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: *id005
  embedding_dim: 512

emb: &id008 !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: *id004
  embedding_dim: 512

dec: &id009 !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
  enc_dim: 1024
  input_size: 512
  rnn_type: gru
  attn_type: content
  dropout: 0.5
  hidden_size: 512
  attn_dim: 256
  num_layers: 4

word_emb_enc: &id012 !new:speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder
  word_emb_dim: 768
  word_emb_enc_dim: 256
  norm_type: batch

word_emb: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
  init: !name:speechbrain.wordemb.transformer.TransformerWordEmbeddings
    model: bert-base-uncased

log_softmax: &id011 !new:speechbrain.nnet.activations.Softmax
  apply_log: true
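
# Module registry used by the Brain class. The seq2seq model is anchored
# inside `modules` and aliased as the top-level `model` key below.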
modules:
  model: &id014 !new:speechbrain.lobes.models.g2p.model.AttentionSeq2Seq
    enc: *id006
    encoder_emb: *id007
    emb: *id008
    dec: *id009
    lin: *id010
    out: *id011
    use_word_emb: true
    word_emb_enc: *id012
  enc: *id006
  encoder_emb: *id007
  emb: *id008
  dec: *id009
  lin: *id010
  ctc_lin: *id013
  out: *id011
  word_emb: !ref <word_emb>
  word_emb_enc: *id012

model: *id014
lm_model: &id015 !new:speechbrain.lobes.models.RNNLM.RNNLM
  embedding_dim: 256
  rnn_layers: 2
  rnn_neurons: 512
  output_neurons: 43
  return_hidden: true

opt_class: !name:torch.optim.Adam
  lr: 0.002
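
# Beam search scoring: a coverage penalty and joint CTC scoring are combined
# by the ScorerBuilder and attached to both searchers.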
ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
  eos_index: !ref <eos_index>
  blank_index: !ref <blank_index>
  ctc_fc: !ref <ctc_lin>
  ctc_window_size: !ref <ctc_window_size>

coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
  vocab_size: !ref <output_neurons>

scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
  full_scorers: [!ref <coverage_scorer>, !ref <ctc_scorer>]
  weights:
    coverage: !ref <beam_search_coverage_penalty>
    ctc: !ref <ctc_weight>

beam_searcher: &id029 !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: *id008
  decoder: *id009
  linear: *id010
  bos_index: !ref <bos_index>
  eos_index: !ref <eos_index>
  min_decode_ratio: 0
  max_decode_ratio: 1.0
  beam_size: 16
  eos_threshold: 10.0
  using_max_attn_shift: false
  max_attn_shift: 10
  scorer: !ref <scorer>

beam_searcher_valid: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: *id008
  decoder: *id009
  linear: *id010
  bos_index: !ref <bos_index>
  eos_index: !ref <eos_index>
  min_decode_ratio: 0
  max_decode_ratio: 1.0
  beam_size: 16
  eos_threshold: 10.0
  using_max_attn_shift: false
  max_attn_shift: 10
  scorer: !ref <scorer>

lr_annealing: &id018 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 0.002
  improvement_threshold: 0.0
  annealing_factor: 0.8
  patient: 0
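
# Losses and metrics. NLL with label smoothing is the main sequence cost; the
# homograph subsequence loss reuses it on the extracted homograph span and is
# scaled by homograph_loss_weight (2.0) during the homograph stage.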
homograph_extractor: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceExtractor

seq_cost: &id016 !name:speechbrain.nnet.losses.nll_loss
  label_smoothing: 0.1

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: 2

seq_cost_metric: &id017 !name:speechbrain.nnet.losses.nll_loss
  label_smoothing: 0.1
  reduction: batch

homograph_cost: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceLoss
  seq_cost: *id016

seq_stats: !name:speechbrain.utils.metric_stats.MetricStats
  metric: *id017

seq_stats_homograph: !name:speechbrain.utils.metric_stats.MetricStats
  metric: *id017

classification_stats_homograph: !name:speechbrain.utils.metric_stats.ClassificationStats

per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats

per_stats_homograph: !name:speechbrain.utils.metric_stats.ErrorRateStats

model_output_keys:
- p_seq
- char_lens
- encoder_out

grapheme_encoder: &id027 !new:speechbrain.dataio.encoder.TextEncoder
phoneme_encoder: &id024 !new:speechbrain.dataio.encoder.TextEncoder
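
# SentencePiece tokenizers (lazily initialized). These are only exercised
# when char_tokenize/phn_tokenize are enabled; both are false in this
# configuration.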
grapheme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
  init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
    model_dir: grapheme_tokenizer
    bos_id: 0
    eos_id: 1
    unk_id: 2
    vocab_size: 512
    annotation_train: tokenizer_annotation_train.json
    annotation_read: char
    model_type: unigram
    character_coverage: 1.0
    annotation_format: json
    text_file: grapheme_annotations.txt

phoneme_tokenizer: &id022 !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
  init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
    model_dir: phoneme_tokenizer
    bos_id: 0
    eos_id: 1
    unk_id: 2
    vocab_size: 512
    annotation_train: tokenizer_annotation_train.json
    annotation_read: phn
    model_type: unigram
    character_coverage: 1.0
    annotation_list_to_check: [tokenizer_annotation_valid.json]
    annotation_format: json
    text_file: phoneme_annotations.txt

out_phoneme_decoder_tok: &id025 !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
  tokenizer: *id022
  char_map: *id023
  token_space_index: 512
  wordwise: true

out_phoneme_decoder_raw: &id026 !name:speechbrain.lobes.models.g2p.dataio.text_decode
  encoder: *id024

out_phoneme_decoder: !apply:speechbrain.utils.hparams.choice
  value: false
  choices:
    true: *id025
    false: *id026
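
# Data pipelines. encode_pipeline turns raw text into encoded grapheme
# sequences plus word embeddings; decode_pipeline runs the beam searcher on
# encoder output and converts hypotheses back into phoneme labels.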
encode_pipeline:
  batch: false
  use_padded_data: true
  output_keys:
  - grapheme_list
  - grapheme_encoded_list
  - grapheme_encoded
  - word_emb
  init:
  - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
      encoder: *id027
      tokens: *id028
      bos_index: 0
      eos_index: 1
  - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
      encoder: *id024
      tokens: *id001
      bos_index: 0
      eos_index: 1
  steps:
  - func: !name:speechbrain.lobes.models.g2p.dataio.clean_pipeline
      graphemes: *id028
    takes: txt
    provides: txt_cleaned
  - func: !name:speechbrain.lobes.models.g2p.dataio.grapheme_pipeline
      grapheme_encoder: *id027
    takes: txt_cleaned
    provides:
    - grapheme_list
    - grapheme_encoded_list
    - grapheme_encoded_raw
  - func: !name:speechbrain.lobes.models.g2p.dataio.add_bos_eos
      encoder: *id027
    takes: grapheme_encoded_list
    provides:
    - grapheme_encoded
    - grapheme_len
    - grapheme_encoded_eos
    - grapheme_len_eos
  - func: !name:speechbrain.lobes.models.g2p.dataio.word_emb_pipeline
      word_emb: !ref <word_emb>
      grapheme_encoder: !ref <grapheme_encoder>
      use_word_emb: !ref <use_word_emb>
    takes:
    - txt
    - grapheme_encoded
    - grapheme_len
    provides: word_emb

decode_pipeline:
  batch: true
  output_keys:
  - phonemes
  steps:
  - func: !name:speechbrain.lobes.models.g2p.dataio.beam_search_pipeline
      beam_searcher: *id029
    takes:
    - char_lens
    - encoder_out
    provides:
    - hyps
    - scores
  - func: !apply:speechbrain.utils.hparams.choice
      value: false
      choices:
        true: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
          tokenizer: *id022
          char_map: *id023
          token_space_index: 512
          wordwise: true
        false: !name:speechbrain.lobes.models.g2p.dataio.phoneme_decoder_pipeline
          phoneme_encoder: *id024
    takes:
    - hyps
    provides:
    - phonemes
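
# Pretrainer used to fetch and load the trained weights for inference
# (save_for_pretrained above writes them out).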
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    model: *id014
    ctc_lin: *id013
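
# A minimal loading sketch (an assumption for illustration: the file is saved
# as hyperparams.yaml and parsed with the HyperPyYAML API that SpeechBrain
# configs use; any top-level key can then be looked up on the result):
#
#   from hyperpyyaml import load_hyperpyyaml
#
#   with open("hyperparams.yaml") as f:
#       hparams = load_hyperpyyaml(f)
#   searcher = hparams["beam_searcher"]  # fully constructed S2SRNNBeamSearcher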