Adel-Moumen commited on
Commit
e2eb540
1 Parent(s): 2d6235d

Create hyperparams_develop.yaml

Browse files
Files changed (1) hide show
  1. hyperparams_develop.yaml +161 -0
hyperparams_develop.yaml ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ############################################################################
2
+ # Model: E2E ASR with Transformer
3
+ # Encoder: Transformer Encoder
4
+ # Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch + TransformerLM
5
+ # Tokens: unigram
6
+ # losses: CTC + KLdiv (Label Smoothing loss)
7
+ # Training: Librispeech 960h
8
+ # Authors: Jianyuan Zhong, Titouan Parcollet 2021
9
+ # ############################################################################
10
+
11
+ # Feature parameters
12
+ sample_rate: 16000
13
+ n_fft: 512
14
+ n_mels: 80
15
+
16
+ ####################### Model parameters ###########################
17
+ # Transformer
18
+ d_model: 512
19
+ nhead: 8
20
+ num_encoder_layers: 12
21
+ num_decoder_layers: 6
22
+ d_ffn: 2048
23
+ transformer_dropout: 0.1
24
+ activation: !name:torch.nn.GELU
25
+ output_neurons: 5000
26
+
27
+ # Outputs
28
+ blank_index: 0
29
+ label_smoothing: 0.1
30
+ pad_index: 0
31
+ bos_index: 1
32
+ eos_index: 2
33
+
34
+ # Decoding parameters
35
+ min_decode_ratio: 0.0
36
+ max_decode_ratio: 1.0
37
+ test_beam_size: 66
38
+ lm_weight: 0.60
39
+ ctc_weight_decode: 0.40
40
+
41
+ ############################## models ################################
42
+
43
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
44
+ input_shape: (8, 10, 80)
45
+ num_blocks: 2
46
+ num_layers_per_block: 1
47
+ out_channels: (64, 32)
48
+ kernel_sizes: (3, 3)
49
+ strides: (2, 2)
50
+ residuals: (False, False)
51
+
52
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
53
+ input_size: 640
54
+ tgt_vocab: !ref <output_neurons>
55
+ d_model: !ref <d_model>
56
+ nhead: !ref <nhead>
57
+ num_encoder_layers: !ref <num_encoder_layers>
58
+ num_decoder_layers: !ref <num_decoder_layers>
59
+ d_ffn: !ref <d_ffn>
60
+ dropout: !ref <transformer_dropout>
61
+ activation: !ref <activation>
62
+ encoder_module: conformer
63
+ attention_type: RelPosMHAXL
64
+ normalize_before: True
65
+ causal: False
66
+
67
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
68
+ input_size: !ref <d_model>
69
+ n_neurons: !ref <output_neurons>
70
+
71
+ seq_lin: !new:speechbrain.nnet.linear.Linear
72
+ input_size: !ref <d_model>
73
+ n_neurons: !ref <output_neurons>
74
+
75
+ transformerlm_scorer: !new:speechbrain.decoders.scorer.TransformerLMScorer
76
+ language_model: !ref <lm_model>
77
+ temperature: 1.15
78
+
79
+ ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
80
+ eos_index: !ref <eos_index>
81
+ blank_index: !ref <blank_index>
82
+ ctc_fc: !ref <ctc_lin>
83
+
84
+ scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
85
+ full_scorers: [!ref <transformerlm_scorer>, !ref <ctc_scorer>]
86
+ weights:
87
+ transformerlm: !ref <lm_weight>
88
+ ctc: !ref <ctc_weight_decode>
89
+
90
+
91
+ # NOTE(review): <scorer> is built above but is never referenced by this decoder;
+ # the scorer-based SpeechBrain decoding API expects `scorer: !ref <scorer>` to be
+ # passed to the beam searcher — confirm before using this develop config.
+ decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
92
+ modules: [!ref <Transformer>, !ref <seq_lin>]
93
+ bos_index: !ref <bos_index>
94
+ eos_index: !ref <eos_index>
95
+ min_decode_ratio: !ref <min_decode_ratio>
96
+ max_decode_ratio: !ref <max_decode_ratio>
97
+ beam_size: !ref <test_beam_size>
98
+ temperature: 1.15
99
+ using_eos_threshold: False
100
+ length_normalization: True
101
+
102
+ log_softmax: !new:torch.nn.LogSoftmax
103
+ dim: -1
104
+
105
+ normalizer: !new:speechbrain.processing.features.InputNormalization
106
+ norm_type: global
107
+
108
+ compute_features: !new:speechbrain.lobes.features.Fbank
109
+ sample_rate: !ref <sample_rate>
110
+ n_fft: !ref <n_fft>
111
+ n_mels: !ref <n_mels>
112
+
113
+ # This is the Transformer LM that is used according to the HuggingFace repository.
114
+ # Visit the HuggingFace model corresponding to the pretrained_lm_tokenizer_path
115
+ # for more details about the model.
116
+ # NB: It has to match the pre-trained TransformerLM!!
117
+ lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM
118
+ vocab: 5000
119
+ d_model: 768
120
+ nhead: 12
121
+ num_encoder_layers: 12
122
+ num_decoder_layers: 0
123
+ d_ffn: 3072
124
+ dropout: 0.0
125
+ activation: !name:torch.nn.GELU
126
+ normalize_before: False
127
+
128
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
129
+
130
+ Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
131
+ transformer: !ref <Transformer>
132
+
133
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
134
+ input_shape: [null, null, !ref <n_mels>]
135
+ compute_features: !ref <compute_features>
136
+ normalize: !ref <normalizer>
137
+ cnn: !ref <CNN>
138
+ transformer_encoder: !ref <Tencoder>
139
+
140
+ # Models
141
+ asr_model: !new:torch.nn.ModuleList
142
+ - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
143
+
144
+ modules:
145
+ compute_features: !ref <compute_features>
146
+ normalizer: !ref <normalizer>
147
+ pre_transformer: !ref <CNN>
148
+ transformer: !ref <Transformer>
149
+ asr_model: !ref <asr_model>
150
+ lm_model: !ref <lm_model>
151
+ encoder: !ref <encoder>
152
+ decoder: !ref <decoder>
153
+
154
+ # The pretrainer allows a mapping between pretrained files and instances that
155
+ # are declared in the yaml.
156
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
157
+ loadables:
158
+ normalizer: !ref <normalizer>
159
+ asr: !ref <asr_model>
160
+ lm: !ref <lm_model>
161
+ tokenizer: !ref <tokenizer>