ddwkim committed on
Commit
041e8d6
1 Parent(s): c0b9425

Create hyperparams.yaml

Browse files
Files changed (1) hide show
  1. hyperparams.yaml +162 -0
hyperparams.yaml ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ############################################################################
2
+ # Model: E2E ASR with Transformer
3
+ # Encoder: Branchformer Encoder
4
+ # Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch + TransformerLM
5
+ # Tokens: unigram
6
+ # losses: CTC + KLdiv (Label Smoothing loss)
7
+ # Training: KsponSpeech 965.2h
8
+ # Based on the works of: Adel Moumen 2023
9
+ # Authors: Dong Won Kim 2024
10
+ # ############################################################################
11
+
12
+ # Feature parameters
13
+ sample_rate: 16000
14
+ n_fft: 512
15
+ n_mels: 80
16
+ win_length: 32
17
+
18
+ ####################### Model parameters ###########################
19
+ # Transformer
20
+ d_model: 256
21
+ nhead: 4
22
+ num_encoder_layers: 12
23
+ num_decoder_layers: 6
24
+ csgu_linear_units: 2048
25
+ csgu_kernel_size: 31
26
+ activation: !name:torch.nn.GELU
27
+ output_neurons: 5000
28
+
29
+ # Decoding parameters
30
+ min_decode_ratio: 0.0
31
+ max_decode_ratio: 1.0
32
+ beam_size: 20
33
+ lm_weight: 0.25
34
+ ctc_weight_decode: 0.60
35
+
36
+ # Outputs
37
+ blank_index: 0
38
+ label_smoothing: 0.0
39
+ pad_index: 0
40
+ bos_index: 1
41
+ eos_index: 2
42
+
43
+ ############################## models ################################
44
+
45
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
46
+ input_shape: (8, 10, 80)
47
+ num_blocks: 2
48
+ num_layers_per_block: 1
49
+ out_channels: (64, 32)
50
+ kernel_sizes: (3, 3)
51
+ strides: (2, 2)
52
+ residuals: (False, False)
53
+
54
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
55
+ input_size: 640
56
+ tgt_vocab: !ref <output_neurons>
57
+ d_model: !ref <d_model>
58
+ nhead: !ref <nhead>
59
+ num_encoder_layers: !ref <num_encoder_layers>
60
+ num_decoder_layers: !ref <num_decoder_layers>
61
+ dropout: !ref <transformer_dropout>
62
+ activation: !ref <activation>
63
+ branchformer_activation: !ref <activation>
64
+ encoder_module: branchformer
65
+ csgu_linear_units: !ref <csgu_linear_units>
66
+ kernel_size: !ref <csgu_kernel_size>
67
+ attention_type: RelPosMHAXL
68
+ normalize_before: True
69
+ causal: False
70
+
71
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
72
+ input_size: !ref <d_model>
73
+ n_neurons: !ref <output_neurons>
74
+
75
+ seq_lin: !new:speechbrain.nnet.linear.Linear
76
+ input_size: !ref <d_model>
77
+ n_neurons: !ref <output_neurons>
78
+
79
+ transformerlm_scorer: !new:speechbrain.decoders.scorer.TransformerLMScorer
80
+ language_model: !ref <lm_model>
81
+ temperature: 1.30
82
+
83
+ ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
84
+ eos_index: !ref <eos_index>
85
+ blank_index: !ref <blank_index>
86
+ ctc_fc: !ref <ctc_lin>
87
+
88
+ scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
89
+ full_scorers: [!ref <transformerlm_scorer>, !ref <ctc_scorer>]
90
+ weights:
91
+ transformerlm: !ref <lm_weight>
92
+ ctc: !ref <ctc_weight_decode>
93
+
94
+ decoder: !new:speechbrain.decoders.S2STransformerBeamSearcher
95
+ modules: [!ref <Transformer>, !ref <seq_lin>]
96
+ bos_index: !ref <bos_index>
97
+ eos_index: !ref <eos_index>
98
+ min_decode_ratio: !ref <min_decode_ratio>
99
+ max_decode_ratio: !ref <max_decode_ratio>
100
+ beam_size: !ref <beam_size>
101
+ temperature: 1.30
102
+ using_eos_threshold: False
103
+ length_normalization: True
104
+ scorer: !ref <scorer>
105
+
106
+ log_softmax: !new:torch.nn.LogSoftmax
107
+ dim: -1
108
+
109
+ normalizer: !new:speechbrain.processing.features.InputNormalization
110
+ norm_type: global
111
+
112
+ compute_features: !new:speechbrain.lobes.features.Fbank
113
+ sample_rate: !ref <sample_rate>
114
+ n_fft: !ref <n_fft>
115
+ win_length: !ref <win_length>
116
+ n_mels: !ref <n_mels>
117
+
118
+ lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM
119
+ vocab: 5000
120
+ d_model: 768
121
+ nhead: 12
122
+ num_encoder_layers: 12
123
+ num_decoder_layers: 0
124
+ d_ffn: 3072
125
+ dropout: 0.0
126
+ activation: !name:torch.nn.GELU
127
+ normalize_before: False
128
+
129
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
130
+
131
+ Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
132
+ transformer: !ref <Transformer>
133
+
134
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
135
+ input_shape: [null, null, !ref <n_mels>]
136
+ compute_features: !ref <compute_features>
137
+ normalize: !ref <normalizer>
138
+ cnn: !ref <CNN>
139
+ transformer_encoder: !ref <Tencoder>
140
+
141
+ # Models
142
+ asr_model: !new:torch.nn.ModuleList
143
+ - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
144
+
145
+ modules:
146
+ compute_features: !ref <compute_features>
147
+ normalizer: !ref <normalizer>
148
+ pre_transformer: !ref <CNN>
149
+ transformer: !ref <Transformer>
150
+ asr_model: !ref <asr_model>
151
+ lm_model: !ref <lm_model>
152
+ encoder: !ref <encoder>
153
+ decoder: !ref <decoder>
154
+
155
+ # The pretrainer allows a mapping between pretrained files and instances that
156
+ # are declared in the yaml.
157
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
158
+ loadables:
159
+ normalizer: !ref <normalizer>
160
+ asr: !ref <asr_model>
161
+ lm: !ref <lm_model>
162
+ tokenizer: !ref <tokenizer>