# File size: 4,837 Bytes
# 041e8d6
# ############################################################################
# Model: E2E ASR with Transformer
# Encoder: Branchformer Encoder
# Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch + TransformerLM
# Tokens: unigram
# losses: CTC + KLdiv (Label Smoothing loss)
# Training: KsponSpeech 965.2h
# Based on the works of: Adel Moumen 2023
# Authors: Dong Won Kim 2024
# ############################################################################
# Feature parameters
# These four values are forwarded verbatim to the `compute_features` (Fbank)
# entry declared further down in this file.
sample_rate: 16000
n_fft: 512
n_mels: 80
# NOTE(review): presumably milliseconds (32 ms at 16 kHz = 512 samples, which
# equals n_fft) -- confirm against the Fbank documentation.
win_length: 32
####################### Model parameters ###########################
# Transformer
# Shared hyperparameters consumed through `!ref <...>` by the Transformer,
# ctc_lin and seq_lin entries of this file.
d_model: 256
nhead: 4
num_encoder_layers: 12
num_decoder_layers: 6
csgu_linear_units: 2048
csgu_kernel_size: 31
# The Transformer entry reads `dropout: !ref <transformer_dropout>`, but this
# key was never declared, so the reference could not be resolved at load time.
# 0.0 matches the `dropout: 0.0` already used by lm_model in this file.
# NOTE(review): confirm against the training recipe if this file is ever
# reused for fine-tuning rather than inference.
transformer_dropout: 0.0
# `!name:` passes the class itself (not an instance) to the consumer.
activation: !name:torch.nn.GELU
# Size of the output token inventory; used for tgt_vocab and both linear heads.
output_neurons: 5000
# Decoding parameters
# min/max_decode_ratio and beam_size are forwarded to the `decoder` entry.
min_decode_ratio: 0.0
max_decode_ratio: 1.0
beam_size: 20
# Weight given to the `transformerlm` scorer inside the `scorer` entry.
lm_weight: 0.25
# Weight given to the `ctc` scorer inside the `scorer` entry.
ctc_weight_decode: 0.60
# Outputs
# Special token indices. blank_index and pad_index share the value 0.
# blank_index feeds ctc_scorer; bos_index/eos_index feed the decoder, and
# eos_index also feeds ctc_scorer.
blank_index: 0
# NOTE(review): label_smoothing and pad_index are not referenced anywhere
# else in this file -- presumably kept for parity with the training recipe.
label_smoothing: 0.0
pad_index: 0
bos_index: 1
eos_index: 2
############################## models ################################
# Convolutional front-end: two blocks with one layer each, both with kernel
# size 3, stride 2 and no residual connection.
# NOTE(review): the Transformer entry in this file declares input_size: 640,
# which is consistent with 32 output channels x (80 / 2 / 2) feature bins --
# confirm against ConvolutionFrontEnd before changing strides or channels.
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (64, 32)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)
# ASR transformer. The encoder is selected by `encoder_module: branchformer`;
# sizes, depth and activation come from the model parameters of this file.
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR  # yamllint disable-line rule:line-length
    # Width of the features fed to the transformer (see the CNN entry).
    input_size: 640
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    dropout: !ref <transformer_dropout>
    # The same activation class is used for both activation arguments.
    activation: !ref <activation>
    branchformer_activation: !ref <activation>
    encoder_module: branchformer
    csgu_linear_units: !ref <csgu_linear_units>
    kernel_size: !ref <csgu_kernel_size>
    attention_type: RelPosMHAXL
    normalize_before: True
    causal: False
# Linear head mapping d_model features to output_neurons; it is handed to
# ctc_scorer as `ctc_fc`.
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
# Linear head mapping d_model features to output_neurons; it is the second
# module handed to the beam-search decoder.
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
# Scorer wrapping the Transformer LM (`lm_model`) declared later in this file.
transformerlm_scorer: !new:speechbrain.decoders.scorer.TransformerLMScorer
    language_model: !ref <lm_model>
    # Same value as the decoder's own temperature.
    temperature: 1.30
# CTC scorer; it reuses the `ctc_lin` head as its `ctc_fc` layer.
ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    ctc_fc: !ref <ctc_lin>
# Combines the LM scorer and the CTC scorer and assigns each its weight.
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers:
        - !ref <transformerlm_scorer>
        - !ref <ctc_scorer>
    weights:
        transformerlm: !ref <lm_weight>
        ctc: !ref <ctc_weight_decode>
# Beam-search decoder over the Transformer and the seq_lin head; rescoring is
# delegated to the `scorer` entry of this file.
decoder: !new:speechbrain.decoders.S2STransformerBeamSearcher
    modules:
        - !ref <Transformer>
        - !ref <seq_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <beam_size>
    # Same value as the LM scorer's temperature.
    temperature: 1.30
    using_eos_threshold: False
    length_normalization: True
    scorer: !ref <scorer>
# Log-softmax over the last dimension.
log_softmax: !new:torch.nn.LogSoftmax
    dim: -1
# Input normalisation with norm_type `global`; this object is also listed in
# the pretrainer's loadables in this file.
normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
# Filterbank feature extractor, configured entirely from the feature
# parameters at the top of this file.
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    win_length: !ref <win_length>
    n_mels: !ref <n_mels>
# Transformer language model used by transformerlm_scorer during beam search.
# Encoder-only configuration: 12 encoder layers and 0 decoder layers.
lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM
    # Tied to the ASR output size instead of repeating the literal 5000: the
    # LM and the seq_lin/ctc_lin heads are scored together by `scorer`, so
    # they must index the same token inventory.
    vocab: !ref <output_neurons>
    d_model: 768
    nhead: 12
    num_encoder_layers: 12
    num_decoder_layers: 0
    d_ffn: 3072
    dropout: 0.0
    activation: !name:torch.nn.GELU
    normalize_before: False
# SentencePiece processor constructed with no arguments; it is listed under
# the pretrainer's `loadables` in this file.
tokenizer: !new:sentencepiece.SentencePieceProcessor
# Wrapper around the Transformer entry; it is used as the last stage of the
# `encoder` pipeline in this file.
Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>
# Encoding pipeline. Stages are listed in this order:
# features -> normalisation -> CNN front-end -> wrapped transformer encoder.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    # Only the last dimension (n_mels) is fixed; the first two are null.
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    cnn: !ref <CNN>
    transformer_encoder: !ref <Tencoder>
# Models
# A single positional argument: the list of ASR sub-modules. This ModuleList
# is what the pretrainer loads under the `asr` key.
asr_model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
# Name -> object mapping of the components declared in this file.
# NOTE(review): presumably consumed by the SpeechBrain inference interface
# that loads this file -- confirm the expected key names against it.
modules:
    compute_features: !ref <compute_features>
    normalizer: !ref <normalizer>
    pre_transformer: !ref <CNN>
    transformer: !ref <Transformer>
    asr_model: !ref <asr_model>
    lm_model: !ref <lm_model>
    encoder: !ref <encoder>
    decoder: !ref <decoder>
# The pretrainer allows a mapping between pretrained files and instances that
# are declared in the yaml.
# Each key under `loadables` names one pretrained artefact and points to the
# object in this file that receives it.
# NOTE(review): no `paths` mapping is given, so file names are presumably
# derived from the keys -- confirm against the Pretrainer documentation.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        normalizer: !ref <normalizer>
        asr: !ref <asr_model>
        lm: !ref <lm_model>
        tokenizer: !ref <tokenizer>