npark committed
Commit f14c1f3 · 1 Parent(s): fea0cd0

KdialectSpeech base model

Files changed (4)
  1. asr.ckpt +3 -0
  2. hyperparams.yaml +135 -0
  3. normalizer.ckpt +3 -0
  4. tokenizer.ckpt +3 -0
asr.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cadb724531f7f8b0da573d24746180d88183e96e794da96967ed8d045726ea03
+size 183455481
hyperparams.yaml ADDED
@@ -0,0 +1,135 @@
+# Feature parameters
+sample_rate: 16000
+n_fft: 400
+n_mels: 80
+
+####################### Model parameters ###########################
+# Transformer
+d_model: 256
+nhead: 4
+num_encoder_layers: 12
+num_decoder_layers: 6
+d_ffn: 2048
+transformer_dropout: 0.0
+activation: !name:torch.nn.GELU
+output_neurons: 5000
+vocab_size: 5000
+
+# Outputs
+blank_index: 0
+label_smoothing: 0.1
+pad_index: 0
+bos_index: 1
+eos_index: 2
+unk_index: 0
+
+# Decoding parameters
+min_decode_ratio: 0.0
+max_decode_ratio: 1.0
+valid_search_interval: 10 # 10
+valid_beam_size: 10
+test_beam_size: 60
+lm_weight: 0.20
+ctc_weight_decode: 0.40
+
+############################## asr models ################################
+normalizer: !new:speechbrain.processing.features.InputNormalization
+    norm_type: global
+#####
+
+CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
+    input_shape: (8, 10, 80)
+    num_blocks: 2
+    num_layers_per_block: 1
+    out_channels: (64, 32)
+    kernel_sizes: (3, 3)
+    strides: (2, 2)
+    residuals: (False, False)
+
+Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
+    input_size: 640
+    tgt_vocab: !ref <output_neurons>
+    d_model: !ref <d_model>
+    nhead: !ref <nhead>
+    num_encoder_layers: !ref <num_encoder_layers>
+    num_decoder_layers: !ref <num_decoder_layers>
+    d_ffn: !ref <d_ffn>
+    dropout: !ref <transformer_dropout>
+    activation: !ref <activation>
+    encoder_module: conformer
+    attention_type: RelPosMHAXL
+    normalize_before: True
+    causal: False
+
+### lm_model ###
+################
+
+tokenizer: !new:sentencepiece.SentencePieceProcessor
+
+ctc_lin: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <d_model>
+    n_neurons: !ref <output_neurons>
+
+seq_lin: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <d_model>
+    n_neurons: !ref <output_neurons>
+
+# decoder
+decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
+    modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
+    bos_index: !ref <bos_index>
+    eos_index: !ref <eos_index>
+    blank_index: !ref <blank_index>
+    min_decode_ratio: !ref <min_decode_ratio>
+    max_decode_ratio: !ref <max_decode_ratio>
+    beam_size: !ref <valid_beam_size>
+    ctc_weight: !ref <ctc_weight_decode>
+    using_eos_threshold: False
+    length_normalization: False
+
+# encoder
+Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
+    transformer: !ref <Transformer>
+
+encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
+    input_shape: [null, null, !ref <n_mels>]
+    compute_features: !ref <compute_features>
+    normalize: !ref <normalizer>
+    cnn: !ref <CNN>
+    transformer_encoder: !ref <Tencoder>
+    # transformer: !ref <Transformer>
+
+asr_model: !new:torch.nn.ModuleList
+    - [!ref <normalizer>, !ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
+
+log_softmax: !new:torch.nn.LogSoftmax
+    dim: -1
+
+
+compute_features: !new:speechbrain.lobes.features.Fbank
+    sample_rate: !ref <sample_rate>
+    n_fft: !ref <n_fft>
+    n_mels: !ref <n_mels>
+
+# modules:
+#     encoder: !ref <encoder>
+#     decoder: !ref <decoder>
+
+modules:
+    compute_features: !ref <compute_features>
+    normalizer: !ref <normalizer>
+    pre_transformer: !ref <CNN>
+    transformer: !ref <Transformer>
+    asr_model: !ref <asr_model>
+    # lm_model: !ref <lm_model>
+    encoder: !ref <encoder>
+    decoder: !ref <decoder>
+
+# pretrainer
+
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    loadables:
+        normalizer: !ref <normalizer>
+        asr: !ref <asr_model>
+        # lm: !ref <lm_model>
+        tokenizer: !ref <tokenizer>
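
Since hyperparams.yaml defines `modules`, `tokenizer`, and a `pretrainer` whose loadables map onto the three checkpoints added in this commit, the model can typically be loaded through SpeechBrain's pretrained interface. A minimal usage sketch follows; the repository id `npark/KdialectSpeech` and the file `sample.wav` are placeholders, not values taken from this commit:

# Minimal sketch (assumptions: "npark/KdialectSpeech" is a placeholder repo id,
# "sample.wav" is a 16 kHz mono recording supplied by the user).
from speechbrain.pretrained import EncoderDecoderASR

# from_hparams fetches hyperparams.yaml and the checkpoints listed under
# pretrainer.loadables (asr.ckpt, normalizer.ckpt, tokenizer.ckpt) into savedir.
asr = EncoderDecoderASR.from_hparams(
    source="npark/KdialectSpeech",              # placeholder
    savedir="pretrained_models/kdialect-asr",
)

# transcribe_file runs compute_features -> normalizer -> CNN -> conformer encoder,
# then the joint CTC/attention beam search configured by `decoder` above.
text = asr.transcribe_file("sample.wav")
print(text)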
normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea59ba1d1045b68e532c7204ce38125b634aa894e59ca6a124bb0f2f555b84e5
+size 1779
tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:906d5713805eb68d7b87c751cba50857cba4e509e3951920215c88790ef4cc48
+size 312562