tgritsaev commited on
Commit
affcd23
1 Parent(s): 574d505

Upload 198 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. automatic-speech-recognition/Dockerfile +15 -0
  2. automatic-speech-recognition/LICENSE +21 -0
  3. automatic-speech-recognition/README.md +21 -0
  4. automatic-speech-recognition/checkpoint.pth +3 -0
  5. automatic-speech-recognition/default_test_config.json +188 -0
  6. automatic-speech-recognition/default_test_model/config.json +242 -0
  7. automatic-speech-recognition/hw_asr/__init__.py +0 -0
  8. automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-310.pyc +0 -0
  9. automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-311.pyc +0 -0
  10. automatic-speech-recognition/hw_asr/augmentations/__init__.py +36 -0
  11. automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-310.pyc +0 -0
  12. automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-311.pyc +0 -0
  13. automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-310.pyc +0 -0
  14. automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-311.pyc +0 -0
  15. automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-310.pyc +0 -0
  16. automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-311.pyc +0 -0
  17. automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-310.pyc +0 -0
  18. automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-311.pyc +0 -0
  19. automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-310.pyc +0 -0
  20. automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-311.pyc +0 -0
  21. automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential_random_apply.cpython-311.pyc +0 -0
  22. automatic-speech-recognition/hw_asr/augmentations/base.py +6 -0
  23. automatic-speech-recognition/hw_asr/augmentations/random_apply.py +16 -0
  24. automatic-speech-recognition/hw_asr/augmentations/random_choice.py +17 -0
  25. automatic-speech-recognition/hw_asr/augmentations/sequential.py +16 -0
  26. automatic-speech-recognition/hw_asr/augmentations/sequential_random_apply.py +17 -0
  27. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/FrequencyMasking.py +11 -0
  28. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/TimeMasking.py +11 -0
  29. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__init__.py +7 -0
  30. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/FrequencyMasking.cpython-311.pyc +0 -0
  31. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/TimeMasking.cpython-311.pyc +0 -0
  32. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-310.pyc +0 -0
  33. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-311.pyc +0 -0
  34. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/AddColoredNoise.py +13 -0
  35. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Gain.py +13 -0
  36. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/HighPassFilter.py +13 -0
  37. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/LowPassFilter.py +13 -0
  38. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Padding.py +13 -0
  39. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PitchShift.py +13 -0
  40. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PolarityInversion.py +13 -0
  41. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Shift.py +13 -0
  42. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__init__.py +19 -0
  43. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-310.pyc +0 -0
  44. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-311.pyc +0 -0
  45. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-310.pyc +0 -0
  46. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-311.pyc +0 -0
  47. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-310.pyc +0 -0
  48. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-311.pyc +0 -0
  49. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-310.pyc +0 -0
  50. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-311.pyc +0 -0
automatic-speech-recognition/Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CUDA-enabled PyTorch base image matching the pinned torchaudio version below.
FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-devel
WORKDIR /repos/asr_project_template

# Install requirements for torchaudio
# (sox is the audio backend; librosa is used for feature extraction)
RUN pip install sox && conda install torchaudio==0.11.0 -c pytorch && conda install -c conda-forge librosa

# Install requirements
# requirements.txt is copied before the rest of the repo so this layer is
# cached and re-run only when the dependency list itself changes.
COPY requirements.txt ./
RUN pip install -r requirements.txt

# Copy the contents of repository
COPY . .

# Expose port
EXPOSE 3000
automatic-speech-recognition/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Daniil Ivanov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
automatic-speech-recognition/README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ASR project barebones
2
+
3
+ ## Installation guide
4
+
5
+ 1. `pip install -r ./requirements.txt`
6
+ 2. Download from http://www.openslr.org/11/ `3-gram.arpa.gz` and `librispeech-vocab.txt`
7
+ 3. `python hw_asr/text_encoder/fix_vocab.py` and `python hw_asr/text_encoder/lower_model.py` to prepare the vocab and model for use
8
+ 4. If you want to test my model, download it from https://drive.google.com/file/d/1QrSsx56V5YNjGHUBWy6CIRVbNbjKWUpJ/view?usp=share_link , name it `checkpoint.pth` and place it in the directory `default_test_model/`
9
+
10
+ ## Train
11
+
12
+ 1. `python train.py --config hw_asr/configs/config2.json`
13
+
14
+ ## Test
15
+
16
+ 1. `python test.py -c default_test_config.json -r default_test_model/checkpoint.pth`
17
+
18
+ ## Wandb report
19
+
20
+ 1. You can check my wandb report (in Russian only) and the wandb project at https://wandb.ai/tgritsaev/asr_project/reports/DLA-HW-1--Vmlldzo1NzY3NjA5?accessToken=kotkj5oyzomf2d2g1f40mczdnpirwvuw1f538zx9k491g1cfh3wg9iwhsb65o054
21
+
automatic-speech-recognition/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f1bebf1c95bb69c3130757b652c9fa4a975302b142a0b3d779d36ba404905ac
3
+ size 333205079
automatic-speech-recognition/default_test_config.json ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "default_test_config",
3
+ "n_gpu": 1,
4
+ "text_encoder": {
5
+ "type": "CTCCharTextEncoder",
6
+ "args": {
7
+ "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa",
8
+ "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt"
9
+ }
10
+ },
11
+ "preprocessing": {
12
+ "sr": 16000,
13
+ "spectrogram": {
14
+ "type": "MelSpectrogram",
15
+ "args": {
16
+ "n_mels": 256
17
+ }
18
+ },
19
+ "log_spec": true
20
+ },
21
+ "augmentations": {
22
+ "random_apply_p": 0.6,
23
+ "wave": [
24
+ {
25
+ "type": "AddColoredNoise",
26
+ "args": {
27
+ "p": 1,
28
+ "sample_rate": 16000
29
+ }
30
+ },
31
+ {
32
+ "type": "Gain",
33
+ "args": {
34
+ "p": 0.8,
35
+ "sample_rate": 16000
36
+ }
37
+ },
38
+ {
39
+ "type": "HighPassFilter",
40
+ "args": {
41
+ "p": 0,
42
+ "sample_rate": 16000
43
+ }
44
+ },
45
+ {
46
+ "type": "LowPassFilter",
47
+ "args": {
48
+ "p": 0,
49
+ "sample_rate": 16000
50
+ }
51
+ },
52
+ {
53
+ "type": "PitchShift",
54
+ "args": {
55
+ "p": 0.8,
56
+ "min_transpose_semitones": -2,
57
+ "max_transpose_semitones": 2,
58
+ "sample_rate": 16000
59
+ }
60
+ },
61
+ {
62
+ "type": "PolarityInversion",
63
+ "args": {
64
+ "p": 0.8,
65
+ "sample_rate": 16000
66
+ }
67
+ },
68
+ {
69
+ "type": "Shift",
70
+ "args": {
71
+ "p": 0.8,
72
+ "sample_rate": 16000
73
+ }
74
+ }
75
+ ],
76
+ "spectrogram": [
77
+ {
78
+ "type": "TimeMasking",
79
+ "args": {
80
+ "time_mask_param": 80,
81
+ "p": 0.05
82
+ }
83
+ },
84
+ {
85
+ "type": "FrequencyMasking",
86
+ "args": {
87
+ "freq_mask_param": 80
88
+ }
89
+ }
90
+ ]
91
+ },
92
+ "arch": {
93
+ "type": "DeepSpeech2Model",
94
+ "args": {
95
+ "n_feats": 256,
96
+ "n_rnn_layers": 6,
97
+ "rnn_hidden_size": 512,
98
+ "rnn_dropout": 0.2
99
+ }
100
+ },
101
+ "data": {
102
+ "test": {
103
+ "batch_size": 64,
104
+ "num_workers": 4,
105
+ "datasets": [
106
+ {
107
+ "type": "LibrispeechDataset",
108
+ "args": {
109
+ "part": "test-other"
110
+ }
111
+ }
112
+ ]
113
+ }
114
+ },
115
+ "optimizer": {
116
+ "type": "AdamW",
117
+ "args": {
118
+ "lr": 0.0003,
119
+ "weight_decay": 1e-05
120
+ }
121
+ },
122
+ "loss": {
123
+ "type": "CTCLoss",
124
+ "args": {}
125
+ },
126
+ "metrics": [
127
+ {
128
+ "type": "ArgmaxWERMetric",
129
+ "args": {
130
+ "name": "WER (argmax)"
131
+ }
132
+ },
133
+ {
134
+ "type": "ArgmaxCERMetric",
135
+ "args": {
136
+ "name": "CER (argmax)"
137
+ }
138
+ },
139
+ {
140
+ "type": "BeamSearchWERMetric",
141
+ "args": {
142
+ "beam_size": 4,
143
+ "name": "WER (beam search)"
144
+ }
145
+ },
146
+ {
147
+ "type": "BeamSearchCERMetric",
148
+ "args": {
149
+ "beam_size": 4,
150
+ "name": "CER (beam search)"
151
+ }
152
+ },
153
+ {
154
+ "type": "LanguageModelWERMetric",
155
+ "args": {
156
+ "name": "WER (LM)"
157
+ }
158
+ },
159
+ {
160
+ "type": "LanguageModelCERMetric",
161
+ "args": {
162
+ "name": "CER (LM)"
163
+ }
164
+ }
165
+ ],
166
+ "lr_scheduler": {
167
+ "type": "OneCycleLR",
168
+ "args": {
169
+ "steps_per_epoch": 1000,
170
+ "epochs": 50,
171
+ "anneal_strategy": "cos",
172
+ "max_lr": 0.0003,
173
+ "pct_start": 0.1
174
+ }
175
+ },
176
+ "trainer": {
177
+ "epochs": 50,
178
+ "save_dir": "saved/",
179
+ "save_period": 5,
180
+ "verbosity": 2,
181
+ "monitor": "min val_loss",
182
+ "early_stop": 100,
183
+ "visualize": "wandb",
184
+ "wandb_project": "asr_project",
185
+ "len_epoch": 1000,
186
+ "grad_norm_clip": 10
187
+ }
188
+ }
automatic-speech-recognition/default_test_model/config.json ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "default_config",
3
+ "n_gpu": 1,
4
+ "text_encoder": {
5
+ "type": "CTCCharTextEncoder",
6
+ "args": {
7
+ "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa",
8
+ "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt"
9
+ }
10
+ },
11
+ "preprocessing": {
12
+ "sr": 16000,
13
+ "spectrogram": {
14
+ "type": "MelSpectrogram",
15
+ "args": {
16
+ "n_mels": 256
17
+ }
18
+ },
19
+ "log_spec": true
20
+ },
21
+ "augmentations": {
22
+ "random_apply_p": 0.6,
23
+ "wave": [
24
+ {
25
+ "type": "AddColoredNoise",
26
+ "args": {
27
+ "p": 1,
28
+ "sample_rate": 16000
29
+ }
30
+ },
31
+ {
32
+ "type": "Gain",
33
+ "args": {
34
+ "p": 0.8,
35
+ "sample_rate": 16000
36
+ }
37
+ },
38
+ {
39
+ "type": "HighPassFilter",
40
+ "args": {
41
+ "p": 0,
42
+ "sample_rate": 16000
43
+ }
44
+ },
45
+ {
46
+ "type": "LowPassFilter",
47
+ "args": {
48
+ "p": 0,
49
+ "sample_rate": 16000
50
+ }
51
+ },
52
+ {
53
+ "type": "PitchShift",
54
+ "args": {
55
+ "p": 0.8,
56
+ "min_transpose_semitones": -2,
57
+ "max_transpose_semitones": 2,
58
+ "sample_rate": 16000
59
+ }
60
+ },
61
+ {
62
+ "type": "PolarityInversion",
63
+ "args": {
64
+ "p": 0.8,
65
+ "sample_rate": 16000
66
+ }
67
+ },
68
+ {
69
+ "type": "Shift",
70
+ "args": {
71
+ "p": 0.8,
72
+ "sample_rate": 16000
73
+ }
74
+ }
75
+ ],
76
+ "spectrogram": [
77
+ {
78
+ "type": "TimeMasking",
79
+ "args": {
80
+ "time_mask_param": 80,
81
+ "p": 0.05
82
+ }
83
+ },
84
+ {
85
+ "type": "FrequencyMasking",
86
+ "args": {
87
+ "freq_mask_param": 80
88
+ }
89
+ }
90
+ ]
91
+ },
92
+ "arch": {
93
+ "type": "DeepSpeech2Model",
94
+ "args": {
95
+ "n_feats": 256,
96
+ "n_rnn_layers": 6,
97
+ "rnn_hidden_size": 512,
98
+ "rnn_dropout": 0.2
99
+ }
100
+ },
101
+ "data": {
102
+ "train": {
103
+ "batch_size": 128,
104
+ "num_workers": 4,
105
+ "datasets": [
106
+ {
107
+ "type": "LibrispeechDataset",
108
+ "args": {
109
+ "part": "train-clean-100",
110
+ "max_audio_length": 40.0,
111
+ "max_text_length": 400
112
+ }
113
+ },
114
+ {
115
+ "type": "LibrispeechDataset",
116
+ "args": {
117
+ "part": "train-clean-360",
118
+ "max_audio_length": 40.0,
119
+ "max_text_length": 400
120
+ }
121
+ },
122
+ {
123
+ "type": "LibrispeechDataset",
124
+ "args": {
125
+ "part": "train-other-500",
126
+ "max_audio_length": 40.0,
127
+ "max_text_length": 400
128
+ }
129
+ }
130
+ ]
131
+ },
132
+ "val": {
133
+ "batch_size": 64,
134
+ "num_workers": 4,
135
+ "datasets": [
136
+ {
137
+ "type": "LibrispeechDataset",
138
+ "args": {
139
+ "part": "dev-clean"
140
+ }
141
+ }
142
+ ]
143
+ },
144
+ "test-other": {
145
+ "batch_size": 64,
146
+ "num_workers": 4,
147
+ "datasets": [
148
+ {
149
+ "type": "LibrispeechDataset",
150
+ "args": {
151
+ "part": "test-other"
152
+ }
153
+ }
154
+ ]
155
+ },
156
+ "test-clean": {
157
+ "batch_size": 64,
158
+ "num_workers": 4,
159
+ "datasets": [
160
+ {
161
+ "type": "LibrispeechDataset",
162
+ "args": {
163
+ "part": "test-clean"
164
+ }
165
+ }
166
+ ]
167
+ }
168
+ },
169
+ "optimizer": {
170
+ "type": "AdamW",
171
+ "args": {
172
+ "lr": 0.0003,
173
+ "weight_decay": 1e-05
174
+ }
175
+ },
176
+ "loss": {
177
+ "type": "CTCLoss",
178
+ "args": {}
179
+ },
180
+ "metrics": [
181
+ {
182
+ "type": "ArgmaxWERMetric",
183
+ "args": {
184
+ "name": "WER (argmax)"
185
+ }
186
+ },
187
+ {
188
+ "type": "ArgmaxCERMetric",
189
+ "args": {
190
+ "name": "CER (argmax)"
191
+ }
192
+ },
193
+ {
194
+ "type": "BeamSearchWERMetric",
195
+ "args": {
196
+ "beam_size": 4,
197
+ "name": "WER (beam search)"
198
+ }
199
+ },
200
+ {
201
+ "type": "BeamSearchCERMetric",
202
+ "args": {
203
+ "beam_size": 4,
204
+ "name": "CER (beam search)"
205
+ }
206
+ },
207
+ {
208
+ "type": "LanguageModelWERMetric",
209
+ "args": {
210
+ "name": "WER (LM)"
211
+ }
212
+ },
213
+ {
214
+ "type": "LanguageModelCERMetric",
215
+ "args": {
216
+ "name": "CER (LM)"
217
+ }
218
+ }
219
+ ],
220
+ "lr_scheduler": {
221
+ "type": "OneCycleLR",
222
+ "args": {
223
+ "steps_per_epoch": 1000,
224
+ "epochs": 50,
225
+ "anneal_strategy": "cos",
226
+ "max_lr": 0.0003,
227
+ "pct_start": 0.1
228
+ }
229
+ },
230
+ "trainer": {
231
+ "epochs": 50,
232
+ "save_dir": "saved/",
233
+ "save_period": 5,
234
+ "verbosity": 2,
235
+ "monitor": "min val_loss",
236
+ "early_stop": 100,
237
+ "visualize": "wandb",
238
+ "wandb_project": "asr_project",
239
+ "len_epoch": 1000,
240
+ "grad_norm_clip": 10
241
+ }
242
+ }
automatic-speech-recognition/hw_asr/__init__.py ADDED
File without changes
automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (150 Bytes). View file
 
automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (166 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/__init__.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from collections.abc import Callable
from typing import List

import hw_asr.augmentations.spectrogram_augmentations
import hw_asr.augmentations.wave_augmentations
from hw_asr.augmentations.random_choice import RandomChoice
from hw_asr.augmentations.sequential_random_apply import SequentialRandomApply
from hw_asr.utils.parse_config import ConfigParser


def from_configs(configs: ConfigParser):
    """Build the (wave, spectrogram) augmentation callables from a config.

    Reads the optional "augmentations" section of the config. Wave
    augmentations are combined with RandomChoice (apply one of them with
    probability ``random_apply_p``); spectrogram augmentations are combined
    with SequentialRandomApply (each applied independently with probability
    ``random_apply_p``).

    Returns a tuple ``(wave_aug, spec_aug)``; either element is None when no
    augmentations of that kind are configured.
    """
    # Fix: the original accessed configs.config["augmentations"]["random_apply_p"]
    # unconditionally, raising KeyError for configs without an "augmentations"
    # section even though the list-building code above it was guarded.
    aug_config = (
        configs.config["augmentations"] if "augmentations" in configs.config else {}
    )

    wave_augs = []
    if "wave" in aug_config:
        for aug_dict in aug_config["wave"]:
            wave_augs.append(
                configs.init_obj(aug_dict, hw_asr.augmentations.wave_augmentations)
            )

    spec_augs = []
    if "spectrogram" in aug_config:
        for aug_dict in aug_config["spectrogram"]:
            spec_augs.append(
                configs.init_obj(aug_dict, hw_asr.augmentations.spectrogram_augmentations)
            )

    # Default to 0.0 (never apply) when the probability is omitted; with empty
    # augmentation lists the value is unused anyway because _to_function
    # returns None.
    p = aug_config["random_apply_p"] if "random_apply_p" in aug_config else 0.0
    return (
        _to_function(RandomChoice, wave_augs, p),
        _to_function(SequentialRandomApply, spec_augs, p),
    )


def _to_function(random_type, augs_list: List[Callable], p: float):
    """Collapse a list of augmentations into a single callable.

    Returns None for an empty list, the augmentation itself for a single
    element, and ``random_type(augs_list, p)`` otherwise.
    """
    if not augs_list:
        return None
    if len(augs_list) == 1:
        return augs_list[0]
    return random_type(augs_list, p)
automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.35 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.36 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-310.pyc ADDED
Binary file (541 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-311.pyc ADDED
Binary file (710 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-310.pyc ADDED
Binary file (848 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-311.pyc ADDED
Binary file (1.22 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-310.pyc ADDED
Binary file (954 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-311.pyc ADDED
Binary file (1.36 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-310.pyc ADDED
Binary file (895 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-311.pyc ADDED
Binary file (1.19 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential_random_apply.cpython-311.pyc ADDED
Binary file (1.38 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/base.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
from torch import Tensor


class AugmentationBase:
    """Interface for audio/spectrogram augmentations: a callable Tensor -> Tensor.

    Subclasses must override __call__.
    """

    def __call__(self, data: Tensor) -> Tensor:
        raise NotImplementedError()
automatic-speech-recognition/hw_asr/augmentations/random_apply.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import random
from typing import Callable
from torch import Tensor


class RandomApply:
    """Apply a wrapped augmentation with probability ``p``; otherwise pass through."""

    def __init__(self, augmentation: Callable, p: float):
        # p must be a valid probability.
        assert 0 <= p <= 1
        self.augmentation = augmentation
        self.p = p

    def __call__(self, data: Tensor) -> Tensor:
        apply_now = random.random() < self.p
        return self.augmentation(data) if apply_now else data
automatic-speech-recognition/hw_asr/augmentations/random_choice.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List, Callable
from torch import Tensor
import random
from hw_asr.augmentations.base import AugmentationBase


class RandomChoice(AugmentationBase):
    """With probability ``p``, apply one augmentation drawn uniformly from the list."""

    def __init__(self, augmentation_list: List[Callable], p: float):
        self.augmentation_list = augmentation_list
        self.p = p

    def __call__(self, data: Tensor) -> Tensor:
        if random.random() >= self.p:
            return data
        chosen = random.choice(self.augmentation_list)
        return chosen(data)
automatic-speech-recognition/hw_asr/augmentations/sequential.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List, Callable

from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class SequentialAugmentation(AugmentationBase):
    """Apply every augmentation in the list, in order, feeding each the previous output."""

    def __init__(self, augmentation_list: List[Callable]):
        self.augmentation_list = augmentation_list

    def __call__(self, data: Tensor) -> Tensor:
        result = data
        for aug in self.augmentation_list:
            result = aug(result)
        return result
automatic-speech-recognition/hw_asr/augmentations/sequential_random_apply.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List, Callable
from torch import Tensor
import random
from hw_asr.augmentations.base import AugmentationBase


class SequentialRandomApply(AugmentationBase):
    """Walk the augmentation list in order, applying each one independently with probability ``p``."""

    def __init__(self, augmentation_list: List[Callable], p: float = 0.5):
        self.augmentation_list = augmentation_list
        self.p = p

    def __call__(self, data: Tensor) -> Tensor:
        result = data
        for aug in self.augmentation_list:
            # One independent Bernoulli draw per augmentation.
            if random.random() < self.p:
                result = aug(result)
        return result
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/FrequencyMasking.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
from torch import Tensor
from hw_asr.augmentations.base import AugmentationBase
from torchaudio import transforms


class FrequencyMasking(AugmentationBase):
    """SpecAugment frequency masking, delegating to torchaudio.transforms.FrequencyMasking.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = transforms.FrequencyMasking(*args, **kwargs)

    def __call__(self, spectogram: Tensor):
        # NOTE(review): .squeeze(1) mirrors the wave augmentations; confirm the
        # incoming spectrogram really carries a singleton dim 1 here.
        return self._aug(spectogram).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/TimeMasking.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
from torch import Tensor
from hw_asr.augmentations.base import AugmentationBase
from torchaudio import transforms


class TimeMasking(AugmentationBase):
    """SpecAugment time masking, delegating to torchaudio.transforms.TimeMasking.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = transforms.TimeMasking(*args, **kwargs)

    def __call__(self, spectogram: Tensor):
        # NOTE(review): .squeeze(1) mirrors the wave augmentations; confirm the
        # incoming spectrogram really carries a singleton dim 1 here.
        return self._aug(spectogram).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Re-export spectrogram-level augmentations so config "type" names resolve
# against this package (see hw_asr.augmentations.from_configs).
from hw_asr.augmentations.spectrogram_augmentations.TimeMasking import TimeMasking
from hw_asr.augmentations.spectrogram_augmentations.FrequencyMasking import FrequencyMasking

__all__ = [
    "TimeMasking",
    "FrequencyMasking"
]
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/FrequencyMasking.cpython-311.pyc ADDED
Binary file (1.19 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/TimeMasking.cpython-311.pyc ADDED
Binary file (1.17 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (190 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (474 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/AddColoredNoise.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class AddColoredNoise(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.AddColoredNoise.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.AddColoredNoise(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Gain.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class Gain(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.Gain.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.Gain(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/HighPassFilter.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class HighPassFilter(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.HighPassFilter.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.HighPassFilter(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/LowPassFilter.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class LowPassFilter(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.LowPassFilter.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.LowPassFilter(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Padding.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class Padding(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.Padding.

    All constructor arguments are forwarded to the underlying transform.
    Currently disabled in wave_augmentations/__init__.py.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.Padding(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PitchShift.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class PitchShift(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.PitchShift.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.PitchShift(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PolarityInversion.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class PolarityInversion(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.PolarityInversion.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.PolarityInversion(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Shift.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class Shift(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.Shift.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.Shift(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Re-export wave-level augmentations so config "type" names resolve against
# this package (see hw_asr.augmentations.from_configs).
from hw_asr.augmentations.wave_augmentations.AddColoredNoise import AddColoredNoise
from hw_asr.augmentations.wave_augmentations.Gain import Gain
from hw_asr.augmentations.wave_augmentations.HighPassFilter import HighPassFilter
from hw_asr.augmentations.wave_augmentations.LowPassFilter import LowPassFilter
# from hw_asr.augmentations.wave_augmentations.Padding import Padding
from hw_asr.augmentations.wave_augmentations.PitchShift import PitchShift
from hw_asr.augmentations.wave_augmentations.PolarityInversion import PolarityInversion
from hw_asr.augmentations.wave_augmentations.Shift import Shift

__all__ = [
    "AddColoredNoise",
    "Gain",
    # Fix: a missing comma here previously concatenated the two strings into
    # "HighPassFilterLowPassFilter", silently dropping both names from __all__.
    "HighPassFilter",
    "LowPassFilter",
    # "Padding",
    "PitchShift",
    "PolarityInversion",
    "Shift",
]
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-310.pyc ADDED
Binary file (863 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-311.pyc ADDED
Binary file (1.23 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-310.pyc ADDED
Binary file (819 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-311.pyc ADDED
Binary file (1.19 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-310.pyc ADDED
Binary file (859 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-311.pyc ADDED
Binary file (1.23 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-310.pyc ADDED
Binary file (855 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-311.pyc ADDED
Binary file (1.22 kB). View file