Upload 198 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- automatic-speech-recognition/Dockerfile +15 -0
- automatic-speech-recognition/LICENSE +21 -0
- automatic-speech-recognition/README.md +21 -0
- automatic-speech-recognition/checkpoint.pth +3 -0
- automatic-speech-recognition/default_test_config.json +188 -0
- automatic-speech-recognition/default_test_model/config.json +242 -0
- automatic-speech-recognition/hw_asr/__init__.py +0 -0
- automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-310.pyc +0 -0
- automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-311.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/__init__.py +36 -0
- automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-310.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-311.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-310.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-311.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-310.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-311.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-310.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-311.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-310.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-311.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential_random_apply.cpython-311.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/base.py +6 -0
- automatic-speech-recognition/hw_asr/augmentations/random_apply.py +16 -0
- automatic-speech-recognition/hw_asr/augmentations/random_choice.py +17 -0
- automatic-speech-recognition/hw_asr/augmentations/sequential.py +16 -0
- automatic-speech-recognition/hw_asr/augmentations/sequential_random_apply.py +17 -0
- automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/FrequencyMasking.py +11 -0
- automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/TimeMasking.py +11 -0
- automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__init__.py +7 -0
- automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/FrequencyMasking.cpython-311.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/TimeMasking.cpython-311.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-310.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-311.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/AddColoredNoise.py +13 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Gain.py +13 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/HighPassFilter.py +13 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/LowPassFilter.py +13 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Padding.py +13 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PitchShift.py +13 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PolarityInversion.py +13 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Shift.py +13 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__init__.py +19 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-310.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-311.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-310.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-311.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-310.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-311.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-310.pyc +0 -0
- automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-311.pyc +0 -0
automatic-speech-recognition/Dockerfile
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-devel

WORKDIR /repos/asr_project_template

# Install requirements for torchaudio.
# -y is required: without it `conda install` prompts for confirmation and the
# non-interactive docker build hangs/fails. torchaudio 0.11.0 matches the
# torch 1.11.0 shipped in the base image. Clean caches in the same layer so
# they do not bloat the image.
RUN pip install --no-cache-dir sox \
    && conda install -y torchaudio==0.11.0 -c pytorch \
    && conda install -y -c conda-forge librosa \
    && conda clean -afy

# Install Python requirements first, so this layer stays cached until
# requirements.txt itself changes.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy the contents of the repository
COPY . .

# Documentation only (does not publish the port): the service is expected
# to listen on 3000.
EXPOSE 3000
|
automatic-speech-recognition/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2021 Daniil Ivanov
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
automatic-speech-recognition/README.md
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ASR project barebones
|
2 |
+
|
3 |
+
## Installation guide
|
4 |
+
|
5 |
+
1. `pip install -r ./requirements.txt`
|
6 |
+
2. Download from http://www.openslr.org/11/ `3-gram.arpa.gz` and `librispeech-vocab.txt`
|
7 |
+
3. `python hw_asr/text_encoder/fix_vocab.py` and `python hw_asr/text_encoder/lower_model.py` to prepare vocab and model for using
|
8 |
+
4. If you want to test my model, download it from the https://drive.google.com/file/d/1QrSsx56V5YNjGHUBWy6CIRVbNbjKWUpJ/view?usp=share_link , name it `checkpoint.pth` and place to the directory `default_test_model/`
|
9 |
+
|
10 |
+
## Train
|
11 |
+
|
12 |
+
1. `python train.py --config hw_asr/configs/config2.json`
|
13 |
+
|
14 |
+
## Test
|
15 |
+
|
16 |
+
1. `python test.py -c default_test_config.json -r default_test_model/checkpoint.pth`
|
17 |
+
|
18 |
+
## Wandb report
|
19 |
+
|
20 |
+
1. You can check my wandb report (only on Russian) and wandb project from the https://wandb.ai/tgritsaev/asr_project/reports/DLA-HW-1--Vmlldzo1NzY3NjA5?accessToken=kotkj5oyzomf2d2g1f40mczdnpirwvuw1f538zx9k491g1cfh3wg9iwhsb65o054
|
21 |
+
|
automatic-speech-recognition/checkpoint.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7f1bebf1c95bb69c3130757b652c9fa4a975302b142a0b3d779d36ba404905ac
|
3 |
+
size 333205079
|
automatic-speech-recognition/default_test_config.json
ADDED
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"name": "default_test_config",
|
3 |
+
"n_gpu": 1,
|
4 |
+
"text_encoder": {
|
5 |
+
"type": "CTCCharTextEncoder",
|
6 |
+
"args": {
|
7 |
+
"kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa",
|
8 |
+
"unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"preprocessing": {
|
12 |
+
"sr": 16000,
|
13 |
+
"spectrogram": {
|
14 |
+
"type": "MelSpectrogram",
|
15 |
+
"args": {
|
16 |
+
"n_mels": 256
|
17 |
+
}
|
18 |
+
},
|
19 |
+
"log_spec": true
|
20 |
+
},
|
21 |
+
"augmentations": {
|
22 |
+
"random_apply_p": 0.6,
|
23 |
+
"wave": [
|
24 |
+
{
|
25 |
+
"type": "AddColoredNoise",
|
26 |
+
"args": {
|
27 |
+
"p": 1,
|
28 |
+
"sample_rate": 16000
|
29 |
+
}
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"type": "Gain",
|
33 |
+
"args": {
|
34 |
+
"p": 0.8,
|
35 |
+
"sample_rate": 16000
|
36 |
+
}
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"type": "HighPassFilter",
|
40 |
+
"args": {
|
41 |
+
"p": 0,
|
42 |
+
"sample_rate": 16000
|
43 |
+
}
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"type": "LowPassFilter",
|
47 |
+
"args": {
|
48 |
+
"p": 0,
|
49 |
+
"sample_rate": 16000
|
50 |
+
}
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"type": "PitchShift",
|
54 |
+
"args": {
|
55 |
+
"p": 0.8,
|
56 |
+
"min_transpose_semitones": -2,
|
57 |
+
"max_transpose_semitones": 2,
|
58 |
+
"sample_rate": 16000
|
59 |
+
}
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"type": "PolarityInversion",
|
63 |
+
"args": {
|
64 |
+
"p": 0.8,
|
65 |
+
"sample_rate": 16000
|
66 |
+
}
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"type": "Shift",
|
70 |
+
"args": {
|
71 |
+
"p": 0.8,
|
72 |
+
"sample_rate": 16000
|
73 |
+
}
|
74 |
+
}
|
75 |
+
],
|
76 |
+
"spectrogram": [
|
77 |
+
{
|
78 |
+
"type": "TimeMasking",
|
79 |
+
"args": {
|
80 |
+
"time_mask_param": 80,
|
81 |
+
"p": 0.05
|
82 |
+
}
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"type": "FrequencyMasking",
|
86 |
+
"args": {
|
87 |
+
"freq_mask_param": 80
|
88 |
+
}
|
89 |
+
}
|
90 |
+
]
|
91 |
+
},
|
92 |
+
"arch": {
|
93 |
+
"type": "DeepSpeech2Model",
|
94 |
+
"args": {
|
95 |
+
"n_feats": 256,
|
96 |
+
"n_rnn_layers": 6,
|
97 |
+
"rnn_hidden_size": 512,
|
98 |
+
"rnn_dropout": 0.2
|
99 |
+
}
|
100 |
+
},
|
101 |
+
"data": {
|
102 |
+
"test": {
|
103 |
+
"batch_size": 64,
|
104 |
+
"num_workers": 4,
|
105 |
+
"datasets": [
|
106 |
+
{
|
107 |
+
"type": "LibrispeechDataset",
|
108 |
+
"args": {
|
109 |
+
"part": "test-other"
|
110 |
+
}
|
111 |
+
}
|
112 |
+
]
|
113 |
+
}
|
114 |
+
},
|
115 |
+
"optimizer": {
|
116 |
+
"type": "AdamW",
|
117 |
+
"args": {
|
118 |
+
"lr": 0.0003,
|
119 |
+
"weight_decay": 1e-05
|
120 |
+
}
|
121 |
+
},
|
122 |
+
"loss": {
|
123 |
+
"type": "CTCLoss",
|
124 |
+
"args": {}
|
125 |
+
},
|
126 |
+
"metrics": [
|
127 |
+
{
|
128 |
+
"type": "ArgmaxWERMetric",
|
129 |
+
"args": {
|
130 |
+
"name": "WER (argmax)"
|
131 |
+
}
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"type": "ArgmaxCERMetric",
|
135 |
+
"args": {
|
136 |
+
"name": "CER (argmax)"
|
137 |
+
}
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"type": "BeamSearchWERMetric",
|
141 |
+
"args": {
|
142 |
+
"beam_size": 4,
|
143 |
+
"name": "WER (beam search)"
|
144 |
+
}
|
145 |
+
},
|
146 |
+
{
|
147 |
+
"type": "BeamSearchCERMetric",
|
148 |
+
"args": {
|
149 |
+
"beam_size": 4,
|
150 |
+
"name": "CER (beam search)"
|
151 |
+
}
|
152 |
+
},
|
153 |
+
{
|
154 |
+
"type": "LanguageModelWERMetric",
|
155 |
+
"args": {
|
156 |
+
"name": "WER (LM)"
|
157 |
+
}
|
158 |
+
},
|
159 |
+
{
|
160 |
+
"type": "LanguageModelCERMetric",
|
161 |
+
"args": {
|
162 |
+
"name": "CER (LM)"
|
163 |
+
}
|
164 |
+
}
|
165 |
+
],
|
166 |
+
"lr_scheduler": {
|
167 |
+
"type": "OneCycleLR",
|
168 |
+
"args": {
|
169 |
+
"steps_per_epoch": 1000,
|
170 |
+
"epochs": 50,
|
171 |
+
"anneal_strategy": "cos",
|
172 |
+
"max_lr": 0.0003,
|
173 |
+
"pct_start": 0.1
|
174 |
+
}
|
175 |
+
},
|
176 |
+
"trainer": {
|
177 |
+
"epochs": 50,
|
178 |
+
"save_dir": "saved/",
|
179 |
+
"save_period": 5,
|
180 |
+
"verbosity": 2,
|
181 |
+
"monitor": "min val_loss",
|
182 |
+
"early_stop": 100,
|
183 |
+
"visualize": "wandb",
|
184 |
+
"wandb_project": "asr_project",
|
185 |
+
"len_epoch": 1000,
|
186 |
+
"grad_norm_clip": 10
|
187 |
+
}
|
188 |
+
}
|
automatic-speech-recognition/default_test_model/config.json
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"name": "default_config",
|
3 |
+
"n_gpu": 1,
|
4 |
+
"text_encoder": {
|
5 |
+
"type": "CTCCharTextEncoder",
|
6 |
+
"args": {
|
7 |
+
"kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa",
|
8 |
+
"unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"preprocessing": {
|
12 |
+
"sr": 16000,
|
13 |
+
"spectrogram": {
|
14 |
+
"type": "MelSpectrogram",
|
15 |
+
"args": {
|
16 |
+
"n_mels": 256
|
17 |
+
}
|
18 |
+
},
|
19 |
+
"log_spec": true
|
20 |
+
},
|
21 |
+
"augmentations": {
|
22 |
+
"random_apply_p": 0.6,
|
23 |
+
"wave": [
|
24 |
+
{
|
25 |
+
"type": "AddColoredNoise",
|
26 |
+
"args": {
|
27 |
+
"p": 1,
|
28 |
+
"sample_rate": 16000
|
29 |
+
}
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"type": "Gain",
|
33 |
+
"args": {
|
34 |
+
"p": 0.8,
|
35 |
+
"sample_rate": 16000
|
36 |
+
}
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"type": "HighPassFilter",
|
40 |
+
"args": {
|
41 |
+
"p": 0,
|
42 |
+
"sample_rate": 16000
|
43 |
+
}
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"type": "LowPassFilter",
|
47 |
+
"args": {
|
48 |
+
"p": 0,
|
49 |
+
"sample_rate": 16000
|
50 |
+
}
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"type": "PitchShift",
|
54 |
+
"args": {
|
55 |
+
"p": 0.8,
|
56 |
+
"min_transpose_semitones": -2,
|
57 |
+
"max_transpose_semitones": 2,
|
58 |
+
"sample_rate": 16000
|
59 |
+
}
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"type": "PolarityInversion",
|
63 |
+
"args": {
|
64 |
+
"p": 0.8,
|
65 |
+
"sample_rate": 16000
|
66 |
+
}
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"type": "Shift",
|
70 |
+
"args": {
|
71 |
+
"p": 0.8,
|
72 |
+
"sample_rate": 16000
|
73 |
+
}
|
74 |
+
}
|
75 |
+
],
|
76 |
+
"spectrogram": [
|
77 |
+
{
|
78 |
+
"type": "TimeMasking",
|
79 |
+
"args": {
|
80 |
+
"time_mask_param": 80,
|
81 |
+
"p": 0.05
|
82 |
+
}
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"type": "FrequencyMasking",
|
86 |
+
"args": {
|
87 |
+
"freq_mask_param": 80
|
88 |
+
}
|
89 |
+
}
|
90 |
+
]
|
91 |
+
},
|
92 |
+
"arch": {
|
93 |
+
"type": "DeepSpeech2Model",
|
94 |
+
"args": {
|
95 |
+
"n_feats": 256,
|
96 |
+
"n_rnn_layers": 6,
|
97 |
+
"rnn_hidden_size": 512,
|
98 |
+
"rnn_dropout": 0.2
|
99 |
+
}
|
100 |
+
},
|
101 |
+
"data": {
|
102 |
+
"train": {
|
103 |
+
"batch_size": 128,
|
104 |
+
"num_workers": 4,
|
105 |
+
"datasets": [
|
106 |
+
{
|
107 |
+
"type": "LibrispeechDataset",
|
108 |
+
"args": {
|
109 |
+
"part": "train-clean-100",
|
110 |
+
"max_audio_length": 40.0,
|
111 |
+
"max_text_length": 400
|
112 |
+
}
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"type": "LibrispeechDataset",
|
116 |
+
"args": {
|
117 |
+
"part": "train-clean-360",
|
118 |
+
"max_audio_length": 40.0,
|
119 |
+
"max_text_length": 400
|
120 |
+
}
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"type": "LibrispeechDataset",
|
124 |
+
"args": {
|
125 |
+
"part": "train-other-500",
|
126 |
+
"max_audio_length": 40.0,
|
127 |
+
"max_text_length": 400
|
128 |
+
}
|
129 |
+
}
|
130 |
+
]
|
131 |
+
},
|
132 |
+
"val": {
|
133 |
+
"batch_size": 64,
|
134 |
+
"num_workers": 4,
|
135 |
+
"datasets": [
|
136 |
+
{
|
137 |
+
"type": "LibrispeechDataset",
|
138 |
+
"args": {
|
139 |
+
"part": "dev-clean"
|
140 |
+
}
|
141 |
+
}
|
142 |
+
]
|
143 |
+
},
|
144 |
+
"test-other": {
|
145 |
+
"batch_size": 64,
|
146 |
+
"num_workers": 4,
|
147 |
+
"datasets": [
|
148 |
+
{
|
149 |
+
"type": "LibrispeechDataset",
|
150 |
+
"args": {
|
151 |
+
"part": "test-other"
|
152 |
+
}
|
153 |
+
}
|
154 |
+
]
|
155 |
+
},
|
156 |
+
"test-clean": {
|
157 |
+
"batch_size": 64,
|
158 |
+
"num_workers": 4,
|
159 |
+
"datasets": [
|
160 |
+
{
|
161 |
+
"type": "LibrispeechDataset",
|
162 |
+
"args": {
|
163 |
+
"part": "test-clean"
|
164 |
+
}
|
165 |
+
}
|
166 |
+
]
|
167 |
+
}
|
168 |
+
},
|
169 |
+
"optimizer": {
|
170 |
+
"type": "AdamW",
|
171 |
+
"args": {
|
172 |
+
"lr": 0.0003,
|
173 |
+
"weight_decay": 1e-05
|
174 |
+
}
|
175 |
+
},
|
176 |
+
"loss": {
|
177 |
+
"type": "CTCLoss",
|
178 |
+
"args": {}
|
179 |
+
},
|
180 |
+
"metrics": [
|
181 |
+
{
|
182 |
+
"type": "ArgmaxWERMetric",
|
183 |
+
"args": {
|
184 |
+
"name": "WER (argmax)"
|
185 |
+
}
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"type": "ArgmaxCERMetric",
|
189 |
+
"args": {
|
190 |
+
"name": "CER (argmax)"
|
191 |
+
}
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"type": "BeamSearchWERMetric",
|
195 |
+
"args": {
|
196 |
+
"beam_size": 4,
|
197 |
+
"name": "WER (beam search)"
|
198 |
+
}
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"type": "BeamSearchCERMetric",
|
202 |
+
"args": {
|
203 |
+
"beam_size": 4,
|
204 |
+
"name": "CER (beam search)"
|
205 |
+
}
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"type": "LanguageModelWERMetric",
|
209 |
+
"args": {
|
210 |
+
"name": "WER (LM)"
|
211 |
+
}
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"type": "LanguageModelCERMetric",
|
215 |
+
"args": {
|
216 |
+
"name": "CER (LM)"
|
217 |
+
}
|
218 |
+
}
|
219 |
+
],
|
220 |
+
"lr_scheduler": {
|
221 |
+
"type": "OneCycleLR",
|
222 |
+
"args": {
|
223 |
+
"steps_per_epoch": 1000,
|
224 |
+
"epochs": 50,
|
225 |
+
"anneal_strategy": "cos",
|
226 |
+
"max_lr": 0.0003,
|
227 |
+
"pct_start": 0.1
|
228 |
+
}
|
229 |
+
},
|
230 |
+
"trainer": {
|
231 |
+
"epochs": 50,
|
232 |
+
"save_dir": "saved/",
|
233 |
+
"save_period": 5,
|
234 |
+
"verbosity": 2,
|
235 |
+
"monitor": "min val_loss",
|
236 |
+
"early_stop": 100,
|
237 |
+
"visualize": "wandb",
|
238 |
+
"wandb_project": "asr_project",
|
239 |
+
"len_epoch": 1000,
|
240 |
+
"grad_norm_clip": 10
|
241 |
+
}
|
242 |
+
}
|
automatic-speech-recognition/hw_asr/__init__.py
ADDED
File without changes
|
automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (150 Bytes). View file
|
|
automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (166 Bytes). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/__init__.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections.abc import Callable
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
import hw_asr.augmentations.spectrogram_augmentations
|
5 |
+
import hw_asr.augmentations.wave_augmentations
|
6 |
+
from hw_asr.augmentations.random_choice import RandomChoice
|
7 |
+
from hw_asr.augmentations.sequential_random_apply import SequentialRandomApply
|
8 |
+
# from hw_asr.augmentations.sequential import SequentialAugmentation
|
9 |
+
# from hw_asr.augmentations.random_apply import RandomApply
|
10 |
+
from hw_asr.utils.parse_config import ConfigParser
|
11 |
+
|
12 |
+
|
13 |
+
def from_configs(configs: ConfigParser):
    """Build the (wave, spectrogram) augmentation callables from a parsed config.

    Each element of ``augmentations.wave`` / ``augmentations.spectrogram`` is
    instantiated via ``configs.init_obj`` against the corresponding package.
    Returns a 2-tuple; either element is None when no augmentations of that
    kind are configured.
    """
    aug_config = configs.config.get("augmentations", {})

    wave_augs = [
        configs.init_obj(aug_dict, hw_asr.augmentations.wave_augmentations)
        for aug_dict in aug_config.get("wave", [])
    ]
    spec_augs = [
        configs.init_obj(aug_dict, hw_asr.augmentations.spectrogram_augmentations)
        for aug_dict in aug_config.get("spectrogram", [])
    ]

    # Bug fix: the original read configs.config["augmentations"]["random_apply_p"]
    # unconditionally and raised KeyError for configs without an "augmentations"
    # section, even though both loops above already tolerated that case.
    # The default of 1.0 is irrelevant when both lists are empty (None is returned).
    p = aug_config.get("random_apply_p", 1.0)
    return (
        _to_function(RandomChoice, wave_augs, p),
        _to_function(SequentialRandomApply, spec_augs, p),
    )
|
28 |
+
|
29 |
+
|
30 |
+
def _to_function(random_type, augs_list: List[Callable], p: float):
|
31 |
+
if len(augs_list) == 0:
|
32 |
+
return None
|
33 |
+
elif len(augs_list) == 1:
|
34 |
+
return augs_list[0]
|
35 |
+
else:
|
36 |
+
return random_type(augs_list, p)
|
automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (1.35 kB). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (2.36 kB). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-310.pyc
ADDED
Binary file (541 Bytes). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-311.pyc
ADDED
Binary file (710 Bytes). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-310.pyc
ADDED
Binary file (848 Bytes). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-311.pyc
ADDED
Binary file (1.22 kB). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-310.pyc
ADDED
Binary file (954 Bytes). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-311.pyc
ADDED
Binary file (1.36 kB). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-310.pyc
ADDED
Binary file (895 Bytes). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-311.pyc
ADDED
Binary file (1.19 kB). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential_random_apply.cpython-311.pyc
ADDED
Binary file (1.38 kB). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/base.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch import Tensor
|
2 |
+
|
3 |
+
|
4 |
+
class AugmentationBase:
    """Common interface for all augmentations: a callable Tensor -> Tensor.

    Subclasses must override ``__call__``; the base implementation always
    raises NotImplementedError.
    """

    def __call__(self, data: Tensor) -> Tensor:
        raise NotImplementedError()
|
automatic-speech-recognition/hw_asr/augmentations/random_apply.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
from typing import Callable
|
3 |
+
from torch import Tensor
|
4 |
+
|
5 |
+
|
6 |
+
class RandomApply:
    """Wraps one augmentation and applies it with probability ``p``."""

    def __init__(self, augmentation: Callable, p: float):
        assert 0 <= p <= 1
        self.augmentation = augmentation
        self.p = p

    def __call__(self, data: Tensor) -> Tensor:
        # One Bernoulli(p) draw per call; pass data through untouched on a miss.
        should_apply = random.random() < self.p
        return self.augmentation(data) if should_apply else data
|
automatic-speech-recognition/hw_asr/augmentations/random_choice.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Callable
|
2 |
+
from torch import Tensor
|
3 |
+
import random
|
4 |
+
from hw_asr.augmentations.base import AugmentationBase
|
5 |
+
|
6 |
+
|
7 |
+
class RandomChoice(AugmentationBase):
    """With probability ``p``, applies one augmentation picked uniformly at random."""

    def __init__(self, augmentation_list: List[Callable], p: float):
        self.augmentation_list = augmentation_list
        self.p = p

    def __call__(self, data: Tensor) -> Tensor:
        # Single draw decides whether any augmentation is applied at all.
        if random.random() >= self.p:
            return data
        chosen = random.choice(self.augmentation_list)
        return chosen(data)
|
automatic-speech-recognition/hw_asr/augmentations/sequential.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Callable
|
2 |
+
|
3 |
+
from torch import Tensor
|
4 |
+
|
5 |
+
from hw_asr.augmentations.base import AugmentationBase
|
6 |
+
|
7 |
+
|
8 |
+
class SequentialAugmentation(AugmentationBase):
    """Applies every augmentation in ``augmentation_list``, in order."""

    def __init__(self, augmentation_list: List[Callable]):
        self.augmentation_list = augmentation_list

    def __call__(self, data: Tensor) -> Tensor:
        result = data
        for augmentation in self.augmentation_list:
            result = augmentation(result)
        return result
|
automatic-speech-recognition/hw_asr/augmentations/sequential_random_apply.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Callable
|
2 |
+
from torch import Tensor
|
3 |
+
import random
|
4 |
+
from hw_asr.augmentations.base import AugmentationBase
|
5 |
+
|
6 |
+
|
7 |
+
class SequentialRandomApply(AugmentationBase):
    """Walks the augmentation list in order, applying each one independently
    with probability ``p`` (default 0.5)."""

    def __init__(self, augmentation_list: List[Callable], p: float = 0.5):
        self.augmentation_list = augmentation_list
        self.p = p

    def __call__(self, data: Tensor) -> Tensor:
        result = data
        for augmentation in self.augmentation_list:
            # Independent Bernoulli(p) draw per augmentation.
            if random.random() < self.p:
                result = augmentation(result)
        return result
|
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/FrequencyMasking.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch import Tensor
|
2 |
+
from hw_asr.augmentations.base import AugmentationBase
|
3 |
+
from torchaudio import transforms
|
4 |
+
|
5 |
+
|
6 |
+
class FrequencyMasking(AugmentationBase):
    """SpecAugment-style frequency masking, delegating to
    ``torchaudio.transforms.FrequencyMasking``."""

    def __init__(self, *args, **kwargs):
        # All constructor arguments are forwarded verbatim to torchaudio.
        self._aug = transforms.FrequencyMasking(*args, **kwargs)

    def __call__(self, spectogram: Tensor):
        # NOTE(review): parameter name keeps the original (misspelled) public
        # spelling to stay call-compatible. The squeeze(1) presumably drops a
        # channel dim of size 1 — confirm against callers.
        return self._aug(spectogram).squeeze(1)
|
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/TimeMasking.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch import Tensor
|
2 |
+
from hw_asr.augmentations.base import AugmentationBase
|
3 |
+
from torchaudio import transforms
|
4 |
+
|
5 |
+
|
6 |
+
class TimeMasking(AugmentationBase):
    """SpecAugment-style time masking, delegating to
    ``torchaudio.transforms.TimeMasking``."""

    def __init__(self, *args, **kwargs):
        # All constructor arguments are forwarded verbatim to torchaudio.
        self._aug = transforms.TimeMasking(*args, **kwargs)

    def __call__(self, spectogram: Tensor):
        # NOTE(review): parameter name keeps the original (misspelled) public
        # spelling to stay call-compatible. The squeeze(1) presumably drops a
        # channel dim of size 1 — confirm against callers.
        return self._aug(spectogram).squeeze(1)
|
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__init__.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from hw_asr.augmentations.spectrogram_augmentations.TimeMasking import TimeMasking
|
2 |
+
from hw_asr.augmentations.spectrogram_augmentations.FrequencyMasking import FrequencyMasking
|
3 |
+
|
4 |
+
__all__ = [
|
5 |
+
"TimeMasking",
|
6 |
+
"FrequencyMasking"
|
7 |
+
]
|
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/FrequencyMasking.cpython-311.pyc
ADDED
Binary file (1.19 kB). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/TimeMasking.cpython-311.pyc
ADDED
Binary file (1.17 kB). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (190 Bytes). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (474 Bytes). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/AddColoredNoise.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch_audiomentations
|
2 |
+
from torch import Tensor
|
3 |
+
|
4 |
+
from hw_asr.augmentations.base import AugmentationBase
|
5 |
+
|
6 |
+
|
7 |
+
class AddColoredNoise(AugmentationBase):
    """Adds colored noise via ``torch_audiomentations.AddColoredNoise``.

    A channel dim is inserted at index 1 before the transform and removed
    after it (the underlying library presumably expects
    (batch, channels, time) input — confirm against callers).
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.AddColoredNoise(*args, **kwargs)

    def __call__(self, data: Tensor):
        return self._aug(data.unsqueeze(1)).squeeze(1)
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Gain.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch_audiomentations
|
2 |
+
from torch import Tensor
|
3 |
+
|
4 |
+
from hw_asr.augmentations.base import AugmentationBase
|
5 |
+
|
6 |
+
|
7 |
+
class Gain(AugmentationBase):
    """Random gain via ``torch_audiomentations.Gain``.

    A channel dim is inserted at index 1 before the transform and removed after.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.Gain(*args, **kwargs)

    def __call__(self, data: Tensor):
        return self._aug(data.unsqueeze(1)).squeeze(1)
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/HighPassFilter.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch_audiomentations
|
2 |
+
from torch import Tensor
|
3 |
+
|
4 |
+
from hw_asr.augmentations.base import AugmentationBase
|
5 |
+
|
6 |
+
|
7 |
+
class HighPassFilter(AugmentationBase):
    """High-pass filtering via ``torch_audiomentations.HighPassFilter``.

    A channel dim is inserted at index 1 before the transform and removed after.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.HighPassFilter(*args, **kwargs)

    def __call__(self, data: Tensor):
        return self._aug(data.unsqueeze(1)).squeeze(1)
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/LowPassFilter.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch_audiomentations
|
2 |
+
from torch import Tensor
|
3 |
+
|
4 |
+
from hw_asr.augmentations.base import AugmentationBase
|
5 |
+
|
6 |
+
|
7 |
+
class LowPassFilter(AugmentationBase):
    """Low-pass filtering via ``torch_audiomentations.LowPassFilter``.

    A channel dim is inserted at index 1 before the transform and removed after.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.LowPassFilter(*args, **kwargs)

    def __call__(self, data: Tensor):
        return self._aug(data.unsqueeze(1)).squeeze(1)
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Padding.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch_audiomentations
|
2 |
+
from torch import Tensor
|
3 |
+
|
4 |
+
from hw_asr.augmentations.base import AugmentationBase
|
5 |
+
|
6 |
+
|
7 |
+
class Padding(AugmentationBase):
    """Padding augmentation via ``torch_audiomentations.Padding``.

    A channel dim is inserted at index 1 before the transform and removed after.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.Padding(*args, **kwargs)

    def __call__(self, data: Tensor):
        return self._aug(data.unsqueeze(1)).squeeze(1)
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PitchShift.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch_audiomentations
|
2 |
+
from torch import Tensor
|
3 |
+
|
4 |
+
from hw_asr.augmentations.base import AugmentationBase
|
5 |
+
|
6 |
+
|
7 |
+
class PitchShift(AugmentationBase):
    """Pitch shifting via ``torch_audiomentations.PitchShift``.

    A channel dim is inserted at index 1 before the transform and removed after.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.PitchShift(*args, **kwargs)

    def __call__(self, data: Tensor):
        return self._aug(data.unsqueeze(1)).squeeze(1)
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PolarityInversion.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch_audiomentations
|
2 |
+
from torch import Tensor
|
3 |
+
|
4 |
+
from hw_asr.augmentations.base import AugmentationBase
|
5 |
+
|
6 |
+
|
7 |
+
class PolarityInversion(AugmentationBase):
    """Waveform polarity inversion via ``torch_audiomentations.PolarityInversion``.

    A channel dim is inserted at index 1 before the transform and removed after.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.PolarityInversion(*args, **kwargs)

    def __call__(self, data: Tensor):
        return self._aug(data.unsqueeze(1)).squeeze(1)
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Shift.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch_audiomentations
|
2 |
+
from torch import Tensor
|
3 |
+
|
4 |
+
from hw_asr.augmentations.base import AugmentationBase
|
5 |
+
|
6 |
+
|
7 |
+
class Shift(AugmentationBase):
    """Time shifting via ``torch_audiomentations.Shift``.

    A channel dim is inserted at index 1 before the transform and removed after.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.Shift(*args, **kwargs)

    def __call__(self, data: Tensor):
        return self._aug(data.unsqueeze(1)).squeeze(1)
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__init__.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from hw_asr.augmentations.wave_augmentations.AddColoredNoise import AddColoredNoise
from hw_asr.augmentations.wave_augmentations.Gain import Gain
from hw_asr.augmentations.wave_augmentations.HighPassFilter import HighPassFilter
from hw_asr.augmentations.wave_augmentations.LowPassFilter import LowPassFilter
# from hw_asr.augmentations.wave_augmentations.Padding import Padding
from hw_asr.augmentations.wave_augmentations.PitchShift import PitchShift
from hw_asr.augmentations.wave_augmentations.PolarityInversion import PolarityInversion
from hw_asr.augmentations.wave_augmentations.Shift import Shift

__all__ = [
    "AddColoredNoise",
    "Gain",
    # Bug fix: a missing comma after "HighPassFilter" made Python concatenate
    # the two adjacent string literals into "HighPassFilterLowPassFilter",
    # silently dropping both names from `from ... import *`.
    "HighPassFilter",
    "LowPassFilter",
    # "Padding",
    "PitchShift",
    "PolarityInversion",
    "Shift",
]
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-310.pyc
ADDED
Binary file (863 Bytes). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-311.pyc
ADDED
Binary file (1.23 kB). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-310.pyc
ADDED
Binary file (819 Bytes). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-311.pyc
ADDED
Binary file (1.19 kB). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-310.pyc
ADDED
Binary file (859 Bytes). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-311.pyc
ADDED
Binary file (1.23 kB). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-310.pyc
ADDED
Binary file (855 Bytes). View file
|
|
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-311.pyc
ADDED
Binary file (1.22 kB). View file
|
|