diff --git a/automatic-speech-recognition/Dockerfile b/automatic-speech-recognition/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..f2d36ccf9701bda2c40a81fae8ae064c45402b13 --- /dev/null +++ b/automatic-speech-recognition/Dockerfile @@ -0,0 +1,15 @@ +FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-devel +WORKDIR /repos/asr_project_template + +# Install requirements for torchaudio +RUN pip install sox && conda install torchaudio==0.11.0 -c pytorch && conda install -c conda-forge librosa + +# Install requirements +COPY requirements.txt ./ +RUN pip install -r requirements.txt + +# Copy the contents of the repository +COPY . . + +# Expose port +EXPOSE 3000 \ No newline at end of file diff --git a/automatic-speech-recognition/LICENSE b/automatic-speech-recognition/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..bb62e4c42ecc1094e674db1a0c98696bb7e48ef4 --- /dev/null +++ b/automatic-speech-recognition/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Daniil Ivanov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/automatic-speech-recognition/README.md b/automatic-speech-recognition/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2dca3a3410855cf9e04f1c3b0dcadfc24a4adeb0 --- /dev/null +++ b/automatic-speech-recognition/README.md @@ -0,0 +1,21 @@ +# ASR project barebones + +## Installation guide + +1. `pip install -r ./requirements.txt` +2. Download `3-gram.arpa.gz` and `librispeech-vocab.txt` from http://www.openslr.org/11/ +3. Run `python hw_asr/text_encoder/fix_vocab.py` and `python hw_asr/text_encoder/lower_model.py` to prepare the vocab and the model for use +4. If you want to test my model, download it from https://drive.google.com/file/d/1QrSsx56V5YNjGHUBWy6CIRVbNbjKWUpJ/view?usp=share_link , name it `checkpoint.pth`, and place it in the `default_test_model/` directory + +## Train + +1. `python train.py --config hw_asr/configs/config2.json` + +## Test + +1. `python test.py -c default_test_config.json -r default_test_model/checkpoint.pth` + +## Wandb report + +1. 
You can check my wandb report (only on Russian) and wandb project from the https://wandb.ai/tgritsaev/asr_project/reports/DLA-HW-1--Vmlldzo1NzY3NjA5?accessToken=kotkj5oyzomf2d2g1f40mczdnpirwvuw1f538zx9k491g1cfh3wg9iwhsb65o054 + diff --git a/automatic-speech-recognition/checkpoint.pth b/automatic-speech-recognition/checkpoint.pth new file mode 100644 index 0000000000000000000000000000000000000000..daf31cbefb1e62f163627551c41883c4982ab1d9 --- /dev/null +++ b/automatic-speech-recognition/checkpoint.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f1bebf1c95bb69c3130757b652c9fa4a975302b142a0b3d779d36ba404905ac +size 333205079 diff --git a/automatic-speech-recognition/default_test_config.json b/automatic-speech-recognition/default_test_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fd875a1e4926c8567a262c4148a9594db008134d --- /dev/null +++ b/automatic-speech-recognition/default_test_config.json @@ -0,0 +1,188 @@ +{ + "name": "default_test_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0.6, + "wave": [ + { + "type": "AddColoredNoise", + "args": { + "p": 1, + "sample_rate": 16000 + } + }, + { + "type": "Gain", + "args": { + "p": 0.8, + "sample_rate": 16000 + } + }, + { + "type": "HighPassFilter", + "args": { + "p": 0, + "sample_rate": 16000 + } + }, + { + "type": "LowPassFilter", + "args": { + "p": 0, + "sample_rate": 16000 + } + }, + { + "type": "PitchShift", + "args": { + "p": 0.8, + "min_transpose_semitones": -2, + "max_transpose_semitones": 2, + "sample_rate": 16000 + } + }, + { + "type": "PolarityInversion", + "args": { + "p": 0.8, + "sample_rate": 16000 + } + }, + { + "type": "Shift", + "args": { + "p": 0.8, + "sample_rate": 16000 + } + } + ], + "spectrogram": [ + { + "type": "TimeMasking", + "args": { + "time_mask_param": 80, + "p": 0.05 + } + }, + { + "type": "FrequencyMasking", + "args": { + "freq_mask_param": 80 + } + } + ] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 6, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "test": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 0.0003, + "weight_decay": 1e-05 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 4, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 4, + "name": "CER (beam search)" + } + }, + { + "type": "LanguageModelWERMetric", + "args": { + "name": "WER (LM)" + } + }, + { + "type": "LanguageModelCERMetric", + "args": { + "name": "CER (LM)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 0.0003, + "pct_start": 0.1 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + 
"save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} \ No newline at end of file diff --git a/automatic-speech-recognition/default_test_model/config.json b/automatic-speech-recognition/default_test_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e612026c1295aff58f8af408041e8178a8d7893b --- /dev/null +++ b/automatic-speech-recognition/default_test_model/config.json @@ -0,0 +1,242 @@ +{ + "name": "default_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0.6, + "wave": [ + { + "type": "AddColoredNoise", + "args": { + "p": 1, + "sample_rate": 16000 + } + }, + { + "type": "Gain", + "args": { + "p": 0.8, + "sample_rate": 16000 + } + }, + { + "type": "HighPassFilter", + "args": { + "p": 0, + "sample_rate": 16000 + } + }, + { + "type": "LowPassFilter", + "args": { + "p": 0, + "sample_rate": 16000 + } + }, + { + "type": "PitchShift", + "args": { + "p": 0.8, + "min_transpose_semitones": -2, + "max_transpose_semitones": 2, + "sample_rate": 16000 + } + }, + { + "type": "PolarityInversion", + "args": { + "p": 0.8, + "sample_rate": 16000 + } + }, + { + "type": "Shift", + "args": { + "p": 0.8, + "sample_rate": 16000 + } + } + ], + "spectrogram": [ + { + "type": "TimeMasking", + "args": { + "time_mask_param": 80, + "p": 0.05 + } + }, + { + "type": "FrequencyMasking", + "args": { + "freq_mask_param": 80 + } + } + ] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 6, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "train": { + "batch_size": 128, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-100", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-360", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-other-500", + "max_audio_length": 40.0, + "max_text_length": 400 + } + } + ] + }, + "val": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean" + } + } + ] + }, + "test-other": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + }, + "test-clean": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-clean" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 0.0003, + "weight_decay": 1e-05 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 4, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 4, + "name": "CER (beam search)" + } + }, + { + "type": 
"LanguageModelWERMetric", + "args": { + "name": "WER (LM)" + } + }, + { + "type": "LanguageModelCERMetric", + "args": { + "name": "CER (LM)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 0.0003, + "pct_start": 0.1 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/__init__.py b/automatic-speech-recognition/hw_asr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ca1e3113f0f8798d4b118037aee5326ef566aa5 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cafeb295d339a2b7caa7ffe80b365cf5335278f9 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__init__.py b/automatic-speech-recognition/hw_asr/augmentations/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b2633e40c374f42c7dd9abd385a2d51705b9aae --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/__init__.py @@ -0,0 +1,36 @@ +from collections.abc import Callable +from typing import List + +import hw_asr.augmentations.spectrogram_augmentations +import hw_asr.augmentations.wave_augmentations +from hw_asr.augmentations.random_choice import RandomChoice +from hw_asr.augmentations.sequential_random_apply import SequentialRandomApply +# from hw_asr.augmentations.sequential import SequentialAugmentation +# from hw_asr.augmentations.random_apply import RandomApply +from hw_asr.utils.parse_config import ConfigParser + + +def from_configs(configs: ConfigParser): + wave_augs = [] + if "augmentations" in configs.config and "wave" in configs.config["augmentations"]: + for aug_dict in configs.config["augmentations"]["wave"]: + wave_augs.append( + configs.init_obj(aug_dict, hw_asr.augmentations.wave_augmentations) + ) + + spec_augs = [] + if "augmentations" in configs.config and "spectrogram" in configs.config["augmentations"]: + for aug_dict in configs.config["augmentations"]["spectrogram"]: + spec_augs.append( + configs.init_obj(aug_dict, hw_asr.augmentations.spectrogram_augmentations) + ) + return _to_function(RandomChoice, wave_augs, configs.config["augmentations"]["random_apply_p"]), _to_function(SequentialRandomApply, spec_augs, configs.config["augmentations"]["random_apply_p"]) + + +def _to_function(random_type, augs_list: List[Callable], p: float): + if len(augs_list) == 0: + return None + elif len(augs_list) == 1: + return augs_list[0] + else: + return random_type(augs_list, p) diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-310.pyc 
b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e048fea1094265d8f745c5da034deacf07790a21 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c39bc9c81aa2a387d268dc520ffbda88d2f4b556 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cec98a3d5c51b722ca59bb176d5e2710a0815587 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..433fc8be1385f2385e5bb9ba597d8aeeb57e0f0e Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a4b3f20c5522ef5948529c2d32dbdebe209394d Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e93d7f7a13c35970041cee925d2f3c8cf005da0 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51312b70f24eb8e9ab260f403ddfa4d86bb1e579 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2fa1af459f593cdb6f33250dd04127f3a05cab79 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d92848f8e23af02631a05da4e4a6c8dc7134dc8f Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f95b354dbffddda7d240c8d02999a2d4cc9a63bb Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential_random_apply.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential_random_apply.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b9cc0898e03570248d616145595c89041cd0714 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential_random_apply.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/base.py b/automatic-speech-recognition/hw_asr/augmentations/base.py new file mode 100644 index 0000000000000000000000000000000000000000..026818f5dc83d8377dab75e64ed584ca24822563 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/base.py @@ -0,0 +1,6 @@ +from torch import Tensor + + +class AugmentationBase: + def __call__(self, data: Tensor) -> Tensor: + raise NotImplementedError() diff --git a/automatic-speech-recognition/hw_asr/augmentations/random_apply.py b/automatic-speech-recognition/hw_asr/augmentations/random_apply.py new file mode 100644 index 0000000000000000000000000000000000000000..985a63017a2665d28150bb83b5549877f52092cc --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/random_apply.py @@ -0,0 +1,16 @@ +import random +from typing import Callable +from torch import Tensor + + +class RandomApply: + def __init__(self, augmentation: Callable, p: float): + assert 0 <= p <= 1 + self.augmentation = augmentation + self.p = p + + def __call__(self, data: Tensor) -> Tensor: + if random.random() < self.p: + return self.augmentation(data) + else: + return data diff --git a/automatic-speech-recognition/hw_asr/augmentations/random_choice.py b/automatic-speech-recognition/hw_asr/augmentations/random_choice.py new file mode 100644 index 0000000000000000000000000000000000000000..043ff53d6c5cd5d6bc42378078dd4e992833773d --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/random_choice.py @@ -0,0 +1,17 @@ +from typing import List, Callable +from torch import Tensor +import random +from hw_asr.augmentations.base import AugmentationBase + + +class RandomChoice(AugmentationBase): + def __init__(self, augmentation_list: List[Callable], p: float): + self.augmentation_list = augmentation_list + self.p = p + + def __call__(self, data: Tensor) -> Tensor: + x = data + if random.random() < self.p: + augmentation = random.choice(self.augmentation_list) + x = augmentation(x) + return x \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/augmentations/sequential.py b/automatic-speech-recognition/hw_asr/augmentations/sequential.py new file mode 100644 index 0000000000000000000000000000000000000000..b967a3b5390a7ab6cdb1e4fd9653cee96148a5f6 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/sequential.py @@ -0,0 +1,16 @@ +from typing import List, Callable + +from torch 
import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class SequentialAugmentation(AugmentationBase): + def __init__(self, augmentation_list: List[Callable]): + self.augmentation_list = augmentation_list + + def __call__(self, data: Tensor) -> Tensor: + x = data + for augmentation in self.augmentation_list: + x = augmentation(x) + return x \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/augmentations/sequential_random_apply.py b/automatic-speech-recognition/hw_asr/augmentations/sequential_random_apply.py new file mode 100644 index 0000000000000000000000000000000000000000..2dafca7bc00d33e5afd51e7a22530f674583d6e6 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/sequential_random_apply.py @@ -0,0 +1,17 @@ +from typing import List, Callable +from torch import Tensor +import random +from hw_asr.augmentations.base import AugmentationBase + + +class SequentialRandomApply(AugmentationBase): + def __init__(self, augmentation_list: List[Callable], p: float = 0.5): + self.augmentation_list = augmentation_list + self.p = p + + def __call__(self, data: Tensor) -> Tensor: + x = data + for augmentation in self.augmentation_list: + if random.random() < self.p: + x = augmentation(x) + return x \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/FrequencyMasking.py b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/FrequencyMasking.py new file mode 100644 index 0000000000000000000000000000000000000000..6a7f963e1fce42d769f0848af52782fdd7182cee --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/FrequencyMasking.py @@ -0,0 +1,11 @@ +from torch import Tensor +from hw_asr.augmentations.base import AugmentationBase +from torchaudio import transforms + + +class FrequencyMasking(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = transforms.FrequencyMasking(*args, **kwargs) + + def __call__(self, spectogram: Tensor): + return self._aug(spectogram).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/TimeMasking.py b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/TimeMasking.py new file mode 100644 index 0000000000000000000000000000000000000000..fc39a795698619269be7414221b630e8cd1d1f47 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/TimeMasking.py @@ -0,0 +1,11 @@ +from torch import Tensor +from hw_asr.augmentations.base import AugmentationBase +from torchaudio import transforms + + +class TimeMasking(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = transforms.TimeMasking(*args, **kwargs) + + def __call__(self, spectogram: Tensor): + return self._aug(spectogram).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__init__.py b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5f80941b9437c9cc495eb496f2c5a6580aa82e2e --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__init__.py @@ -0,0 +1,7 @@ +from hw_asr.augmentations.spectrogram_augmentations.TimeMasking import TimeMasking +from hw_asr.augmentations.spectrogram_augmentations.FrequencyMasking import FrequencyMasking + +__all__ = [ + "TimeMasking", + "FrequencyMasking" +] diff --git 
a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/FrequencyMasking.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/FrequencyMasking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0cc3dddf41cd6c59b2dbe69861ae6a4a5f5a9b95 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/FrequencyMasking.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/TimeMasking.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/TimeMasking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76ef77a3f88d71c472fac33248e17ee455e75853 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/TimeMasking.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d378ded94e862a2ed63a5e6e878210ecbcb61220 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..440c4cd03868c2210656a5ca815d6f0508adf612 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/AddColoredNoise.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/AddColoredNoise.py new file mode 100644 index 0000000000000000000000000000000000000000..ff654a501f5d4cf5d66b3a8cde5c6698ab41741c --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/AddColoredNoise.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class AddColoredNoise(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.AddColoredNoise(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Gain.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Gain.py new file mode 100644 index 0000000000000000000000000000000000000000..88f3f6dbb3a817e0b404904419c4a6d8a3b36b26 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Gain.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class Gain(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.Gain(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff 
--git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/HighPassFilter.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/HighPassFilter.py new file mode 100644 index 0000000000000000000000000000000000000000..f5dde3a26748384cd5f441aada4276ab96ca8828 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/HighPassFilter.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class HighPassFilter(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.HighPassFilter(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/LowPassFilter.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/LowPassFilter.py new file mode 100644 index 0000000000000000000000000000000000000000..7c5f58e514d3f675ab0c3532983c651dfa330de9 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/LowPassFilter.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class LowPassFilter(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.LowPassFilter(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Padding.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Padding.py new file mode 100644 index 0000000000000000000000000000000000000000..4008d20f71c33d143f4baec310c30017e26bfed8 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Padding.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class Padding(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.Padding(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PitchShift.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PitchShift.py new file mode 100644 index 0000000000000000000000000000000000000000..d3cb67e5de1a4294f8e13ddcc5cda6e09465222d --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PitchShift.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class PitchShift(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.PitchShift(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PolarityInversion.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PolarityInversion.py new file mode 100644 index 0000000000000000000000000000000000000000..4d2788346731e47b07286e45ad09f454dee68b2a --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PolarityInversion.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import 
Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class PolarityInversion(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.PolarityInversion(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Shift.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Shift.py new file mode 100644 index 0000000000000000000000000000000000000000..441b0187e569628a4136dbc8434d35c5028ddafb --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Shift.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class Shift(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.Shift(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__init__.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..746022b0d0066dc3365e007ecabc11394cc96fea --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__init__.py @@ -0,0 +1,19 @@ +from hw_asr.augmentations.wave_augmentations.AddColoredNoise import AddColoredNoise +from hw_asr.augmentations.wave_augmentations.Gain import Gain +from hw_asr.augmentations.wave_augmentations.HighPassFilter import HighPassFilter +from hw_asr.augmentations.wave_augmentations.LowPassFilter import LowPassFilter +# from hw_asr.augmentations.wave_augmentations.Padding import Padding +from hw_asr.augmentations.wave_augmentations.PitchShift import PitchShift +from hw_asr.augmentations.wave_augmentations.PolarityInversion import PolarityInversion +from hw_asr.augmentations.wave_augmentations.Shift import Shift + +__all__ = [ + "AddColoredNoise", + "Gain", + "HighPassFilter" + "LowPassFilter", + # "Padding", + "PitchShift", + "PolarityInversion", + "Shift" +] diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5abf2fea3297af819e28219eee23c7ea2523172 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a7f27d9b25f976858f42a165d93159ed9b5e980 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eacf781c8875384d6705e28caf69dced4c1e3851 
Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f573057cde5c0e871f124da520b973ea342f9b Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cacf98fa7f2e359c5e8dd5ff0aeb5e0dea12ed4b Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..694cea18a30ed20a4c03eecab5dfe0c9fa017462 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6dc950d6ba96dcf65207c4a52db7d1fe6f2fa2be Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca73b85fcd8f67117072873212d58f2d38623c54 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PitchShift.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PitchShift.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c9e1a440d0212ea5727fc76bf5bfd43e1c3eed5 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PitchShift.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PitchShift.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PitchShift.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a78ba0726524138caba8d386fbcb9c289c420b21 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PitchShift.cpython-311.pyc 
differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PolarityInversion.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PolarityInversion.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..532ce7a9d1db241d58551cfd8a264cabeb37e2fc Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PolarityInversion.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PolarityInversion.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PolarityInversion.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..856c09224efc085a52c5b7fba5f03eec8fa55fd5 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PolarityInversion.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Shift.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Shift.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b12f5a231a91589d106a9975a078d2d9e69f2e32 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Shift.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Shift.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Shift.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa26edeffe3a25565f267e982725d833111526b2 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Shift.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea0479ecf98a0349b79e573e6e5942c58098a94a Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26e317d069404da3e0227a6fbb6ef4fddd1c21f9 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__init__.py b/automatic-speech-recognition/hw_asr/base/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d9d437ad774e9499e8246af6f3c1c76418f08f2c --- /dev/null +++ b/automatic-speech-recognition/hw_asr/base/__init__.py @@ -0,0 +1,2 @@ +from .base_model import * +from .base_trainer import * diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..7cc6d0a6618b95ee8453115bf03a0978c6f0bbbe Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b83fc5c17c5ec9e038940080c7164961347c604 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_dataset.cpython-310.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf2fd749340c6ee9f6055ea6a61458a3323a90f0 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_dataset.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_dataset.cpython-311.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2590b3a1f5fa9229702508e986aada96270edbcb Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_dataset.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_metric.cpython-310.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_metric.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1efeabd3851524fa7ddc3adc7372612c1c49a0f Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_metric.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_metric.cpython-311.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_metric.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1faa58ee85ee1bc46f9474a12dee9316a3b88c83 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_metric.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_model.cpython-310.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc77c11ae48002f45618f3393daf264a75870582 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_model.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_model.cpython-311.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..511cc81c3eb6a3f4d90529db1e2ce9abf2c0780a Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_model.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_text_encoder.cpython-310.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_text_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f267f0f66a2ad5e509efdcefd42c8e473db368f Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_text_encoder.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_text_encoder.cpython-311.pyc 
b/automatic-speech-recognition/hw_asr/base/__pycache__/base_text_encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..388a00a9a3a4405d8c9e3015f57781d23815d0a9 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_text_encoder.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_trainer.cpython-310.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_trainer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..abd1abd943d4cb76cd9e71048a191a815a8fe8c5 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_trainer.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_trainer.cpython-311.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f51c655392f4209de6c6ac4b8687672b104902eb Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_trainer.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/base_dataset.py b/automatic-speech-recognition/hw_asr/base/base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2413c0139af9de3afe495d816c10d5fd5b6f6842 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/base/base_dataset.py @@ -0,0 +1,145 @@ +import logging +import random +from typing import List + +import numpy as np +import torch +import torchaudio +from torch import Tensor +from torch.utils.data import Dataset + +from hw_asr.base.base_text_encoder import BaseTextEncoder +from hw_asr.utils.parse_config import ConfigParser + +logger = logging.getLogger(__name__) + + +class BaseDataset(Dataset): + def __init__( + self, + index, + text_encoder: BaseTextEncoder, + config_parser: ConfigParser, + wave_augs=None, + spec_augs=None, + limit=None, + max_audio_length=None, + max_text_length=None, + ): + self.text_encoder = text_encoder + self.config_parser = config_parser + self.wave_augs = wave_augs + self.spec_augs = spec_augs + self.log_spec = config_parser["preprocessing"]["log_spec"] + + self._assert_index_is_valid(index) + index = self._filter_records_from_dataset(index, max_audio_length, max_text_length, limit) + # it's a good idea to sort index by audio length + # It would be easier to write length-based batch samplers later + index = self._sort_index(index) + self._index: List[dict] = index + + def __getitem__(self, ind): + data_dict = self._index[ind] + audio_path = data_dict["path"] + audio_wave = self.load_audio(audio_path) + audio_wave, audio_spec = self.process_wave(audio_wave) + return { + "audio": audio_wave, + "spectrogram": audio_spec, + "duration": audio_wave.size(1) / self.config_parser["preprocessing"]["sr"], + "text": data_dict["text"], + "text_encoded": self.text_encoder.encode(data_dict["text"]), + "audio_path": audio_path, + } + + @staticmethod + def _sort_index(index): + return sorted(index, key=lambda x: x["audio_len"]) + + def __len__(self): + return len(self._index) + + def load_audio(self, path): + audio_tensor, sr = torchaudio.load(path) + audio_tensor = audio_tensor[0:1, :] # remove all channels but the first + target_sr = self.config_parser["preprocessing"]["sr"] + if sr != target_sr: + audio_tensor = torchaudio.functional.resample(audio_tensor, sr, target_sr) + return audio_tensor + + def process_wave(self, audio_tensor_wave: Tensor): + with torch.no_grad(): + 
if self.wave_augs is not None: + audio_tensor_wave = self.wave_augs(audio_tensor_wave) + wave2spec = self.config_parser.init_obj( + self.config_parser["preprocessing"]["spectrogram"], + torchaudio.transforms, + ) + audio_tensor_spec = wave2spec(audio_tensor_wave) + if self.spec_augs is not None: + audio_tensor_spec = self.spec_augs(audio_tensor_spec) + if self.log_spec: + audio_tensor_spec = torch.log(audio_tensor_spec + 1e-5) + return audio_tensor_wave, audio_tensor_spec + + @staticmethod + def _filter_records_from_dataset( + index: list, max_audio_length, max_text_length, limit + ) -> list: + initial_size = len(index) + if max_audio_length is not None: + exceeds_audio_length = np.array([el["audio_len"] for el in index]) >= max_audio_length + _total = exceeds_audio_length.sum() + logger.info( + f"{_total} ({_total / initial_size:.1%}) records are longer then " + f"{max_audio_length} seconds. Excluding them." + ) + else: + exceeds_audio_length = False + + initial_size = len(index) + if max_text_length is not None: + exceeds_text_length = ( + np.array( + [len(BaseTextEncoder.normalize_text(el["text"])) for el in index] + ) + >= max_text_length + ) + _total = exceeds_text_length.sum() + logger.info( + f"{_total} ({_total / initial_size:.1%}) records are longer then " + f"{max_text_length} characters. Excluding them." + ) + else: + exceeds_text_length = False + + records_to_filter = exceeds_text_length | exceeds_audio_length + + if records_to_filter is not False and records_to_filter.any(): + _total = records_to_filter.sum() + index = [el for el, exclude in zip(index, records_to_filter) if not exclude] + logger.info( + f"Filtered {_total}({_total / initial_size:.1%}) records from dataset" + ) + + if limit is not None: + random.seed(42) # best seed for deep learning + random.shuffle(index) + index = index[:limit] + return index + + @staticmethod + def _assert_index_is_valid(index): + for entry in index: + assert "audio_len" in entry, ( + "Each dataset item should include field 'audio_len'" + " - duration of audio (in seconds)." + ) + assert "path" in entry, ( + "Each dataset item should include field 'path'" " - path to audio file." + ) + assert "text" in entry, ( + "Each dataset item should include field 'text'" + " - text transcription of the audio." + ) diff --git a/automatic-speech-recognition/hw_asr/base/base_metric.py b/automatic-speech-recognition/hw_asr/base/base_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..8db32942c9c8755e9405dab54f79d8f99763e518 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/base/base_metric.py @@ -0,0 +1,6 @@ +class BaseMetric: + def __init__(self, name=None, *args, **kwargs): + self.name = name if name is not None else type(self).__name__ + + def __call__(self, **batch): + raise NotImplementedError() diff --git a/automatic-speech-recognition/hw_asr/base/base_model.py b/automatic-speech-recognition/hw_asr/base/base_model.py new file mode 100644 index 0000000000000000000000000000000000000000..cfde50c8ba874baca4e66c9eb7b9b37f10f59abe --- /dev/null +++ b/automatic-speech-recognition/hw_asr/base/base_model.py @@ -0,0 +1,41 @@ +from abc import abstractmethod +from typing import Union + +import numpy as np +import torch.nn as nn +from torch import Tensor + + +class BaseModel(nn.Module): + """ + Base class for all models + """ + + def __init__(self, n_feats, n_class, **batch): + super().__init__() + + @abstractmethod + def forward(self, **batch) -> Union[Tensor, dict]: + """ + Forward pass logic. 
+ Can return a torch.Tensor (it will be interpreted as logits) or a dict. + + :return: Model output + """ + raise NotImplementedError() + + def __str__(self): + """ + Model prints with number of trainable parameters + """ + model_parameters = filter(lambda p: p.requires_grad, self.parameters()) + params = sum([np.prod(p.size()) for p in model_parameters]) + return super().__str__() + "\nTrainable parameters: {}".format(params) + + def transform_input_lengths(self, input_lengths): + """ + Input length transformation function. + For example: if your NN transforms spectrogram of time-length `N` into an + output with time-length `N / 2`, then this function should return `input_lengths // 2` + """ + raise NotImplementedError() diff --git a/automatic-speech-recognition/hw_asr/base/base_text_encoder.py b/automatic-speech-recognition/hw_asr/base/base_text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..412f8cdac2d37089710712727deda33d3ed04f69 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/base/base_text_encoder.py @@ -0,0 +1,25 @@ +import re +from typing import List, Union + +import numpy as np +from torch import Tensor + + +class BaseTextEncoder: + def encode(self, text) -> Tensor: + raise NotImplementedError() + + def decode(self, vector: Union[Tensor, np.ndarray, List[int]]): + raise NotImplementedError() + + def __len__(self): + raise NotImplementedError() + + def __getitem__(self, item: int) -> str: + raise NotImplementedError() + + @staticmethod + def normalize_text(text: str): + text = text.lower() + text = re.sub(r"[^a-z ]", "", text) + return text diff --git a/automatic-speech-recognition/hw_asr/base/base_trainer.py b/automatic-speech-recognition/hw_asr/base/base_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..dc4cf2d8a727a56c139c50b6e24d227242a16716 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/base/base_trainer.py @@ -0,0 +1,205 @@ +from abc import abstractmethod + +import torch +from numpy import inf + +from hw_asr.base import BaseModel +from hw_asr.logger import get_visualizer + + +class BaseTrainer: + """ + Base class for all trainers + """ + + def __init__(self, model: BaseModel, criterion, metrics, optimizer, config, device): + self.device = device + self.config = config + self.logger = config.get_logger("trainer", config["trainer"]["verbosity"]) + + self.model = model + self.criterion = criterion + self.metrics = metrics + self.optimizer = optimizer + + # for interrupt saving + self._last_epoch = 0 + + cfg_trainer = config["trainer"] + self.epochs = cfg_trainer["epochs"] + self.save_period = cfg_trainer["save_period"] + self.monitor = cfg_trainer.get("monitor", "off") + + # configuration to monitor model performance and save best + if self.monitor == "off": + self.mnt_mode = "off" + self.mnt_best = 0 + else: + self.mnt_mode, self.mnt_metric = self.monitor.split() + assert self.mnt_mode in ["min", "max"] + + self.mnt_best = inf if self.mnt_mode == "min" else -inf + self.early_stop = cfg_trainer.get("early_stop", inf) + if self.early_stop <= 0: + self.early_stop = inf + + self.start_epoch = 1 + + self.checkpoint_dir = config.save_dir + + # setup visualization writer instance + self.writer = get_visualizer( + config, self.logger, cfg_trainer["visualize"] + ) + + if config.resume is not None: + self._load_model(config.resume) + + @abstractmethod + def _train_epoch(self, epoch): + """ + Training logic for an epoch + + :param epoch: Current epoch number + """ + raise NotImplementedError() + + def 
train(self): + try: + self._train_process() + except KeyboardInterrupt as e: + self.logger.info("Saving model on keyboard interrupt") + self._save_checkpoint(self._last_epoch, save_best=False) + raise e + + def _train_process(self): + """ + Full training logic + """ + not_improved_count = 0 + for epoch in range(self.start_epoch, self.epochs + 1): + self._last_epoch = epoch + result = self._train_epoch(epoch) + + # save logged informations into log dict + log = {"epoch": epoch} + log.update(result) + + # print logged informations to the screen + for key, value in log.items(): + self.logger.info(" {:15s}: {}".format(str(key), value)) + + # evaluate model performance according to configured metric, + # save best checkpoint as model_best + best = False + if self.mnt_mode != "off": + try: + # check whether model performance improved or not, + # according to specified metric(mnt_metric) + if self.mnt_mode == "min": + improved = log[self.mnt_metric] <= self.mnt_best + elif self.mnt_mode == "max": + improved = log[self.mnt_metric] >= self.mnt_best + else: + improved = False + except KeyError: + self.logger.warning( + "Warning: Metric '{}' is not found. " + "Model performance monitoring is disabled.".format( + self.mnt_metric + ) + ) + self.mnt_mode = "off" + improved = False + + if improved: + self.mnt_best = log[self.mnt_metric] + not_improved_count = 0 + best = True + else: + not_improved_count += 1 + + if not_improved_count > self.early_stop: + self.logger.info( + "Validation performance didn't improve for {} epochs. " + "Training stops.".format(self.early_stop) + ) + break + + if epoch % self.save_period == 0 or best: + self._save_checkpoint(epoch, save_best=best, only_best=True) + + def _save_checkpoint(self, epoch, save_best=False, only_best=False): + """ + Saving checkpoints + + :param epoch: current epoch number + :param save_best: if True, rename the saved checkpoint to 'model_best.pth' + """ + arch = type(self.model).__name__ + state = { + "arch": arch, + "epoch": epoch, + "state_dict": self.model.state_dict(), + "optimizer": self.optimizer.state_dict(), + "monitor_best": self.mnt_best, + "config": self.config, + } + filename = str(self.checkpoint_dir / "checkpoint-epoch{}.pth".format(epoch)) + if not (only_best and save_best): + torch.save(state, filename) + self.logger.info("Saving checkpoint: {} ...".format(filename)) + if save_best: + best_path = str(self.checkpoint_dir / "model_best.pth") + torch.save(state, best_path) + self.logger.info("Saving current best: model_best.pth ...") + + def _load_model(self, resume_path): + """ + Resume from saved checkpoints + + :param resume_path: Checkpoint path to be resumed + """ + resume_path = str(resume_path) + self.logger.info("Loading model: {} ...".format(resume_path)) + checkpoint = torch.load(resume_path, self.device) + self.model.load_state_dict(checkpoint["state_dict"]) + + self.logger.info("Model loaded") + + + def _resume_checkpoint(self, resume_path): + """ + Resume from saved checkpoints + + :param resume_path: Checkpoint path to be resumed + """ + resume_path = str(resume_path) + self.logger.info("Loading checkpoint: {} ...".format(resume_path)) + checkpoint = torch.load(resume_path, self.device) + self.start_epoch = checkpoint["epoch"] + 1 + self.mnt_best = checkpoint["monitor_best"] + + # load architecture params from checkpoint. + if checkpoint["config"]["arch"] != self.config["arch"]: + self.logger.warning( + "Warning: Architecture configuration given in config file is different from that " + "of checkpoint. 
This may yield an exception while state_dict is being loaded." + ) + self.model.load_state_dict(checkpoint["state_dict"]) + + # load optimizer state from checkpoint only when optimizer type is not changed. + if ( + checkpoint["config"]["optimizer"] != self.config["optimizer"] or + checkpoint["config"]["lr_scheduler"] != self.config["lr_scheduler"] + ): + self.logger.warning( + "Warning: Optimizer or lr_scheduler given in config file is different " + "from that of checkpoint. Optimizer parameters not being resumed." + ) + else: + self.optimizer.load_state_dict(checkpoint["optimizer"]) + + self.logger.info( + "Checkpoint loaded. Resume training from epoch {}".format(self.start_epoch) + ) diff --git a/automatic-speech-recognition/hw_asr/batch_sampler/__init__.py b/automatic-speech-recognition/hw_asr/batch_sampler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3d69f87c4705d5f47a59728374b1c745ed1203ba --- /dev/null +++ b/automatic-speech-recognition/hw_asr/batch_sampler/__init__.py @@ -0,0 +1,5 @@ +from hw_asr.batch_sampler.group_sort_batch_sampler import GroupLengthBatchSampler + +__all__ = [ + "GroupLengthBatchSampler" +] diff --git a/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5489de66564ec3f47489b6dc03aec09f670aa3f4 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..866c6c3e573a34c53169ee657ad41bda2c3ff9b9 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/group_sort_batch_sampler.cpython-310.pyc b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/group_sort_batch_sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df56ba062aa68a32c4d83a95073575f66ba73eba Binary files /dev/null and b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/group_sort_batch_sampler.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/group_sort_batch_sampler.cpython-311.pyc b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/group_sort_batch_sampler.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18f791f450914ce571802475392d8eb608f56689 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/group_sort_batch_sampler.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/batch_sampler/group_sort_batch_sampler.py b/automatic-speech-recognition/hw_asr/batch_sampler/group_sort_batch_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..3b230af74e81f4c88638ef8e5b1096d517dbab36 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/batch_sampler/group_sort_batch_sampler.py @@ -0,0 +1,14 @@ +from torch.utils.data import Sampler + + +class GroupLengthBatchSampler(Sampler): + def __init__(self, data_source, batch_size, batches_per_group=20): + super().__init__(data_source) + # TODO: your code 
here (optional) + raise NotImplementedError() + + def __iter__(self): + raise NotImplementedError() + + def __len__(self): + raise NotImplementedError() diff --git a/automatic-speech-recognition/hw_asr/collate_fn/__init__.py b/automatic-speech-recognition/hw_asr/collate_fn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a5a841043471e1d71b90aae86ecb4c570b5ed5b Binary files /dev/null and b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..efcda68a7faae0a26c8a25e705f5e4722ed0380c Binary files /dev/null and b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/collate.cpython-310.pyc b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/collate.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6b8d6c932af7bbec80e14f58687890a36d0e420 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/collate.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/collate.cpython-311.pyc b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/collate.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49ac11b9328aef3caf8a8de77cd151140234a7f7 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/collate.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/collate_fn/collate.py b/automatic-speech-recognition/hw_asr/collate_fn/collate.py new file mode 100644 index 0000000000000000000000000000000000000000..17a56b545e34aabfe02b07e8e5efc5dcd4664422 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/collate_fn/collate.py @@ -0,0 +1,46 @@ +import torch +import logging +from typing import List + +logger = logging.getLogger(__name__) + + +def collate_fn(dataset_items: List[dict]): + """ + Collate and pad fields in dataset items + """ + # TODO: your code here + feature_length_dim = dataset_items[0]["spectrogram"].shape[1] + time_dim = max(dataset_items, key=lambda item: item["spectrogram"].shape[2])["spectrogram"].shape[2] + spectrogram = torch.zeros((len(dataset_items), feature_length_dim, time_dim)) + spectrogram_length = [] + + text_length_dim = max(dataset_items, key=lambda item: item["text_encoded"].shape[1])["text_encoded"].shape[1] + text_encoded = torch.zeros((len(dataset_items), text_length_dim)) + text_encoded_length = [] + text = [] + + audio_path = [] + audio = [] + for i, item in enumerate(dataset_items): + cur_time_dim = item["spectrogram"].shape[2] + spectrogram[i] = torch.cat([item["spectrogram"][0], torch.zeros((feature_length_dim, time_dim - cur_time_dim))], axis=1) + spectrogram_length.append(cur_time_dim) + + cur_text_length_dim = item["text_encoded"].shape[1] + text_encoded[i] = torch.cat([item["text_encoded"][0], torch.zeros(text_length_dim - cur_text_length_dim)]) + 
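# keep the true (unpadded) target length so the CTC loss can ignore the padding + 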
text_encoded_length.append(cur_text_length_dim) + text.append(item["text"]) + + audio_path.append(item["audio_path"]) + audio.append(item["audio"]) + + return { + "spectrogram": spectrogram, + "spectrogram_length": torch.Tensor(spectrogram_length).to(torch.int32), + "text_encoded": text_encoded, + "text_encoded_length": torch.Tensor(text_encoded_length).to(torch.int32), + "text": text, + "audio_path": audio_path, + "audio": audio, + } diff --git a/automatic-speech-recognition/hw_asr/configs/config.json b/automatic-speech-recognition/hw_asr/configs/config.json new file mode 100644 index 0000000000000000000000000000000000000000..47416448a705ad259521dba01323a9fd64691355 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/configs/config.json @@ -0,0 +1,184 @@ +{ + "name": "default_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0.6, + "wave": [ + {"type": "AddColoredNoise", "args": {"p": 1, "sample_rate": 16000}}, + {"type": "Gain", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "HighPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "LowPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "PitchShift", "args": {"p": 0.8, "min_transpose_semitones": -2, "max_transpose_semitones": 2, "sample_rate": 16000}}, + {"type": "PolarityInversion", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "Shift", "args": {"p": 0.8, "sample_rate": 16000}} + ], + "spectrogram": [] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 5, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "train": { + "batch_size": 128, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-100", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-360", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-other-500", + "max_audio_length": 40.0, + "max_text_length": 400 + } + } + ] + }, + "val": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean" + } + } + ] + }, + "test-other": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + }, + "test-clean": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-clean" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 5e-4, + "weight_decay": 1e-3 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 4, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 4, + "name": "CER (beam search)" + } + }, + { + "type": "LanguageModelWERMetric", + "args": { + "name": "WER (LM)" + } + }, + { + "type": 
"LanguageModelCERMetric", + "args": { + "name": "CER (LM)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 5e-4, + "pct_start": 0.1 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/configs/config1.json b/automatic-speech-recognition/hw_asr/configs/config1.json new file mode 100644 index 0000000000000000000000000000000000000000..47416448a705ad259521dba01323a9fd64691355 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/configs/config1.json @@ -0,0 +1,184 @@ +{ + "name": "default_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0.6, + "wave": [ + {"type": "AddColoredNoise", "args": {"p": 1, "sample_rate": 16000}}, + {"type": "Gain", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "HighPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "LowPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "PitchShift", "args": {"p": 0.8, "min_transpose_semitones": -2, "max_transpose_semitones": 2, "sample_rate": 16000}}, + {"type": "PolarityInversion", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "Shift", "args": {"p": 0.8, "sample_rate": 16000}} + ], + "spectrogram": [] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 5, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "train": { + "batch_size": 128, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-100", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-360", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-other-500", + "max_audio_length": 40.0, + "max_text_length": 400 + } + } + ] + }, + "val": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean" + } + } + ] + }, + "test-other": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + }, + "test-clean": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-clean" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 5e-4, + "weight_decay": 1e-3 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 4, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 4, + "name": "CER (beam search)" + } + }, + { + "type": "LanguageModelWERMetric", + 
"args": { + "name": "WER (LM)" + } + }, + { + "type": "LanguageModelCERMetric", + "args": { + "name": "CER (LM)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 5e-4, + "pct_start": 0.1 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/configs/config2.json b/automatic-speech-recognition/hw_asr/configs/config2.json new file mode 100644 index 0000000000000000000000000000000000000000..166d5dfe9997d27fe055bff049f8621267f5b7dd --- /dev/null +++ b/automatic-speech-recognition/hw_asr/configs/config2.json @@ -0,0 +1,189 @@ +{ + "name": "default_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0.6, + "wave": [ + {"type": "AddColoredNoise", "args": {"p": 1, "sample_rate": 16000}}, + {"type": "Gain", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "HighPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "LowPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "PitchShift", "args": {"p": 0.8, "min_transpose_semitones": -2, "max_transpose_semitones": 2, "sample_rate": 16000}}, + {"type": "PolarityInversion", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "Shift", "args": {"p": 0.8, "sample_rate": 16000}} + ], + "spectrogram": [ + {"type": "TimeMasking", "args": {"time_mask_param": 80, "p": 0.05}}, + {"type": "TimeMasking", "args": {"time_mask_param": 80, "p": 0.05}}, + {"type": "TimeMasking", "args": {"time_mask_param": 80, "p": 0.05}}, + {"type": "FrequencyMasking", "args": {"freq_mask_param": 80}} + ] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 6, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "train": { + "batch_size": 128, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-100", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-360", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-other-500", + "max_audio_length": 40.0, + "max_text_length": 400 + } + } + ] + }, + "val": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean" + } + } + ] + }, + "test-other": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + }, + "test-clean": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-clean" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 3e-4, + "weight_decay": 1e-5 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + 
} + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 4, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 4, + "name": "CER (beam search)" + } + }, + { + "type": "LanguageModelWERMetric", + "args": { + "name": "WER (LM)" + } + }, + { + "type": "LanguageModelCERMetric", + "args": { + "name": "CER (LM)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 3e-4, + "pct_start": 0.1 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/configs/config_clean.json b/automatic-speech-recognition/hw_asr/configs/config_clean.json new file mode 100644 index 0000000000000000000000000000000000000000..47416448a705ad259521dba01323a9fd64691355 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/configs/config_clean.json @@ -0,0 +1,184 @@ +{ + "name": "default_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0.6, + "wave": [ + {"type": "AddColoredNoise", "args": {"p": 1, "sample_rate": 16000}}, + {"type": "Gain", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "HighPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "LowPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "PitchShift", "args": {"p": 0.8, "min_transpose_semitones": -2, "max_transpose_semitones": 2, "sample_rate": 16000}}, + {"type": "PolarityInversion", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "Shift", "args": {"p": 0.8, "sample_rate": 16000}} + ], + "spectrogram": [] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 5, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "train": { + "batch_size": 128, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-100", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-360", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-other-500", + "max_audio_length": 40.0, + "max_text_length": 400 + } + } + ] + }, + "val": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean" + } + } + ] + }, + "test-other": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + }, + "test-clean": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-clean" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 5e-4, + "weight_decay": 1e-3 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + 
"type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 4, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 4, + "name": "CER (beam search)" + } + }, + { + "type": "LanguageModelWERMetric", + "args": { + "name": "WER (LM)" + } + }, + { + "type": "LanguageModelCERMetric", + "args": { + "name": "CER (LM)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 5e-4, + "pct_start": 0.1 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/configs/finetune.json b/automatic-speech-recognition/hw_asr/configs/finetune.json new file mode 100644 index 0000000000000000000000000000000000000000..e95bad1543fc464fc8fa89eb62ebc63cdd3d519a --- /dev/null +++ b/automatic-speech-recognition/hw_asr/configs/finetune.json @@ -0,0 +1,122 @@ +{ + "name": "default_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0, + "wave": [], + "spectrogram": [] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 6, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "train": { + "batch_size": 128, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "train-other-500", + "max_audio_length": 40.0, + "max_text_length": 400 + } + } + ] + }, + "val": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean" + } + } + ] + }, + "test-other": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 6e-5, + "weight_decay": 1e-5 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 10, + "anneal_strategy": "cos", + "max_lr": 6e-5, + "pct_start": 0.2 + } + }, + "trainer": { + "epochs": 10, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/configs/one_batch_test_baseline.json b/automatic-speech-recognition/hw_asr/configs/one_batch_test_baseline.json new file mode 100644 index 0000000000000000000000000000000000000000..c1d5502d2f43360646306f8faf9e2271f171ba97 --- /dev/null +++ 
b/automatic-speech-recognition/hw_asr/configs/one_batch_test_baseline.json @@ -0,0 +1,102 @@ +{ + "name": "one_batch_test", + "n_gpu": 1, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + } + }, + "log_spec": true + }, + "augmentations": { + "wave": [], + "spectrogram": [] + }, + "arch": { + "type": "BaselineModel", + "args": { + "n_feats": 128, + "fc_hidden": 512 + } + }, + "data": { + "train": { + "batch_size": 10, + "num_workers": 0, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean", + "max_audio_length": 20.0, + "max_text_length": 200, + "limit": 10 + } + } + ] + }, + "val": { + "batch_size": 10, + "num_workers": 0, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean", + "max_audio_length": 20.0, + "max_text_length": 200, + "limit": 10 + } + } + ] + } + }, + "optimizer": { + "type": "SGD", + "args": { + "lr": 1e-2 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 100, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 1e-2, + "pct_start": 0.2 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project_check", + "len_epoch": 100, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/configs/one_batch_test_deepspeech2.json b/automatic-speech-recognition/hw_asr/configs/one_batch_test_deepspeech2.json new file mode 100644 index 0000000000000000000000000000000000000000..e9757e654a9b23286fc3f301859e18e73431b629 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/configs/one_batch_test_deepspeech2.json @@ -0,0 +1,133 @@ +{ + "name": "one_batch_test", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 512 + } + }, + "log_spec": true + }, + "augmentations": { + "wave": [ + {"type": "AddColoredNoise", "args": {"p": 0.3, "sample_rate": 16000}}, + {"type": "Gain", "args": {"p": 0.4, "sample_rate": 16000}}, + {"type": "HighPassFilter", "args": {"p": 0.3, "sample_rate": 16000}}, + {"type": "LowPassFilter", "args": {"p": 0.3, "sample_rate": 16000}}, + {"type": "PitchShift", "args": {"p": 0.3, "sample_rate": 16000}}, + {"type": "PolarityInversion", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "Shift", "args": {"p": 0.2, "sample_rate": 16000}} + ], + "spectrogram": [] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 512, + "n_rnn_layers": 1, + "rnn_hidden_size": 256 + } + }, + "data": { + "train": { + "batch_size": 10, + "num_workers": 0, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean", + "max_audio_length": 20.0, + "max_text_length": 200, + "limit": 10 + } + } + ] + }, + "val": { + "batch_size": 10, + "num_workers": 0, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean", + "max_audio_length": 20.0, + "max_text_length": 200, + "limit": 10 + } + } + ] + } + }, + "optimizer": { + "type": "SGD", 
+ "args": { + "lr": 1e-2 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 2, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 2, + "name": "CER (beam search)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 100, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 1e-2, + "pct_start": 0.2 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project_check", + "len_epoch": 100, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/datasets/__init__.py b/automatic-speech-recognition/hw_asr/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5ca1615b8c16bc4eaca881ed7e78ca0ed356fc6d --- /dev/null +++ b/automatic-speech-recognition/hw_asr/datasets/__init__.py @@ -0,0 +1,13 @@ +from hw_asr.datasets.custom_audio_dataset import CustomAudioDataset +from hw_asr.datasets.custom_dir_audio_dataset import CustomDirAudioDataset +from hw_asr.datasets.librispeech_dataset import LibrispeechDataset +from hw_asr.datasets.ljspeech_dataset import LJspeechDataset +from hw_asr.datasets.common_voice import CommonVoiceDataset + +__all__ = [ + "LibrispeechDataset", + "CustomDirAudioDataset", + "CustomAudioDataset", + "LJspeechDataset", + "CommonVoiceDataset" +] diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13a3597ba6f1585bda059a8e6fdd6a3eca8987e6 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51bb1449d7ef07d06dc61c7bd70baa27f6a9b327 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/common_voice.cpython-310.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/common_voice.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f6cecbad86b3231fb332d08e1d424f79577eded Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/common_voice.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/common_voice.cpython-311.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/common_voice.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..664916dd2c7b9b06d740913d59b21dde0b8b5b7f Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/common_voice.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_audio_dataset.cpython-310.pyc 
b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_audio_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba296c016e5b91a2715eb57aaf1022095ad9be38 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_audio_dataset.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_audio_dataset.cpython-311.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_audio_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90fa083bca1abd0e782e41a5608a00b5b630139a Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_audio_dataset.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_dir_audio_dataset.cpython-310.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_dir_audio_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe184ea7cbbb222cceec074c92b428dad3185903 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_dir_audio_dataset.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_dir_audio_dataset.cpython-311.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_dir_audio_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1919b37b6eea6f68978d885abe497317d1680d52 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_dir_audio_dataset.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/librispeech_dataset.cpython-310.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/librispeech_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc6fcea417299a09694842271701a5e48d31fbd7 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/librispeech_dataset.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/librispeech_dataset.cpython-311.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/librispeech_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4edba4fb8ea5477a419c52ecee78181015cf8c0 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/librispeech_dataset.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/ljspeech_dataset.cpython-310.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/ljspeech_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..420383191298c1f31fe7f40264420c3f3ebf8436 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/ljspeech_dataset.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/ljspeech_dataset.cpython-311.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/ljspeech_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27895d59d639b4dccf50b9f61ff595592d9b3bd5 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/ljspeech_dataset.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/common_voice.py 
b/automatic-speech-recognition/hw_asr/datasets/common_voice.py new file mode 100644 index 0000000000000000000000000000000000000000..da5bb92231cfc0a72ff14287bae742b9a648b4df --- /dev/null +++ b/automatic-speech-recognition/hw_asr/datasets/common_voice.py @@ -0,0 +1,45 @@ +import logging +from pathlib import Path +import json + +import torchaudio +from datasets import load_dataset +import re +from tqdm import tqdm + +from hw_asr.base.base_dataset import BaseDataset +from hw_asr.utils import ROOT_PATH + +logger = logging.getLogger(__name__) + + +class CommonVoiceDataset(BaseDataset): + def __init__(self, split, *args, **kwargs): + self._data_dir = ROOT_PATH / "dataset_common_voice" + self._regex = re.compile("[^a-z ]") + self._dataset = load_dataset("common_voice", "en", cache_dir=self._data_dir, split=split) + index = self._get_or_load_index(split) + super().__init__(index, *args, **kwargs) + + def _get_or_load_index(self, split): + index_path = self._data_dir / f"{split}_index.json" + if index_path.exists(): + with index_path.open() as f: + index = json.load(f) + else: + index = [] + for entry in tqdm(self._dataset): + assert "path" in entry + assert Path(entry["path"]).exists(), f"Path {entry['path']} doesn't exist" + entry["path"] = str(Path(entry["path"]).absolute().resolve()) + entry["text"] = self._regex.sub("", entry.get("sentence", "").lower()) + t_info = torchaudio.info(entry["path"]) + entry["audio_len"] = t_info.num_frames / t_info.sample_rate + index.append({ + "path": entry["path"], + "text": entry["text"], + "audio_len": entry["audio_len"], + }) + with index_path.open("w") as f: + json.dump(index, f, indent=2) + return index diff --git a/automatic-speech-recognition/hw_asr/datasets/custom_audio_dataset.py b/automatic-speech-recognition/hw_asr/datasets/custom_audio_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..80b582c87eb74f0dd762a7f55d18c7aacd692d60 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/datasets/custom_audio_dataset.py @@ -0,0 +1,22 @@ +import logging +from pathlib import Path + +import torchaudio + +from hw_asr.base.base_dataset import BaseDataset + +logger = logging.getLogger(__name__) + + +class CustomAudioDataset(BaseDataset): + def __init__(self, data, *args, **kwargs): + index = data + for entry in data: + assert "path" in entry + assert Path(entry["path"]).exists(), f"Path {entry['path']} doesn't exist" + entry["path"] = str(Path(entry["path"]).absolute().resolve()) + entry["text"] = entry.get("text", "") + t_info = torchaudio.info(entry["path"]) + entry["audio_len"] = t_info.num_frames / t_info.sample_rate + + super().__init__(index, *args, **kwargs) diff --git a/automatic-speech-recognition/hw_asr/datasets/custom_dir_audio_dataset.py b/automatic-speech-recognition/hw_asr/datasets/custom_dir_audio_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..084efa07289c1c6e574d92e331c59e6e6aee02d0 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/datasets/custom_dir_audio_dataset.py @@ -0,0 +1,23 @@ +import logging +from pathlib import Path + +from hw_asr.datasets.custom_audio_dataset import CustomAudioDataset + +logger = logging.getLogger(__name__) + + +class CustomDirAudioDataset(CustomAudioDataset): + def __init__(self, audio_dir, transcription_dir=None, *args, **kwargs): + data = [] + for path in Path(audio_dir).iterdir(): + entry = {} + if path.suffix in [".mp3", ".wav", ".flac", ".m4a"]: + entry["path"] = str(path) + if transcription_dir and Path(transcription_dir).exists(): + transc_path 
= Path(transcription_dir) / (path.stem + '.txt') + if transc_path.exists(): + with transc_path.open() as f: + entry["text"] = f.read().strip() + if len(entry) > 0: + data.append(entry) + super().__init__(data, *args, **kwargs) diff --git a/automatic-speech-recognition/hw_asr/datasets/librispeech_dataset.py b/automatic-speech-recognition/hw_asr/datasets/librispeech_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..ee0bc051a07e681aa495f33706273592ce703e77 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/datasets/librispeech_dataset.py @@ -0,0 +1,93 @@ +import json +import logging +import os +import shutil +from pathlib import Path + +import torchaudio +from speechbrain.utils.data_utils import download_file +from tqdm import tqdm + +from hw_asr.base.base_dataset import BaseDataset +from hw_asr.utils import ROOT_PATH + +logger = logging.getLogger(__name__) + +URL_LINKS = { + "dev-clean": "https://www.openslr.org/resources/12/dev-clean.tar.gz", + "dev-other": "https://www.openslr.org/resources/12/dev-other.tar.gz", + "test-clean": "https://www.openslr.org/resources/12/test-clean.tar.gz", + "test-other": "https://www.openslr.org/resources/12/test-other.tar.gz", + "train-clean-100": "https://www.openslr.org/resources/12/train-clean-100.tar.gz", + "train-clean-360": "https://www.openslr.org/resources/12/train-clean-360.tar.gz", + "train-other-500": "https://www.openslr.org/resources/12/train-other-500.tar.gz", +} + + +class LibrispeechDataset(BaseDataset): + def __init__(self, part, data_dir=None, *args, **kwargs): + assert part in URL_LINKS or part == 'train_all' + + if data_dir is None: + data_dir = ROOT_PATH / "data" / "datasets" / "librispeech" + data_dir.mkdir(exist_ok=True, parents=True) + self._data_dir = data_dir + if part == 'train_all': + index = sum([self._get_or_load_index(part) + for part in URL_LINKS if 'train' in part], []) + else: + index = self._get_or_load_index(part) + + super().__init__(index, *args, **kwargs) + + def _load_part(self, part): + arch_path = self._data_dir / f"{part}.tar.gz" + print(f"Loading part {part}") + download_file(URL_LINKS[part], arch_path) + shutil.unpack_archive(arch_path, self._data_dir) + for fpath in (self._data_dir / "LibriSpeech").iterdir(): + shutil.move(str(fpath), str(self._data_dir / fpath.name)) + os.remove(str(arch_path)) + shutil.rmtree(str(self._data_dir / "LibriSpeech")) + + def _get_or_load_index(self, part): + index_path = self._data_dir / f"{part}_index.json" + if index_path.exists(): + with index_path.open() as f: + index = json.load(f) + else: + index = self._create_index(part) + with index_path.open("w") as f: + json.dump(index, f, indent=2) + return index + + def _create_index(self, part): + index = [] + split_dir = self._data_dir / part + if not split_dir.exists(): + self._load_part(part) + + flac_dirs = set() + for dirpath, dirnames, filenames in os.walk(str(split_dir)): + if any([f.endswith(".flac") for f in filenames]): + flac_dirs.add(dirpath) + for flac_dir in tqdm( + list(flac_dirs), desc=f"Preparing librispeech folders: {part}" + ): + flac_dir = Path(flac_dir) + trans_path = list(flac_dir.glob("*.trans.txt"))[0] + with trans_path.open() as f: + for line in f: + f_id = line.split()[0] + f_text = " ".join(line.split()[1:]).strip() + flac_path = flac_dir / f"{f_id}.flac" + t_info = torchaudio.info(str(flac_path)) + length = t_info.num_frames / t_info.sample_rate + index.append( + { + "path": str(flac_path.absolute().resolve()), + "text": f_text.lower(), + "audio_len": length, + } + ) + 
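# each index entry stores the absolute flac path, the lowercased transcription and the clip duration in seconds + 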
return index diff --git a/automatic-speech-recognition/hw_asr/datasets/ljspeech_dataset.py b/automatic-speech-recognition/hw_asr/datasets/ljspeech_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..adf9430a17e1e81a5e14cbf63f948a6257813b9f --- /dev/null +++ b/automatic-speech-recognition/hw_asr/datasets/ljspeech_dataset.py @@ -0,0 +1,96 @@ +import json +import logging +import os +import shutil +from curses.ascii import isascii +from pathlib import Path + +import torchaudio +from hw_asr.base.base_dataset import BaseDataset +from hw_asr.utils import ROOT_PATH +from speechbrain.utils.data_utils import download_file +from tqdm import tqdm + +logger = logging.getLogger(__name__) + +URL_LINKS = { + "dataset": "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", +} + + +class LJspeechDataset(BaseDataset): + def __init__(self, part, data_dir=None, *args, **kwargs): + if data_dir is None: + data_dir = ROOT_PATH / "data" / "datasets" / "ljspeech" + data_dir.mkdir(exist_ok=True, parents=True) + self._data_dir = data_dir + index = self._get_or_load_index(part) + + super().__init__(index, *args, **kwargs) + + def _load_dataset(self): + arch_path = self._data_dir / "LJSpeech-1.1.tar.bz2" + print(f"Loading LJSpeech") + download_file(URL_LINKS["dataset"], arch_path) + shutil.unpack_archive(arch_path, self._data_dir) + for fpath in (self._data_dir / "LJSpeech-1.1").iterdir(): + shutil.move(str(fpath), str(self._data_dir / fpath.name)) + os.remove(str(arch_path)) + shutil.rmtree(str(self._data_dir / "LJSpeech-1.1")) + + files = [file_name for file_name in (self._data_dir / "wavs").iterdir()] + train_length = int(0.85 * len(files)) # hand split, test ~ 15% + (self._data_dir / "train").mkdir(exist_ok=True, parents=True) + (self._data_dir / "test").mkdir(exist_ok=True, parents=True) + for i, fpath in enumerate((self._data_dir / "wavs").iterdir()): + if i < train_length: + shutil.move(str(fpath), str(self._data_dir / "train" / fpath.name)) + else: + shutil.move(str(fpath), str(self._data_dir / "test" / fpath.name)) + shutil.rmtree(str(self._data_dir / "wavs")) + + + def _get_or_load_index(self, part): + index_path = self._data_dir / f"{part}_index.json" + if index_path.exists(): + with index_path.open() as f: + index = json.load(f) + else: + index = self._create_index(part) + with index_path.open("w") as f: + json.dump(index, f, indent=2) + return index + + def _create_index(self, part): + index = [] + split_dir = self._data_dir / part + if not split_dir.exists(): + self._load_dataset() + + wav_dirs = set() + for dirpath, dirnames, filenames in os.walk(str(split_dir)): + if any([f.endswith(".wav") for f in filenames]): + wav_dirs.add(dirpath) + for wav_dir in tqdm( + list(wav_dirs), desc=f"Preparing ljspeech folders: {part}" + ): + wav_dir = Path(wav_dir) + trans_path = list(self._data_dir.glob("*.csv"))[0] + with trans_path.open() as f: + for line in f: + w_id = line.split('|')[0] + w_text = " ".join(line.split('|')[1:]).strip() + wav_path = wav_dir / f"{w_id}.wav" + if not wav_path.exists(): # elem in another part + continue + t_info = torchaudio.info(str(wav_path)) + length = t_info.num_frames / t_info.sample_rate + if w_text.isascii(): + index.append( + { + "path": str(wav_path.absolute().resolve()), + "text": w_text.lower(), + "audio_len": length, + } + ) + return index diff --git a/automatic-speech-recognition/hw_asr/logger/__init__.py b/automatic-speech-recognition/hw_asr/logger/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..549cc5b8b1b79ac0e083dde39a2e9bc3d8ae97d4 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/__init__.py @@ -0,0 +1,2 @@ +from .logger import * +from .visualization import * diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb208e512ad7131ebe787418501770e6c245884c Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a05233729be1ab66ad96e9f42763d53d8ecaedda Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/logger.cpython-310.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/logger.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28d4e85d512960a484b7bf6f24d0d07bf6d49b6e Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/logger.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/logger.cpython-311.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/logger.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad9c8fb2ed7c9590174fc8a653385aec2ba38d55 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/logger.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/tensorboard.cpython-310.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/tensorboard.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0973e3b4403caa591bd4db6fb42dcdac2aca91b Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/tensorboard.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/tensorboard.cpython-311.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/tensorboard.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7d76b69359863049ae6dd96dbc872b877ad0122 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/tensorboard.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/utils.cpython-310.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58cdabae72d30b9cf7092f170cfff0bfdde100a9 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/utils.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/utils.cpython-311.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e788e0886a9b20c46dfa58c04f99c66c2f72bd0 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/utils.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/visualization.cpython-310.pyc 
b/automatic-speech-recognition/hw_asr/logger/__pycache__/visualization.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eba8207829c73a7f46182180c9cc2aa2d44b4ffa Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/visualization.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/visualization.cpython-311.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/visualization.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f61661bf61425c290796c127ac31ce9dbec722c Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/visualization.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/wandb.cpython-310.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/wandb.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f0969d440dda7f1e2b494c42db9036a1ec9891c Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/wandb.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/wandb.cpython-311.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/wandb.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08d07b23511995a42b446ae4db99e5ed69d88c65 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/wandb.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/logger.py b/automatic-speech-recognition/hw_asr/logger/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..73759e3f6a7b0a1421768adbd5c66b3aaf2374e4 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/logger.py @@ -0,0 +1,29 @@ +import logging +import logging.config +from pathlib import Path + +from hw_asr.utils import read_json, ROOT_PATH + + +def setup_logging( + save_dir, log_config=None, default_level=logging.INFO +): + """ + Setup logging configuration + """ + if log_config is None: + log_config = str(ROOT_PATH / "hw_asr" / "logger" / "logger_config.json") + log_config = Path(log_config) + if log_config.is_file(): + config = read_json(log_config) + # modify logging paths based on run config + for _, handler in config["handlers"].items(): + if "filename" in handler: + handler["filename"] = str(save_dir / handler["filename"]) + + logging.config.dictConfig(config) + else: + print( + "Warning: logging configuration file is not found in {}.".format(log_config) + ) + logging.basicConfig(level=default_level) diff --git a/automatic-speech-recognition/hw_asr/logger/logger_config.json b/automatic-speech-recognition/hw_asr/logger/logger_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2bfebbf7b373025dfe2030d1a7933dbabb2c9a2f --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/logger_config.json @@ -0,0 +1,36 @@ +{ + "version": 1, + "disable_existing_loggers": false, + "formatters": { + "simple": { + "format": "%(message)s" + }, + "datetime": { + "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + } + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "level": "DEBUG", + "formatter": "simple", + "stream": "ext://sys.stdout" + }, + "info_file_handler": { + "class": "logging.handlers.RotatingFileHandler", + "level": "INFO", + "formatter": "datetime", + "filename": "info.log", + "maxBytes": 10485760, + "backupCount": 20, + "encoding": "utf8" 
+ } + }, + "root": { + "level": "INFO", + "handlers": [ + "console", + "info_file_handler" + ] + } +} \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/logger/tensorboard.py b/automatic-speech-recognition/hw_asr/logger/tensorboard.py new file mode 100644 index 0000000000000000000000000000000000000000..d8d52e0648ddfc580a074db1fd1ec600523242a5 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/tensorboard.py @@ -0,0 +1,88 @@ +import importlib +from datetime import datetime + + +class TensorboardWriter: + def __init__(self, log_dir, logger, enabled): + self.writer = None + self.selected_module = "" + + if enabled: + log_dir = str(log_dir) + + # Retrieve vizualization writer. + succeeded = False + for module in ["torch.utils.tensorboard", "tensorboardX"]: + try: + self.writer = importlib.import_module(module).SummaryWriter(log_dir) + succeeded = True + break + except ImportError: + succeeded = False + self.selected_module = module + + if not succeeded: + message = ( + "Warning: visualization (Tensorboard) is configured to use, but currently not " + "installed on this machine. Please install TensorboardX with " + "'pip install tensorboardx', upgrade PyTorch to version >= 1.1 to use " + "'torch.utils.tensorboard' or turn off the option in the 'config.json' file." + ) + logger.warning(message) + + self.step = 0 + self.mode = "" + + self.tb_writer_ftns = { + "add_scalar", + "add_scalars", + "add_image", + "add_images", + "add_audio", + "add_text", + "add_histogram", + "add_pr_curve", + "add_embedding", + } + self.tag_mode_exceptions = {"add_histogram", "add_embedding"} + self.timer = datetime.now() + + def set_step(self, step, mode="train"): + self.mode = mode + self.step = step + if step == 0: + self.timer = datetime.now() + else: + duration = datetime.now() - self.timer + self.add_scalar("steps_per_sec", 1 / duration.total_seconds()) + self.timer = datetime.now() + + def __getattr__(self, name): + """ + If visualization is configured to use: + return add_data() methods of tensorboard with additional information (step, tag) added. + Otherwise: + return a blank function handle that does nothing + """ + if name in self.tb_writer_ftns: + add_data = getattr(self.writer, name, None) + + def wrapper(tag, data, *args, **kwargs): + if add_data is not None: + # add mode(train/valid) tag + if name not in self.tag_mode_exceptions: + tag = "{}/{}".format(tag, self.mode) + add_data(tag, data, self.step, *args, **kwargs) + + return wrapper + else: + # default action for returning methods defined in this class, set_step() for instance. 
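+ # this fallback is reached only when normal attribute lookup has already failed, so it ends in an AttributeError naming the selected writer module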
+ try: + attr = object.__getattr__(name) + except AttributeError: + raise AttributeError( + "type object '{}' has no attribute '{}'".format( + self.selected_module, name + ) + ) + return attr diff --git a/automatic-speech-recognition/hw_asr/logger/utils.py b/automatic-speech-recognition/hw_asr/logger/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bee693ea21562490d20a9c6420e7529329354b1a --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/utils.py @@ -0,0 +1,13 @@ +import io + +import matplotlib.pyplot as plt + + +def plot_spectrogram_to_buf(spectrogram_tensor, name=None): + plt.figure(figsize=(20, 5)) + plt.imshow(spectrogram_tensor) + plt.title(name) + buf = io.BytesIO() + plt.savefig(buf, format='png') + buf.seek(0) + return buf diff --git a/automatic-speech-recognition/hw_asr/logger/visualization.py b/automatic-speech-recognition/hw_asr/logger/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..eab6df31f9f96c95681f6172f2e31a43fb3b410b --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/visualization.py @@ -0,0 +1,19 @@ +from enum import Enum + +from .tensorboard import TensorboardWriter +from .wandb import WanDBWriter + + +class VisualizerBackendType(str, Enum): + tensorboard = "tensorboard" + wandb = "wandb" + + +def get_visualizer(config, logger, backend: VisualizerBackendType): + if backend == VisualizerBackendType.tensorboard: + return TensorboardWriter(config.log_dir, logger, True) + + if backend == VisualizerBackendType.wandb: + return WanDBWriter(config, logger) + + return None diff --git a/automatic-speech-recognition/hw_asr/logger/wandb.py b/automatic-speech-recognition/hw_asr/logger/wandb.py new file mode 100644 index 0000000000000000000000000000000000000000..56894cd6236ddf6973edb84ba0fc3ad94247f318 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/wandb.py @@ -0,0 +1,98 @@ +from datetime import datetime + +import numpy as np +import pandas as pd +import wandb + + +class WanDBWriter: + def __init__(self, config, logger): + self.writer = None + self.selected_module = "" + + try: + import wandb + wandb.login() + + if config['trainer'].get('wandb_project') is None: + raise ValueError("please specify project name for wandb") + + wandb.init( + project=config['trainer'].get('wandb_project'), + config=config.config + ) + self.wandb = wandb + + except ImportError: + logger.warning("For use wandb install it via \n\t pip install wandb") + + self.step = 0 + self.mode = "" + self.timer = datetime.now() + + def set_step(self, step, mode="train"): + self.mode = mode + self.step = step + if step == 0: + self.timer = datetime.now() + else: + duration = datetime.now() - self.timer + self.add_scalar("steps_per_sec", 1 / duration.total_seconds()) + self.timer = datetime.now() + + def _scalar_name(self, scalar_name): + return f"{scalar_name}_{self.mode}" + + def add_scalar(self, scalar_name, scalar): + self.wandb.log({ + self._scalar_name(scalar_name): scalar, + }, step=self.step) + + def add_scalars(self, tag, scalars): + self.wandb.log({ + **{f"{scalar_name}_{tag}_{self.mode}": scalar for scalar_name, scalar in + scalars.items()} + }, step=self.step) + + def add_image(self, scalar_name, image): + self.wandb.log({ + self._scalar_name(scalar_name): self.wandb.Image(image) + }, step=self.step) + + def add_audio(self, scalar_name, audio, sample_rate=None): + audio = audio.detach().cpu().numpy().T + self.wandb.log({ + self._scalar_name(scalar_name): self.wandb.Audio(audio, sample_rate=sample_rate) + 
}, step=self.step) + + def add_text(self, scalar_name, text): + self.wandb.log({ + self._scalar_name(scalar_name): self.wandb.Html(text) + }, step=self.step) + + def add_histogram(self, scalar_name, hist, bins=None): + hist = hist.detach().cpu().numpy() + np_hist = np.histogram(hist, bins=bins) + if np_hist[0].shape[0] > 512: + np_hist = np.histogram(hist, bins=512) + + hist = self.wandb.Histogram( + np_histogram=np_hist + ) + + self.wandb.log({ + self._scalar_name(scalar_name): hist + }, step=self.step) + + def add_table(self, table_name, table: pd.DataFrame): + self.wandb.log({self._scalar_name(table_name): wandb.Table(dataframe=table)}, + step=self.step) + + def add_images(self, scalar_name, images): + raise NotImplementedError() + + def add_pr_curve(self, scalar_name, scalar): + raise NotImplementedError() + + def add_embedding(self, scalar_name, scalar): + raise NotImplementedError() diff --git a/automatic-speech-recognition/hw_asr/loss/CTCLossWrapper.py b/automatic-speech-recognition/hw_asr/loss/CTCLossWrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..a57dd573cb3a72b1835d5868f7cda12f9d075d17 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/loss/CTCLossWrapper.py @@ -0,0 +1,20 @@ +import torch +from torch import Tensor +from torch.nn import CTCLoss + + +class CTCLossWrapper(CTCLoss): + + def __init__(self): + super().__init__(zero_infinity=True) + + def forward(self, log_probs, log_probs_length, text_encoded, text_encoded_length, + **batch) -> Tensor: + log_probs_t = torch.transpose(log_probs, 0, 1) + + return super().forward( + log_probs=log_probs_t, + targets=text_encoded, + input_lengths=log_probs_length, + target_lengths=text_encoded_length, + ) diff --git a/automatic-speech-recognition/hw_asr/loss/__init__.py b/automatic-speech-recognition/hw_asr/loss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8968fea952bb13d7e1b6177f40a71d363307681d --- /dev/null +++ b/automatic-speech-recognition/hw_asr/loss/__init__.py @@ -0,0 +1,5 @@ +from hw_asr.loss.CTCLossWrapper import CTCLossWrapper as CTCLoss + +__all__ = [ + "CTCLoss" +] diff --git a/automatic-speech-recognition/hw_asr/loss/__pycache__/CTCLossWrapper.cpython-310.pyc b/automatic-speech-recognition/hw_asr/loss/__pycache__/CTCLossWrapper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91ccce2b7c95f54fe4aa3b850ebc7d8a69e6af73 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/loss/__pycache__/CTCLossWrapper.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/loss/__pycache__/CTCLossWrapper.cpython-311.pyc b/automatic-speech-recognition/hw_asr/loss/__pycache__/CTCLossWrapper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3376b136391e850fc2b3b867a9baf8338ebf996f Binary files /dev/null and b/automatic-speech-recognition/hw_asr/loss/__pycache__/CTCLossWrapper.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/loss/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/loss/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a5521589d921df6e36c3475f2b6326e9ce0b093 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/loss/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/loss/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/loss/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..826e6f0d2b1d8977f745c179c5ad98ba89b808ab Binary files /dev/null and b/automatic-speech-recognition/hw_asr/loss/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__init__.py b/automatic-speech-recognition/hw_asr/metric/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..80b677c33c5e549c030dedabe582eadd70feac90 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/metric/__init__.py @@ -0,0 +1,11 @@ +from hw_asr.metric.cer_metric import ArgmaxCERMetric, BeamSearchCERMetric, LanguageModelCERMetric +from hw_asr.metric.wer_metric import ArgmaxWERMetric, BeamSearchWERMetric, LanguageModelWERMetric + +__all__ = [ + "ArgmaxCERMetric", + "BeamSearchCERMetric", + "LanguageModelCERMetric", + "ArgmaxWERMetric", + "BeamSearchWERMetric", + "LanguageModelWERMetric" +] diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c7c0bba4321305de6648145cdf953eabe997137 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d81781c2a6501997ee8fdb38f9260d225f0f0a0d Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/cer_metric.cpython-310.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/cer_metric.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b998cbef6ef1fe735cb4fb9f401b0ad3ebf91502 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/cer_metric.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/cer_metric.cpython-311.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/cer_metric.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43cdcc20cb50ada9a2d460dbce7afcda26c06395 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/cer_metric.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/utils.cpython-310.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..663cbcf684425cee893ac1c9ff3cb270c4ef6bd2 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/utils.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/utils.cpython-311.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fb94057757cfdcbba8512a3c68f82fe9616d002 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/utils.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/wer_metric.cpython-310.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/wer_metric.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d2de7f56bdc75ecc25e87e918781e4305cf509ef Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/wer_metric.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/wer_metric.cpython-311.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/wer_metric.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f9901c140b702440645ef5ec03b11728e3ede18 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/wer_metric.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/cer_metric.py b/automatic-speech-recognition/hw_asr/metric/cer_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..54559a3dad1daf31d292ac75bf7791d522fbe078 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/metric/cer_metric.py @@ -0,0 +1,68 @@ +from typing import List + +import numpy as np +import torch +from torch import Tensor + +from hw_asr.base.base_metric import BaseMetric +from hw_asr.base.base_text_encoder import BaseTextEncoder +from hw_asr.metric.utils import calc_cer + + +class ArgmaxCERMetric(BaseMetric): + def __init__(self, text_encoder: BaseTextEncoder, *args, **kwargs): + super().__init__(*args, **kwargs) + self.text_encoder = text_encoder + + def __call__(self, log_probs: Tensor, log_probs_length: Tensor, text: List[str], **kwargs): + cers = [] + predictions = torch.argmax(log_probs.cpu(), dim=-1).numpy() + lengths = log_probs_length.detach().numpy() + for log_prob_vec, length, target_text in zip(predictions, lengths, text): + target_text = BaseTextEncoder.normalize_text(target_text) + if hasattr(self.text_encoder, "ctc_decode"): + pred_text = self.text_encoder.ctc_decode(log_prob_vec[:length]) + else: + pred_text = self.text_encoder.decode(log_prob_vec[:length]) + cers.append(calc_cer(target_text, pred_text)) + return sum(cers) / len(cers) + + +class BeamSearchCERMetric(BaseMetric): + def __init__(self, text_encoder: BaseTextEncoder, beam_size: int, *args, **kwargs): + super().__init__(*args, **kwargs) + self.text_encoder = text_encoder + self.beam_size = beam_size + + def __call__(self, log_probs: Tensor, log_probs_length: Tensor, text: List[str], **kwargs): + cers = [] + probs = np.exp(log_probs.detach().cpu().numpy()) + lengths = log_probs_length.detach().numpy() + for prob, length, target_text in zip(probs, lengths, text): + target_text = BaseTextEncoder.normalize_text(target_text) + if hasattr(self.text_encoder, "ctc_beam_search"): + pred_text = self.text_encoder.ctc_beam_search(prob[:length], self.beam_size) + else: + assert False + cers.append(calc_cer(target_text, pred_text)) + return sum(cers) / len(cers) + + +class LanguageModelCERMetric(BaseMetric): + def __init__(self, text_encoder: BaseTextEncoder, *args, **kwargs): + super().__init__(*args, **kwargs) + self.text_encoder = text_encoder + + def __call__(self, logits: Tensor, log_probs_length: Tensor, text: List[str], **kwargs): + cers = [] + logits = logits.detach().cpu().numpy() + lengths = log_probs_length.detach().numpy() + for logit, length, target_text in zip(logits, lengths, text): + target_text = BaseTextEncoder.normalize_text(target_text) + if hasattr(self.text_encoder, "ctc_lm_beam_search"): + pred_text = self.text_encoder.ctc_lm_beam_search(logit[:length]) + else: + assert False + cers.append(calc_cer(target_text, pred_text)) + return sum(cers) / len(cers) + diff --git 
a/automatic-speech-recognition/hw_asr/metric/utils.py b/automatic-speech-recognition/hw_asr/metric/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e38484975cef2424e8a19f1680cee029e945ae61 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/metric/utils.py @@ -0,0 +1,18 @@ +import editdistance +# Don't forget to support cases when target_text == '' + + +def calc_cer(target_text, predicted_text) -> float: + # TODO: your code here + return editdistance.eval(target_text, predicted_text) / len(target_text) + + +def calc_wer(target_text, predicted_text) -> float: + # TODO: your code here + if not target_text: + if predicted_text: + return 1 + return 0 + target_text_splitted = target_text.split(' ') + predicted_text_splitted = predicted_text.split(' ') + return editdistance.eval(target_text_splitted, predicted_text_splitted) / len(target_text_splitted) diff --git a/automatic-speech-recognition/hw_asr/metric/wer_metric.py b/automatic-speech-recognition/hw_asr/metric/wer_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..2bc03ce0b867e9342b835687fc0279f2a4d12aa3 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/metric/wer_metric.py @@ -0,0 +1,67 @@ +from typing import List + +import numpy as np +import torch +from torch import Tensor + +from hw_asr.base.base_metric import BaseMetric +from hw_asr.base.base_text_encoder import BaseTextEncoder +from hw_asr.metric.utils import calc_wer + + +class ArgmaxWERMetric(BaseMetric): + def __init__(self, text_encoder: BaseTextEncoder, *args, **kwargs): + super().__init__(*args, **kwargs) + self.text_encoder = text_encoder + + def __call__(self, log_probs: Tensor, log_probs_length: Tensor, text: List[str], **kwargs): + wers = [] + predictions = torch.argmax(log_probs.cpu(), dim=-1).numpy() + lengths = log_probs_length.detach().numpy() + for log_prob_vec, length, target_text in zip(predictions, lengths, text): + target_text = BaseTextEncoder.normalize_text(target_text) + if hasattr(self.text_encoder, "ctc_decode"): + pred_text = self.text_encoder.ctc_decode(log_prob_vec[:length]) + else: + pred_text = self.text_encoder.decode(log_prob_vec[:length]) + wers.append(calc_wer(target_text, pred_text)) + return sum(wers) / len(wers) + + +class BeamSearchWERMetric(BaseMetric): + def __init__(self, text_encoder: BaseTextEncoder, beam_size: int, *args, **kwargs): + super().__init__(*args, **kwargs) + self.text_encoder = text_encoder + self.beam_size = beam_size + + def __call__(self, log_probs: Tensor, log_probs_length: Tensor, text: List[str], **kwargs): + wers = [] + probs = np.exp(log_probs.detach().cpu().numpy()) + lengths = log_probs_length.detach().numpy() + for prob, length, target_text in zip(probs, lengths, text): + target_text = BaseTextEncoder.normalize_text(target_text) + if hasattr(self.text_encoder, "ctc_beam_search"): + pred_text = self.text_encoder.ctc_beam_search(prob[:length], self.beam_size) + else: + assert False + wers.append(calc_wer(target_text, pred_text)) + return sum(wers) / len(wers) + + +class LanguageModelWERMetric(BaseMetric): + def __init__(self, text_encoder: BaseTextEncoder, *args, **kwargs): + super().__init__(*args, **kwargs) + self.text_encoder = text_encoder + + def __call__(self, logits: Tensor, log_probs_length: Tensor, text: List[str], **kwargs): + wers = [] + logits = logits.detach().cpu().numpy() + lengths = log_probs_length.detach().numpy() + for logit, length, target_text in zip(logits, lengths, text): + target_text = BaseTextEncoder.normalize_text(target_text) + 
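+            # the LM-rescored decoding below assumes an encoder exposing ctc_lm_beam_search,
+            # e.g. a CTCCharTextEncoder constructed with a KenLM model path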
if hasattr(self.text_encoder, "ctc_lm_beam_search"): + pred_text = self.text_encoder.ctc_lm_beam_search(logit[:length]) + else: + assert False + wers.append(calc_wer(target_text, pred_text)) + return sum(wers) / len(wers) diff --git a/automatic-speech-recognition/hw_asr/model/__init__.py b/automatic-speech-recognition/hw_asr/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c4f0944e4bad49606bed457888ea8cc5001756e9 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/model/__init__.py @@ -0,0 +1,7 @@ +from hw_asr.model.baseline_model import BaselineModel +from hw_asr.model.deepspeech2_model import DeepSpeech2Model + +__all__ = [ + "BaselineModel", + "DeepSpeech2Model", +] diff --git a/automatic-speech-recognition/hw_asr/model/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/model/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b7e11462c91ca31d516b2d0713cbe93f7169df6 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/model/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/model/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/model/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32df7c0ed01c13feffa4418603a6e6332e44f112 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/model/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/model/__pycache__/baseline_model.cpython-310.pyc b/automatic-speech-recognition/hw_asr/model/__pycache__/baseline_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1b39436e727354077f6d7fe6be3513d1df086b5 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/model/__pycache__/baseline_model.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/model/__pycache__/baseline_model.cpython-311.pyc b/automatic-speech-recognition/hw_asr/model/__pycache__/baseline_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fb1054c2322a4b0d0f517525cb90ec0b252b824 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/model/__pycache__/baseline_model.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/model/__pycache__/deepspeech2_model.cpython-310.pyc b/automatic-speech-recognition/hw_asr/model/__pycache__/deepspeech2_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3938bc9b0d8cef8a17cb375aa0eaa4c1d8def54 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/model/__pycache__/deepspeech2_model.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/model/__pycache__/deepspeech2_model.cpython-311.pyc b/automatic-speech-recognition/hw_asr/model/__pycache__/deepspeech2_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5d84b7ca5b16309977560be17d96d06e70ede51 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/model/__pycache__/deepspeech2_model.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/model/baseline_model.py b/automatic-speech-recognition/hw_asr/model/baseline_model.py new file mode 100644 index 0000000000000000000000000000000000000000..1d29513ecaadb7aaefddbf89a1852fba3bbe2bdb --- /dev/null +++ b/automatic-speech-recognition/hw_asr/model/baseline_model.py @@ -0,0 +1,24 @@ +from torch 
import nn +from torch.nn import Sequential + +from hw_asr.base import BaseModel + + +class BaselineModel(BaseModel): + def __init__(self, n_feats, n_class, fc_hidden=512, **batch): + super().__init__(n_feats, n_class, **batch) + self.net = Sequential( + # people say it can aproximate any function... + nn.Linear(in_features=n_feats, out_features=fc_hidden), + nn.ReLU(), + nn.Linear(in_features=fc_hidden, out_features=fc_hidden), + nn.ReLU(), + nn.Linear(in_features=fc_hidden, out_features=n_class) + ) + + def forward(self, spectrogram, **batch): + print(self.net(spectrogram.transpose(1, 2)).shape) + return {"logits": self.net(spectrogram.transpose(1, 2))} + + def transform_input_lengths(self, input_lengths): + return input_lengths # we don't reduce time dimension here diff --git a/automatic-speech-recognition/hw_asr/model/deepspeech2_model.py b/automatic-speech-recognition/hw_asr/model/deepspeech2_model.py new file mode 100644 index 0000000000000000000000000000000000000000..605681fc8223df1e9c0dda5982174a02a3877efb --- /dev/null +++ b/automatic-speech-recognition/hw_asr/model/deepspeech2_model.py @@ -0,0 +1,84 @@ +import torch +from torch import nn + +from hw_asr.base import BaseModel + + +class RNNwBatchNorm(nn.Module): + def __init__(self, input_size, hidden_size, rnn_dropout): + super().__init__() + self.rnn = nn.GRU(input_size, hidden_size, dropout=rnn_dropout, batch_first=False, bidirectional=True) + self.norm = nn.BatchNorm1d(hidden_size) + + def forward(self, x, h=None): + # N x T x input_size + x, h = self.rnn(x, h) + # T x N x (2 * hidden_size) + x = x.view(x.shape[0], x.shape[1], 2, -1).sum(2) + # T x N x hidden_size + t_dim, n_dim = x.shape[0], x.shape[1] + x = x.view((t_dim * n_dim, -1)) + x = self.norm(x) + x = x.view((t_dim, n_dim, -1)).contiguous() + return x, h + + +# https://proceedings.mlr.press/v48/amodei16.pdf +class DeepSpeech2Model(BaseModel): + def __init__(self, n_feats, n_rnn_layers, rnn_hidden_size, rnn_dropout, n_class): + assert n_rnn_layers >= 1 + super().__init__(n_feats, n_class) + + self.conv = nn.Sequential( + nn.Conv2d(in_channels=1, out_channels=32, padding=(20, 5), kernel_size=(41, 11), stride=(2, 2)), + nn.BatchNorm2d(32), + nn.ReLU(), + + nn.Conv2d(in_channels=32, out_channels=32, padding=(10, 5), kernel_size=(21, 11), stride=(2, 2)), + nn.BatchNorm2d(32), + nn.ReLU(), + + nn.Conv2d(in_channels=32, out_channels=96, padding=(10, 5), kernel_size=(21, 11), stride=(2, 1)), + nn.BatchNorm2d(96), + nn.ReLU(), + ) + + rnn_input_size = (n_feats + 2 * 20 - 41) // 2 + 1 + rnn_input_size = (rnn_input_size + 2 * 10 - 21) // 2 + 1 + rnn_input_size = (rnn_input_size + 2 * 10 - 21) // 2 + 1 + rnn_input_size *= 96 + self.rnns = nn.Sequential( + RNNwBatchNorm(rnn_input_size, rnn_hidden_size, rnn_dropout), + *(RNNwBatchNorm(rnn_hidden_size, rnn_hidden_size, rnn_dropout) for _ in range(n_rnn_layers - 1)) + ) + + self.fc = nn.Linear(in_features=rnn_hidden_size, out_features=n_class) + self.softmax = nn.Softmax(dim=2) + + def forward(self, spectrogram, spectrogram_length, **batch): + # N x big_F x big_T + x = self.conv(spectrogram.unsqueeze(1)) + # N x C x F x T + x = x.view(x.shape[0], x.shape[1] * x.shape[2], x.shape[3]) + # N x (C * F) x T + x = x.transpose(1, 2).transpose(0, 1).contiguous() + # T x N x (C * F) + h = None + for rnn in self.rnns: + x, h = rnn(x, h) + # T x N x rnn_hidden_size + t_dim, n_dim = x.shape[0], x.shape[1] + x = x.view((t_dim * n_dim, -1)) + x = self.fc(x) + x = x.view((t_dim, n_dim, -1)).transpose(0, 1) + # N x T x n_class + return {"logits": x} 
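+    # transform_input_lengths below mirrors the time-axis arithmetic of the three convolutions
+    # above (kernel 11, padding 5, strides 2, 2, 1) and returns the same max-based downsampled
+    # length for every item in the batch. The analogous frequency-axis arithmetic produces
+    # rnn_input_size: e.g. with n_feats = 256 the frequency dimension shrinks 256 -> 128 -> 64 -> 32,
+    # so rnn_input_size = 32 * 96 = 3072.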
+ + def transform_input_lengths(self, input_lengths): + t_dim = input_lengths.max() + + t_dim = (t_dim + 2 * 5 - 11) // 2 + 1 + t_dim = (t_dim + 2 * 5 - 11) // 2 + 1 + t_dim = (t_dim + 2 * 5 - 11) + 1 + + return torch.zeros_like(input_lengths).fill_(t_dim) \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/tests/__init__.py b/automatic-speech-recognition/hw_asr/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/automatic-speech-recognition/hw_asr/tests/config.json b/automatic-speech-recognition/hw_asr/tests/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8491ebc8ea3448f5a59b5cd17d76df23eefb3ecf --- /dev/null +++ b/automatic-speech-recognition/hw_asr/tests/config.json @@ -0,0 +1,100 @@ +{ + "name": "test_config", + "n_gpu": 1, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + } + }, + "log_spec": true + }, + "augmentations": { + "wave": [], + "spectrogram": [] + }, + "arch": { + "type": "BaselineModel", + "args": { + "n_feats": 128, + "fc_hidden": 512 + } + }, + "data": { + "train": { + "batch_size": 20, + "num_workers": 0, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean", + "max_audio_length": 20.0, + "max_text_length": 200 + } + } + ] + }, + "val": { + "batch_size": 20, + "num_workers": 0, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean", + "max_audio_length": 20.0, + "max_text_length": 200 + } + } + ] + } + }, + "optimizer": { + "type": "SGD", + "args": { + "lr": 3e-4 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 100, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 4e-3, + "pct_start": 0.2 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 100, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/tests/test_config.py b/automatic-speech-recognition/hw_asr/tests/test_config.py new file mode 100644 index 0000000000000000000000000000000000000000..502cac4036c5dd9568c44cfab4a5e0f89dc9510b --- /dev/null +++ b/automatic-speech-recognition/hw_asr/tests/test_config.py @@ -0,0 +1,12 @@ +import json +import unittest + +from hw_asr.tests.utils import clear_log_folder_after_use +from hw_asr.utils.parse_config import ConfigParser + + +class TestConfig(unittest.TestCase): + def test_create(self): + config_parser = ConfigParser.get_test_configs() + with clear_log_folder_after_use(config_parser): + json.dumps(config_parser.config, indent=2) diff --git a/automatic-speech-recognition/hw_asr/tests/test_dataloader.py b/automatic-speech-recognition/hw_asr/tests/test_dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..b71f5d18e3011389d2704d810226afd22b135543 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/tests/test_dataloader.py @@ -0,0 +1,55 @@ +import unittest + +from tqdm import tqdm + +from hw_asr.collate_fn.collate import collate_fn +from hw_asr.datasets import LibrispeechDataset +from hw_asr.tests.utils import 
clear_log_folder_after_use +from hw_asr.utils.object_loading import get_dataloaders +from hw_asr.utils.parse_config import ConfigParser + + +class TestDataloader(unittest.TestCase): + def test_collate_fn(self): + config_parser = ConfigParser.get_test_configs() + with clear_log_folder_after_use(config_parser): + ds = LibrispeechDataset( + "dev-clean", text_encoder=config_parser.get_text_encoder(), + config_parser=config_parser + ) + + batch_size = 3 + batch = collate_fn([ds[i] for i in range(batch_size)]) + + self.assertIn("spectrogram", batch) # torch.tensor + batch_size_dim, feature_length_dim, time_dim = batch["spectrogram"].shape + self.assertEqual(batch_size_dim, batch_size) + self.assertEqual(feature_length_dim, 128) + + self.assertIn("text_encoded", batch) # [int] torch.tensor + # joined and padded indexes representation of transcriptions + batch_size_dim, text_length_dim = batch["text_encoded"].shape + self.assertEqual(batch_size_dim, batch_size) + + self.assertIn("text_encoded_length", batch) # [int] torch.tensor + # contains lengths of each text entry + self.assertEqual(len(batch["text_encoded_length"].shape), 1) + batch_size_dim = batch["text_encoded_length"].shape[0] + self.assertEqual(batch_size_dim, batch_size) + + self.assertIn("text", batch) # List[str] + # simple list of initial normalized texts + batch_size_dim = len(batch["text"]) + self.assertEqual(batch_size_dim, batch_size) + + def test_dataloaders(self): + _TOTAL_ITERATIONS = 10 + config_parser = ConfigParser.get_test_configs() + with clear_log_folder_after_use(config_parser): + dataloaders = get_dataloaders(config_parser, config_parser.get_text_encoder()) + for part in ["train", "val"]: + dl = dataloaders[part] + for i, batch in tqdm(enumerate(iter(dl)), total=_TOTAL_ITERATIONS, + desc=f"Iterating over {part}"): + if i >= _TOTAL_ITERATIONS: + break diff --git a/automatic-speech-recognition/hw_asr/tests/test_datasets.py b/automatic-speech-recognition/hw_asr/tests/test_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..c1069274054723c840db33c407391e420828ec95 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/tests/test_datasets.py @@ -0,0 +1,100 @@ +import unittest + +import torch + +from hw_asr.datasets import LibrispeechDataset, CustomDirAudioDataset, CustomAudioDataset +from hw_asr.tests.utils import clear_log_folder_after_use +from hw_asr.text_encoder.ctc_char_text_encoder import CTCCharTextEncoder +from hw_asr.utils import ROOT_PATH +from hw_asr.utils.parse_config import ConfigParser + + +class TestDataset(unittest.TestCase): + def test_librispeech(self): + config_parser = ConfigParser.get_test_configs() + with clear_log_folder_after_use(config_parser): + ds = LibrispeechDataset( + "dev-clean", + text_encoder=config_parser.get_text_encoder(), + config_parser=config_parser, + max_text_length=140, + max_audio_length=13, + limit=10, + ) + self._assert_training_example_is_good(ds[0]) + + def test_custom_dir_dataset(self): + config_parser = ConfigParser.get_test_configs() + with clear_log_folder_after_use(config_parser): + audio_dir = str(ROOT_PATH / "test_data" / "audio") + transc_dir = str(ROOT_PATH / "test_data" / "transcriptions") + + ds = CustomDirAudioDataset( + audio_dir, + transc_dir, + text_encoder=config_parser.get_text_encoder(), + config_parser=config_parser, + limit=10, + max_audio_length=8, + max_text_length=130, + ) + self._assert_training_example_is_good(ds[0]) + + def test_custom_dataset(self): + config_parser = ConfigParser.get_test_configs() + with 
clear_log_folder_after_use(config_parser): + audio_path = ROOT_PATH / "test_data" / "audio" + transc_path = ROOT_PATH / "test_data" / "transcriptions" + with (transc_path / "84-121550-0000.txt").open() as f: + transcription = f.read().strip() + data = [ + { + "path": str(audio_path / "84-121550-0001.flac"), + }, + { + "path": str(audio_path / "84-121550-0000.flac"), + "text": transcription + } + ] + + ds = CustomAudioDataset( + data=data, + text_encoder=config_parser.get_text_encoder(), + config_parser=config_parser, + ) + self._assert_training_example_is_good(ds[0], contains_text=False) + self._assert_training_example_is_good(ds[1]) + + def _assert_training_example_is_good(self, training_example: dict, contains_text=True): + + for field, expected_type in [ + ("audio", torch.Tensor), + ("spectrogram", torch.Tensor), + ("duration", float), + ("audio_path", str), + ("text", str), + ("text_encoded", torch.Tensor) + ]: + self.assertIn(field, training_example, f"Error during checking field {field}") + self.assertIsInstance(training_example[field], expected_type, + f"Error during checking field {field}") + + # check waveform dimensions + batch_dim, audio_dim, = training_example["audio"].size() + self.assertEqual(batch_dim, 1) + self.assertGreater(audio_dim, 1) + + # check spectrogram dimensions + batch_dim, freq_dim, time_dim = training_example["spectrogram"].size() + self.assertEqual(batch_dim, 1) + self.assertEqual(freq_dim, 128) + self.assertGreater(time_dim, 1) + + # check text tensor dimensions + batch_dim, length_dim, = training_example["text_encoded"].size() + self.assertEqual(batch_dim, 1) + if contains_text: + self.assertGreater(length_dim, 1) + else: + self.assertEqual(length_dim, 0) + self.assertEqual(training_example["text"], "") diff --git a/automatic-speech-recognition/hw_asr/tests/test_text_encoder.py b/automatic-speech-recognition/hw_asr/tests/test_text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..fae65114d3163f60ba3564de2f964354ea31ad11 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/tests/test_text_encoder.py @@ -0,0 +1,22 @@ +import unittest + +from hw_asr.text_encoder.ctc_char_text_encoder import CTCCharTextEncoder + + +class TestTextEncoder(unittest.TestCase): + def test_ctc_decode(self): + text_encoder = CTCCharTextEncoder() + text = "i^^ ^w^i^sss^hhh^ i ^^^s^t^aaaar^teee^d " \ + "dddddd^oooo^in^g tttttttth^iiiis h^^^^^^^^w^ e^a^r^li^er" + true_text = "i wish i started doing this hw earlier" + inds = [text_encoder.char2ind[c] for c in text] + decoded_text = text_encoder.ctc_decode(inds) + self.assertIn(decoded_text, true_text) + + # def test_beam_search(self): + # # TODO: (optional) write tests for beam search + # text_encoder = CTCCharTextEncoder() + + # len(text_encoder.ind2char) + # probs + # text_encoder.ctc_beam_search() diff --git a/automatic-speech-recognition/hw_asr/tests/utils.py b/automatic-speech-recognition/hw_asr/tests/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..181b03ed8a1b999a84ababbb44573e165ebb8a1b --- /dev/null +++ b/automatic-speech-recognition/hw_asr/tests/utils.py @@ -0,0 +1,22 @@ +import platform +import shutil +from contextlib import contextmanager +from time import sleep + +from hw_asr.utils.parse_config import ConfigParser + + +@contextmanager +def clear_log_folder_after_use(config_parser: ConfigParser): + # this context manager deletes the log folders weather the body was executed succesfully or not + try: + yield config_parser + finally: + if platform.system() == 
"Windows": + # Running unittest on windows results in a delete lock on the log directories just skip + # this cleanup for windows and wait 1s to have a different experiment name. + # (if you know how to fix it, you are welcome to make pull request) + sleep(1) + else: + shutil.rmtree(config_parser.save_dir) + shutil.rmtree(config_parser.log_dir) diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__init__.py b/automatic-speech-recognition/hw_asr/text_encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4ffac14abbfc1939926914d6244e6e689a953cf3 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/text_encoder/__init__.py @@ -0,0 +1,7 @@ +from .char_text_encoder import CharTextEncoder +from .ctc_char_text_encoder import CTCCharTextEncoder + +__all__ = [ + "CharTextEncoder", + "CTCCharTextEncoder" +] diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5be326471283cf40d0cd9caa188fda5b7dcd8372 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..155dfc0394e51d8e8d82300274cd9c4179eb339d Binary files /dev/null and b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/char_text_encoder.cpython-310.pyc b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/char_text_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9bf0ce7e443a862db9acf39f6b37c0a8228b83ac Binary files /dev/null and b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/char_text_encoder.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/char_text_encoder.cpython-311.pyc b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/char_text_encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..daafd6df7d0a7d0bcad85358531cbc6480665abc Binary files /dev/null and b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/char_text_encoder.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/ctc_char_text_encoder.cpython-310.pyc b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/ctc_char_text_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0bf3d80535bff4e5639500722938f72ef3a26842 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/ctc_char_text_encoder.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/ctc_char_text_encoder.cpython-311.pyc b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/ctc_char_text_encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63ec6327d8ea6fe2bdd3ddfd407a4dd95858fd7d Binary files /dev/null and b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/ctc_char_text_encoder.cpython-311.pyc differ diff --git 
a/automatic-speech-recognition/hw_asr/text_encoder/char_text_encoder.py b/automatic-speech-recognition/hw_asr/text_encoder/char_text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..b1f8ef0826cf2ce0e6c34fce0d1cabb88d0acbb6 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/text_encoder/char_text_encoder.py @@ -0,0 +1,51 @@ +import json +from pathlib import Path +from string import ascii_lowercase +from typing import List, Union + +import numpy as np +from torch import Tensor + +from hw_asr.base.base_text_encoder import BaseTextEncoder + + +class CharTextEncoder(BaseTextEncoder): + + def __init__(self, alphabet: List[str] = None): + if alphabet is None: + alphabet = list(ascii_lowercase + ' ') + self.alphabet = alphabet + self.ind2char = {k: v for k, v in enumerate(sorted(alphabet))} + self.char2ind = {v: k for k, v in self.ind2char.items()} + + def __len__(self): + return len(self.ind2char) + + def __getitem__(self, item: int): + assert type(item) is int + return self.ind2char[item] + + def encode(self, text) -> Tensor: + text = self.normalize_text(text) + try: + return Tensor([self.char2ind[char] for char in text]).unsqueeze(0) + except KeyError as e: + unknown_chars = set([char for char in text if char not in self.char2ind]) + raise Exception( + f"Can't encode text '{text}'. Unknown chars: '{' '.join(unknown_chars)}'") + + def decode(self, vector: Union[Tensor, np.ndarray, List[int]]): + return ''.join([self.ind2char[int(ind)] for ind in vector]).strip() + + def dump(self, file): + with Path(file).open('w') as f: + json.dump(self.ind2char, f) + + @classmethod + def from_file(cls, file): + with Path(file).open() as f: + ind2char = json.load(f) + a = cls([]) + a.ind2char = ind2char + a.char2ind = {v: k for k, v in ind2char} + return a diff --git a/automatic-speech-recognition/hw_asr/text_encoder/ctc_char_text_encoder.py b/automatic-speech-recognition/hw_asr/text_encoder/ctc_char_text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..90e23b5fcd5b425e8219e7a6d016289d65ce86f0 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/text_encoder/ctc_char_text_encoder.py @@ -0,0 +1,86 @@ +from typing import List, NamedTuple + +import torch +from pyctcdecode import build_ctcdecoder + + +from hw_asr.base.base_text_encoder import BaseTextEncoder +from .char_text_encoder import CharTextEncoder +from collections import defaultdict + + +class Hypothesis(NamedTuple): + text: str + prob: float + + +class CTCCharTextEncoder(CharTextEncoder): + EMPTY_TOK = "^" + + def __init__(self, alphabet: List[str] = None, kenlm_model_path: str = None, unigrams_path: str = None): + super().__init__(alphabet) + vocab = [self.EMPTY_TOK] + list(self.alphabet) + self.ind2char = dict(enumerate(vocab)) + self.char2ind = {v: k for k, v in self.ind2char.items()} + if kenlm_model_path is not None: + with open(unigrams_path) as f: + unigrams = [line.strip() for line in f.readlines()] + self.decoder = build_ctcdecoder(labels=[""] + self.alphabet, kenlm_model_path=kenlm_model_path, unigrams=unigrams) + + def ctc_decode(self, inds: List[int]) -> str: + # TODO: your code here + result = [] + last_char = self.EMPTY_TOK + for ind in inds: + cur_char = self.ind2char[ind] + if cur_char != self.EMPTY_TOK and last_char != cur_char: + result.append(cur_char) + last_char = cur_char + return ''.join(result) + + def ctc_beam_search(self, probs: torch.tensor, beam_size: int) -> str: + """ + Performs beam search and returns a list of pairs (hypothesis, hypothesis probability). 
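+        In this implementation only the text of the most probable hypothesis is returned.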
+ """ + assert len(probs.shape) == 2 + char_length, voc_size = probs.shape + assert voc_size == len(self.ind2char) + hypos: List[Hypothesis] = [] + # TODO: your code here + + def extend_and_merge(frame, state): + new_state = defaultdict(float) + for next_char_index, next_char_proba in enumerate(frame): + for (pref, last_char), pref_proba in state.items(): + next_char = self.ind2char[next_char_index] + if next_char == last_char: + new_pref = pref + else: + if next_char != self.EMPTY_TOK: + new_pref = pref + next_char + else: + new_pref = pref + last_char = next_char + new_state[(new_pref, last_char)] += pref_proba * next_char_proba + return new_state + + def truncate(state, beam_size): + state_list = list(state.items()) + state_list.sort(key=lambda x: -x[1]) + return dict(state_list[:beam_size]) + + state = {('', self.EMPTY_TOK): 1.0} + for frame in probs: + state = extend_and_merge(frame, state) + state = truncate(state, beam_size) + state_list = list(state.items()) + state_list.sort(key=lambda x: -x[1]) + + # for state in state_list: + # hypos.append(Hypothesis(state[0][0], state[1])) + + return state_list[0][0][0] + + def ctc_lm_beam_search(self, logits: torch.tensor) -> str: + assert self.decoder is not None + return self.decoder.decode(logits, beam_width=500).lower() \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/text_encoder/fix_vocab.py b/automatic-speech-recognition/hw_asr/text_encoder/fix_vocab.py new file mode 100644 index 0000000000000000000000000000000000000000..265381035a09cd61a20a9980805f04f7628aa5d7 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/text_encoder/fix_vocab.py @@ -0,0 +1,9 @@ +fin = open("hw_asr/text_encoder/librispeech-vocab.txt", "r") +fout = open("hw_asr/text_encoder/librispeech-fixed-vocab.txt", "w+") + +while line := fin.readline(): + line = line.lower().replace("'", "") + print(line, end="", file=fout) + +fin.close() +fout.close() diff --git a/automatic-speech-recognition/hw_asr/text_encoder/lower_model.py b/automatic-speech-recognition/hw_asr/text_encoder/lower_model.py new file mode 100644 index 0000000000000000000000000000000000000000..f90332627a79cdbcabae5515fae11a6f1790a724 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/text_encoder/lower_model.py @@ -0,0 +1,7 @@ +model_path = "3-gram.arpa" +lower_model_path = "lower_3-gram.arpa" + +with open(model_path, 'r') as f1: + with open(lower_model_path, "w") as f2: + for line in f1: + f2.write(line.lower()) \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/trainer/__init__.py b/automatic-speech-recognition/hw_asr/trainer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5c0a8a4a91724515aee0aecd8217cfe16ee5ec80 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/trainer/__init__.py @@ -0,0 +1 @@ +from .trainer import * diff --git a/automatic-speech-recognition/hw_asr/trainer/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/trainer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7ab25f648aa074db07b92d5c059d85f0e049a26 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/trainer/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/trainer/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/trainer/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f634f50ab8078b682cbe615cf6d8852962779d8 Binary 
files /dev/null and b/automatic-speech-recognition/hw_asr/trainer/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/trainer/__pycache__/trainer.cpython-310.pyc b/automatic-speech-recognition/hw_asr/trainer/__pycache__/trainer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86f0ec4817d9ef593eebf8ba1cdd959c47bbb4bc Binary files /dev/null and b/automatic-speech-recognition/hw_asr/trainer/__pycache__/trainer.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/trainer/__pycache__/trainer.cpython-311.pyc b/automatic-speech-recognition/hw_asr/trainer/__pycache__/trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d535aec2f2056fee1505924742f4becb89364dd Binary files /dev/null and b/automatic-speech-recognition/hw_asr/trainer/__pycache__/trainer.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/trainer/trainer.py b/automatic-speech-recognition/hw_asr/trainer/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..f4b6b786bdd04ebf095764a3a5dfcdbd22f68fe8 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/trainer/trainer.py @@ -0,0 +1,274 @@ +import random +from pathlib import Path +from random import shuffle + +import PIL +import pandas as pd +import numpy as np +import torch +import torch.nn.functional as F +from torch.nn.utils import clip_grad_norm_ +from torchvision.transforms import ToTensor +from tqdm import tqdm + +from hw_asr.base import BaseTrainer +from hw_asr.base.base_text_encoder import BaseTextEncoder +from hw_asr.logger.utils import plot_spectrogram_to_buf +from hw_asr.metric.utils import calc_cer, calc_wer +from hw_asr.utils import inf_loop, MetricTracker + + +class Trainer(BaseTrainer): + """ + Trainer class + """ + + def __init__( + self, + model, + criterion, + metrics, + optimizer, + config, + device, + dataloaders, + text_encoder, + lr_scheduler=None, + len_epoch=None, + skip_oom=True, + ): + super().__init__(model, criterion, metrics, optimizer, config, device) + self.skip_oom = skip_oom + self.text_encoder = text_encoder + self.config = config + self.train_dataloader = dataloaders["train"] + if len_epoch is None: + # epoch-based training + self.len_epoch = len(self.train_dataloader) + else: + # iteration-based training + self.train_dataloader = inf_loop(self.train_dataloader) + self.len_epoch = len_epoch + self.evaluation_dataloaders = {k: v for k, v in dataloaders.items() if k != "train"} + self.lr_scheduler = lr_scheduler + self.log_step = 50 + + self.train_metrics = MetricTracker("loss", "grad norm", *[m.name for m in self.metrics], writer=self.writer) + self.evaluation_metrics = MetricTracker("loss", *[m.name for m in self.metrics], writer=self.writer) + + @staticmethod + def move_batch_to_device(batch, device: torch.device): + """ + Move all necessary tensors to the HPU + """ + for tensor_for_gpu in ["spectrogram", "text_encoded"]: + batch[tensor_for_gpu] = batch[tensor_for_gpu].to(device) + return batch + + def _clip_grad_norm(self): + if self.config["trainer"].get("grad_norm_clip", None) is not None: + clip_grad_norm_(self.model.parameters(), self.config["trainer"]["grad_norm_clip"]) + + def _train_epoch(self, epoch): + """ + Training logic for an epoch + + :param epoch: Integer, current training epoch. + :return: A log that contains average loss and metric in this epoch. 
+ """ + self.model.train() + self.train_metrics.reset() + self.writer.add_scalar("epoch", epoch) + for batch_idx, batch in enumerate(tqdm(self.train_dataloader, desc="train", total=self.len_epoch - 1)): + try: + batch = self.process_batch( + batch, + is_train=True, + metrics=self.train_metrics, + ) + except RuntimeError as e: + if "out of memory" in str(e) and self.skip_oom: + self.logger.warning("OOM on batch. Skipping batch.") + for p in self.model.parameters(): + if p.grad is not None: + del p.grad # free some memory + torch.cuda.empty_cache() + continue + else: + raise e + self.train_metrics.update("grad norm", self.get_grad_norm()) + if batch_idx % self.log_step == 0: + self.writer.set_step((epoch - 1) * self.len_epoch + batch_idx) + self.logger.debug("Train Epoch: {} {} Loss: {:.6f}".format(epoch, self._progress(batch_idx), batch["loss"].item())) + self.writer.add_scalar("learning rate", self.lr_scheduler.get_last_lr()[0]) + self._log_predictions(**batch) + self._log_spectrogram(batch["spectrogram"]) + self._log_scalars(self.train_metrics) + # we don't want to reset train metrics at the start of every epoch + # because we are interested in recent train metrics + last_train_metrics = self.train_metrics.result() + self.train_metrics.reset() + if batch_idx + 1 >= self.len_epoch: + break + log = last_train_metrics + + for part, dataloader in self.evaluation_dataloaders.items(): + val_log = self._evaluation_epoch(epoch, part, dataloader) + log.update(**{f"{part}_{name}": value for name, value in val_log.items()}) + + return log + + def process_batch(self, batch, is_train: bool, metrics: MetricTracker, part: str = None, epoch: int = None): + batch = self.move_batch_to_device(batch, self.device) + if is_train: + self.optimizer.zero_grad() + outputs = self.model(**batch) + if type(outputs) is dict: + batch.update(outputs) + else: + batch["logits"] = outputs + + batch["log_probs"] = F.log_softmax(batch["logits"], dim=-1) + batch["log_probs_length"] = self.model.transform_input_lengths(batch["spectrogram_length"]) + batch["loss"] = self.criterion(**batch) + if is_train: + batch["loss"].backward() + self._clip_grad_norm() + self.optimizer.step() + if self.lr_scheduler is not None: + self.lr_scheduler.step() + + metrics.update("loss", batch["loss"].item()) + for met in self.metrics: + is_not_test = is_train or ("val" in part) + is_test = not is_not_test + hard_to_calc_metric = "beam search" in met.name or "LM" in met.name + if hard_to_calc_metric and (is_not_test or (is_test and (epoch % 25) != 0)): + continue + metrics.update(met.name, met(**batch)) + return batch + + def _evaluation_epoch(self, epoch, part, dataloader): + """ + Validate after training an epoch + + :param epoch: Integer, current training epoch. 
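+        :param part: String, name of the evaluation partition (a key of the dataloaders dict, e.g. "val").
+        :param dataloader: DataLoader with the data of that partition.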
+ :return: A log that contains information about validation + """ + self.model.eval() + self.evaluation_metrics.reset() + with torch.no_grad(): + for batch_idx, batch in tqdm( + enumerate(dataloader), + desc=part, + total=len(dataloader), + ): + batch = self.process_batch(batch, is_train=False, metrics=self.evaluation_metrics, part=part, epoch=epoch) + self.writer.set_step(epoch * self.len_epoch, part) + self._log_predictions(**batch) + self._log_spectrogram(batch["spectrogram"]) + self._log_scalars(self.evaluation_metrics) + + # add histogram of model parameters to the tensorboard + # for name, p in self.model.named_parameters(): + # self.writer.add_histogram(name, p, bins="auto") + return self.evaluation_metrics.result() + + def _progress(self, batch_idx): + base = "[{}/{} ({:.0f}%)]" + if hasattr(self.train_dataloader, "n_samples"): + current = batch_idx * self.train_dataloader.batch_size + total = self.train_dataloader.n_samples + else: + current = batch_idx + total = self.len_epoch + return base.format(current, total, 100.0 * current / total) + + def _log_predictions( + self, + text, + logits, + log_probs, + log_probs_length, + audio_path, + audio, + examples_to_log=10, + *args, + **kwargs, + ): + # TODO: implement logging of beam search results + if self.writer is None: + return + + ids = np.random.choice(len(text), examples_to_log, replace=False) + text = [text[i] for i in ids] + logits = logits[ids] + log_probs = log_probs[ids] + log_probs_length = log_probs_length[ids] + audio_path = [audio_path[i] for i in ids] + audio = [audio[i] for i in ids] + + argmax_inds = log_probs.cpu().argmax(-1).numpy() + argmax_inds = [inds[: int(ind_len)] for inds, ind_len in zip(argmax_inds, log_probs_length.numpy())] + argmax_texts_raw = [self.text_encoder.decode(inds) for inds in argmax_inds] + argmax_texts = [self.text_encoder.ctc_decode(inds) for inds in argmax_inds] + + probs = np.exp(log_probs.detach().cpu().numpy()) + probs_length = log_probs_length.detach().cpu().numpy() + bs_preds = [self.text_encoder.ctc_beam_search(prob[:prob_length], 4) for prob, prob_length in zip(probs, probs_length)] + + logits = logits.detach().cpu().numpy() + lm_preds = [self.text_encoder.ctc_lm_beam_search(logit[:length]) for logit, length in zip(logits, probs_length)] + + tuples = list(zip(argmax_texts, bs_preds, lm_preds, text, argmax_texts_raw, audio_path, audio)) + rows = {} + for pred, bs_pred, lm_pred, target, raw_pred, audio_path, audio in tuples: + target = BaseTextEncoder.normalize_text(target) + wer = calc_wer(target, pred) * 100 + cer = calc_cer(target, pred) * 100 + + bs_wer = calc_wer(target, bs_pred) * 100 + bs_cer = calc_cer(target, bs_pred) * 100 + + lm_wer = calc_wer(target, lm_pred) * 100 + lm_cer = calc_cer(target, lm_pred) * 100 + + rows[Path(audio_path).name] = { + "orig_audio": self.writer.wandb.Audio(audio_path), # inaccurate, but no changes in the template + "augm_audio": self.writer.wandb.Audio(audio.squeeze().numpy(), sample_rate=16000), # inaccurate, but no changes in the template + "target": target, + "raw pred": raw_pred, + "pred": pred, + "bs pred": bs_pred, + "lm pred": lm_pred, + "wer": wer, + "cer": cer, + "bs wer": bs_wer, + "bs cer": bs_cer, + "lm wer": lm_wer, + "lm cer": lm_cer, + } + self.writer.add_table("predictions", pd.DataFrame.from_dict(rows, orient="index")) + + def _log_spectrogram(self, spectrogram_batch): + spectrogram = random.choice(spectrogram_batch.cpu()) + image = PIL.Image.open(plot_spectrogram_to_buf(spectrogram)) + self.writer.add_image("spectrogram", 
ToTensor()(image)) + + @torch.no_grad() + def get_grad_norm(self, norm_type=2): + parameters = self.model.parameters() + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = [p for p in parameters if p.grad is not None] + total_norm = torch.norm( + torch.stack([torch.norm(p.grad.detach(), norm_type).cpu() for p in parameters]), + norm_type, + ) + return total_norm.item() + + def _log_scalars(self, metric_tracker: MetricTracker): + if self.writer is None: + return + for metric_name in metric_tracker.keys(): + self.writer.add_scalar(f"{metric_name}", metric_tracker.avg(metric_name)) diff --git a/automatic-speech-recognition/hw_asr/utils/__init__.py b/automatic-speech-recognition/hw_asr/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..46d3a156a78c6ef994a0ba7e92a334a4a6b16b8b --- /dev/null +++ b/automatic-speech-recognition/hw_asr/utils/__init__.py @@ -0,0 +1 @@ +from .util import * diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d8ebcc423b144fd994dd785352edec675601bc7 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6be4d757ad564f9dcdf665ccef09849a1be1e8b Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/object_loading.cpython-310.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/object_loading.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb66d7f2a8fea3a2a8d2c541cce73dc539acf7fe Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/object_loading.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/object_loading.cpython-311.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/object_loading.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..014cb4af46bfbe4ce1441a78f09fb5ad45c7767c Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/object_loading.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/parse_config.cpython-310.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/parse_config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..282688690bd6b55c539316af10a6b23b918a3c64 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/parse_config.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/parse_config.cpython-311.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/parse_config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9c32fb1cd1d1717df562665566888553a6e1867 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/parse_config.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/util.cpython-310.pyc 
b/automatic-speech-recognition/hw_asr/utils/__pycache__/util.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aeb0f4198abbe5473f3282ee9095be432fc4baae Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/util.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/util.cpython-311.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/util.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8566bd2b51c23eac3f07533d45fec0a4396e8b6d Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/util.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/object_loading.py b/automatic-speech-recognition/hw_asr/utils/object_loading.py new file mode 100644 index 0000000000000000000000000000000000000000..0107c56f17cdd288a6139e1d6a1ae6bfa28cbca3 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/utils/object_loading.py @@ -0,0 +1,63 @@ +from operator import xor + +from torch.utils.data import ConcatDataset, DataLoader + +import hw_asr.augmentations +import hw_asr.datasets +from hw_asr import batch_sampler as batch_sampler_module +from hw_asr.base.base_text_encoder import BaseTextEncoder +from hw_asr.collate_fn.collate import collate_fn +from hw_asr.utils.parse_config import ConfigParser + + +def get_dataloaders(configs: ConfigParser, text_encoder: BaseTextEncoder): + dataloaders = {} + for split, params in configs["data"].items(): + num_workers = params.get("num_workers", 1) + + # set train augmentations + if split == 'train': + wave_augs, spec_augs = hw_asr.augmentations.from_configs(configs) + drop_last = True + else: + wave_augs, spec_augs = None, None + drop_last = False + + # create and join datasets + datasets = [] + for ds in params["datasets"]: + datasets.append(configs.init_obj( + ds, hw_asr.datasets, text_encoder=text_encoder, config_parser=configs, + wave_augs=wave_augs, spec_augs=spec_augs)) + assert len(datasets) + if len(datasets) > 1: + dataset = ConcatDataset(datasets) + else: + dataset = datasets[0] + + # select batch size or batch sampler + assert xor("batch_size" in params, "batch_sampler" in params), \ + "You must provide batch_size or batch_sampler for each split" + if "batch_size" in params: + bs = params["batch_size"] + shuffle = True + batch_sampler = None + elif "batch_sampler" in params: + batch_sampler = configs.init_obj(params["batch_sampler"], batch_sampler_module, + data_source=dataset) + bs, shuffle = 1, False + else: + raise Exception() + + # Fun fact. 
An hour of debugging was wasted to write this line + assert bs <= len(dataset), \ + f"Batch size ({bs}) shouldn't be larger than dataset length ({len(dataset)})" + + # create dataloader + dataloader = DataLoader( + dataset, batch_size=bs, collate_fn=collate_fn, + shuffle=shuffle, num_workers=num_workers, + batch_sampler=batch_sampler, drop_last=drop_last + ) + dataloaders[split] = dataloader + return dataloaders diff --git a/automatic-speech-recognition/hw_asr/utils/parse_config.py b/automatic-speech-recognition/hw_asr/utils/parse_config.py new file mode 100644 index 0000000000000000000000000000000000000000..51e8f2d4f2137d79a96e395b6db601887414d31b --- /dev/null +++ b/automatic-speech-recognition/hw_asr/utils/parse_config.py @@ -0,0 +1,203 @@ +import importlib +import json +import logging +import os +from datetime import datetime +from functools import reduce, partial +from operator import getitem +from pathlib import Path + +from hw_asr import text_encoder as text_encoder_module +from hw_asr.base.base_text_encoder import BaseTextEncoder +from hw_asr.logger import setup_logging +from hw_asr.text_encoder import CTCCharTextEncoder +from hw_asr.utils import read_json, write_json, ROOT_PATH + + +class ConfigParser: + def __init__(self, config, resume=None, modification=None, run_id=None): + """ + class to parse configuration json file. Handles hyperparameters for training, + initializations of modules, checkpoint saving and logging module. + :param config: Dict containing configurations, hyperparameters for training. + contents of `config.json` file for example. + :param resume: String, path to the checkpoint being loaded. + :param modification: Dict {keychain: value}, specifying position values to be replaced + from config dict. + :param run_id: Unique Identifier for training processes. + Used to save checkpoints and training log. Timestamp is being used as default + """ + # load config file and apply modification + self._config = _update_config(config, modification) + self.resume = resume + self._text_encoder = None + + # set save_dir where trained model and log will be saved. + save_dir = Path(self.config["trainer"]["save_dir"]) + + exper_name = self.config["name"] + if run_id is None: # use timestamp as default run-id + run_id = datetime.now().strftime(r"%m%d_%H%M%S") + self._save_dir = str(save_dir / "models" / exper_name / run_id) + self._log_dir = str(save_dir / "log" / exper_name / run_id) + + # make directory for saving checkpoints and log. + exist_ok = run_id == "" + self.save_dir.mkdir(parents=True, exist_ok=exist_ok) + self.log_dir.mkdir(parents=True, exist_ok=exist_ok) + + # save updated config file to the checkpoint dir + write_json(self.config, self.save_dir / "config.json") + + # configure logging module + setup_logging(self.log_dir) + self.log_levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG} + + @classmethod + def from_args(cls, args, options=""): + """ + Initialize this class from some cli arguments. Used in train, test. + """ + for opt in options: + args.add_argument(*opt.flags, default=None, type=opt.type) + if not isinstance(args, tuple): + args = args.parse_args() + + if args.device is not None: + os.environ["CUDA_VISIBLE_DEVICES"] = args.device + if args.resume is not None: + resume = Path(args.resume) + cfg_fname = resume.parent / "config.json" + else: + msg_no_cfg = "Configuration file need to be specified. " \ + "Add '-c config.json', for example." 
+ assert args.config is not None, msg_no_cfg + resume = None + cfg_fname = Path(args.config) + + config = read_json(cfg_fname) + if args.config and resume: + # update new config for fine-tuning + config.update(read_json(args.config)) + + # parse custom cli options into dictionary + modification = { + opt.target: getattr(args, _get_opt_name(opt.flags)) for opt in options + } + return cls(config, resume, modification) + + @staticmethod + def init_obj(obj_dict, default_module, *args, **kwargs): + """ + Finds a function handle with the name given as 'type' in config, and returns the + instance initialized with corresponding arguments given. + + `object = config.init_obj(config['param'], module, a, b=1)` + is equivalent to + `object = module.name(a, b=1)` + """ + if "module" in obj_dict: + default_module = importlib.import_module(obj_dict["module"]) + + module_name = obj_dict["type"] + module_args = dict(obj_dict["args"]) + assert all( + [k not in module_args for k in kwargs] + ), "Overwriting kwargs given in config file is not allowed" + module_args.update(kwargs) + return getattr(default_module, module_name)(*args, **module_args) + + def init_ftn(self, name, module, *args, **kwargs): + """ + Finds a function handle with the name given as 'type' in config, and returns the + function with given arguments fixed with functools.partial. + + `function = config.init_ftn('name', module, a, b=1)` + is equivalent to + `function = lambda *args, **kwargs: module.name(a, *args, b=1, **kwargs)`. + """ + module_name = self[name]["type"] + module_args = dict(self[name]["args"]) + assert all( + [k not in module_args for k in kwargs] + ), "Overwriting kwargs given in config file is not allowed" + module_args.update(kwargs) + return partial(getattr(module, module_name), *args, **module_args) + + def __getitem__(self, name): + """Access items like ordinary dict.""" + return self.config[name] + + def get_logger(self, name, verbosity=2): + msg_verbosity = "verbosity option {} is invalid. 
Valid options are {}.".format( + verbosity, self.log_levels.keys() + ) + assert verbosity in self.log_levels, msg_verbosity + logger = logging.getLogger(name) + logger.setLevel(self.log_levels[verbosity]) + return logger + + def get_text_encoder(self) -> BaseTextEncoder: + if self._text_encoder is None: + if "text_encoder" not in self._config: + self._text_encoder = CTCCharTextEncoder() + elif self._config["text_encoder"] == "CTCCharTextEncoder": + self._text_encoder = CTCCharTextEncoder(self._config["text_encoder"]["args"]) + else: + self._text_encoder = self.init_obj(self["text_encoder"], + default_module=text_encoder_module) + return self._text_encoder + + # setting read-only attributes + @property + def config(self): + return self._config + + @property + def save_dir(self): + return Path(self._save_dir) + + @property + def log_dir(self): + return Path(self._log_dir) + + @classmethod + def get_default_configs(cls): + config_path = ROOT_PATH / "hw_asr" / "config.json" + with config_path.open() as f: + return cls(json.load(f)) + + @classmethod + def get_test_configs(cls): + config_path = ROOT_PATH / "hw_asr" / "tests" / "config.json" + with config_path.open() as f: + return cls(json.load(f)) + + +# helper functions to update config dict with custom cli options +def _update_config(config, modification): + if modification is None: + return config + + for k, v in modification.items(): + if v is not None: + _set_by_path(config, k, v) + return config + + +def _get_opt_name(flags): + for flg in flags: + if flg.startswith("--"): + return flg.replace("--", "") + return flags[0].replace("--", "") + + +def _set_by_path(tree, keys, value): + """Set a value in a nested object in tree by sequence of keys.""" + keys = keys.split(";") + _get_by_path(tree, keys[:-1])[keys[-1]] = value + + +def _get_by_path(tree, keys): + """Access a nested object in tree by sequence of keys.""" + return reduce(getitem, keys, tree) diff --git a/automatic-speech-recognition/hw_asr/utils/util.py b/automatic-speech-recognition/hw_asr/utils/util.py new file mode 100644 index 0000000000000000000000000000000000000000..9f50d31451d689fef13dacbd33892ad02794851c --- /dev/null +++ b/automatic-speech-recognition/hw_asr/utils/util.py @@ -0,0 +1,82 @@ +import json +from collections import OrderedDict +from itertools import repeat +from pathlib import Path + +import pandas as pd +import torch + +ROOT_PATH = Path(__file__).absolute().resolve().parent.parent.parent + + +def ensure_dir(dirname): + dirname = Path(dirname) + if not dirname.is_dir(): + dirname.mkdir(parents=True, exist_ok=False) + + +def read_json(fname): + fname = Path(fname) + with fname.open("rt") as handle: + return json.load(handle, object_hook=OrderedDict) + + +def write_json(content, fname): + fname = Path(fname) + with fname.open("wt") as handle: + json.dump(content, handle, indent=4, sort_keys=False) + + +def inf_loop(data_loader): + """wrapper function for endless data loader.""" + for loader in repeat(data_loader): + yield from loader + + +def prepare_device(n_gpu_use): + """ + setup GPU device if available. get gpu device indices which are used for DataParallel + """ + n_gpu = torch.cuda.device_count() + if n_gpu_use > 0 and n_gpu == 0: + print( + "Warning: There's no GPU available on this machine," + "training will be performed on CPU." + ) + n_gpu_use = 0 + if n_gpu_use > n_gpu: + print( + f"Warning: The number of GPU's configured to use is {n_gpu_use}, but only {n_gpu} are " + "available on this machine." 
+ ) + n_gpu_use = n_gpu + device = torch.device("cuda:0" if n_gpu_use > 0 else "cpu") + list_ids = list(range(n_gpu_use)) + return device, list_ids + + +class MetricTracker: + def __init__(self, *keys, writer=None): + self.writer = writer + self._data = pd.DataFrame(index=keys, columns=["total", "counts", "average"]) + self.reset() + + def reset(self): + for col in self._data.columns: + self._data[col].values[:] = 0 + + def update(self, key, value, n=1): + # if self.writer is not None: + # self.writer.add_scalar(key, value) + self._data.total[key] += value * n + self._data.counts[key] += n + self._data.average[key] = self._data.total[key] / self._data.counts[key] + + def avg(self, key): + return self._data.average[key] + + def result(self): + return dict(self._data.average) + + def keys(self): + return self._data.total.keys() diff --git a/automatic-speech-recognition/requirements.txt b/automatic-speech-recognition/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad39fc449461551a1d1cdea08775d71c2266d9ea --- /dev/null +++ b/automatic-speech-recognition/requirements.txt @@ -0,0 +1,17 @@ +torch==2.1.0 +torchvision==0.16.0 +numpy +tqdm +tensorboard +matplotlib +pandas + +speechbrain==0.5.15 +datasets +torch_audiomentations +editdistance +wandb +pyctcdecode +torchaudio==2.1.0 +pillow +kenlm \ No newline at end of file diff --git a/automatic-speech-recognition/test.py b/automatic-speech-recognition/test.py new file mode 100644 index 0000000000000000000000000000000000000000..550eaec2dfee8d30c299707326746d61c7195f61 --- /dev/null +++ b/automatic-speech-recognition/test.py @@ -0,0 +1,202 @@ +import argparse +import json +import os +from pathlib import Path + +import torch +from tqdm import tqdm + +import hw_asr.model as module_model +from hw_asr.trainer import Trainer +from hw_asr.utils import ROOT_PATH +from hw_asr.utils.object_loading import get_dataloaders +from hw_asr.utils.parse_config import ConfigParser +from hw_asr.metric.utils import calc_wer + +DEFAULT_CHECKPOINT_PATH = ROOT_PATH / "default_test_model" / "checkpoint.pth" + + +def main(config, out_file): + logger = config.get_logger("test") + + # define cpu or gpu if possible + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # text_encoder + text_encoder = config.get_text_encoder() + + # setup data_loader instances + dataloaders = get_dataloaders(config, text_encoder) + + # build model architecture + model = config.init_obj(config["arch"], module_model, n_class=len(text_encoder)) + logger.info(model) + + logger.info("Loading checkpoint: {} ...".format(config.resume)) + checkpoint = torch.load(config.resume, map_location=device) + state_dict = checkpoint["state_dict"] + if config["n_gpu"] > 1: + model = torch.nn.DataParallel(model) + model.load_state_dict(state_dict) + + # prepare model for testing + model = model.to(device) + model.eval() + + results = [] + + argmax_wer_sum = 0 + beam_search_wer_sum = 0 + lm_wer_sum = 0 + + with torch.no_grad(): + for batch_num, batch in enumerate(tqdm(dataloaders["test"])): + batch = Trainer.move_batch_to_device(batch, device) + output = model(**batch) + if type(output) is dict: + batch.update(output) + else: + batch["logits"] = output + batch["log_probs"] = torch.log_softmax(batch["logits"], dim=-1) + batch["log_probs_length"] = model.transform_input_lengths(batch["spectrogram_length"]) + batch["probs"] = batch["log_probs"].exp().cpu() + batch["argmax"] = batch["probs"].argmax(-1) + for i in range(len(batch["text"])): + length = 
int(batch["log_probs_length"][i]) + ground_truth = batch["text"][i] + + argmax = batch["argmax"][i][:length].cpu().numpy() + text_argmax = text_encoder.ctc_decode(argmax) + + probs = batch["probs"][i][:length].detach().cpu().numpy() + text_beam_search = text_encoder.ctc_beam_search(probs, beam_size=4) + + logits = batch["logits"][i][:length].detach().cpu().numpy() + text_lm = text_encoder.ctc_lm_beam_search(logits) + + argmax_wer = calc_wer(ground_truth, text_argmax) * 100 + beam_search_wer = calc_wer(ground_truth, text_beam_search) * 100 + lm_wer = calc_wer(ground_truth, text_lm) * 100 + + argmax_wer_sum += argmax_wer + beam_search_wer_sum += beam_search_wer + lm_wer_sum += lm_wer + + results.append( + { + "ground_truth": ground_truth, + "pred_text_argmax": text_argmax, + "pred_text_beam_search": text_beam_search, + "pred_text_lm": text_lm, + "argmax_wer": argmax_wer, + "beam_search_wer": beam_search_wer, + "lm_wer": lm_wer, + } + ) + + n = len(results) + logger.info("argmax_wer_mean:") + logger.info(argmax_wer_sum / n) + logger.info("beam_search_wer_mean:") + logger.info(beam_search_wer_sum / n) + logger.info("lm_wer_mean:") + logger.info(lm_wer_sum / n) + + with Path(out_file).open("w") as f: + json.dump(results, f, indent=2) + + +if __name__ == "__main__": + args = argparse.ArgumentParser(description="PyTorch Template") + args.add_argument( + "-c", + "--config", + default=None, + type=str, + help="config file path (default: None)", + ) + args.add_argument( + "-r", + "--resume", + default=str(DEFAULT_CHECKPOINT_PATH.absolute().resolve()), + type=str, + help="path to latest checkpoint (default: None)", + ) + args.add_argument( + "-d", + "--device", + default=None, + type=str, + help="indices of GPUs to enable (default: all)", + ) + args.add_argument( + "-o", + "--output", + default="output.json", + type=str, + help="File to write results (.json)", + ) + args.add_argument( + "-t", + "--test-data-folder", + default=None, + type=str, + help="Path to dataset", + ) + args.add_argument( + "-b", + "--batch-size", + default=20, + type=int, + help="Test dataset batch size", + ) + args.add_argument( + "-j", + "--jobs", + default=1, + type=int, + help="Number of workers for test dataloader", + ) + + args = args.parse_args() + + # set GPUs + if args.device is not None: + os.environ["CUDA_VISIBLE_DEVICES"] = args.device + + # first, we need to obtain config with model parameters + # we assume it is located with checkpoint in the same folder + model_config = Path(args.resume).parent / "config.json" + with model_config.open() as f: + config = ConfigParser(json.load(f), resume=args.resume) + + # update with addition configs from `args.config` if provided + if args.config is not None: + with Path(args.config).open() as f: + config.config.update(json.load(f)) + + # if `--test-data-folder` was provided, set it as a default test set + if args.test_data_folder is not None: + test_data_folder = Path(args.test_data_folder).absolute().resolve() + assert test_data_folder.exists() + config.config["data"] = { + "test": { + "batch_size": args.batch_size, + "num_workers": args.jobs, + "datasets": [ + { + "type": "CustomDirAudioDataset", + "args": { + "audio_dir": str(test_data_folder / "audio"), + "transcription_dir": str(test_data_folder / "transcriptions"), + }, + } + ], + } + } + + assert config.config.get("data", {}).get("test", None) is not None + config["data"]["test"]["batch_size"] = args.batch_size + config["data"]["test"]["n_jobs"] = args.jobs + + main(config, args.output) diff --git 
a/automatic-speech-recognition/train.py b/automatic-speech-recognition/train.py new file mode 100644 index 0000000000000000000000000000000000000000..b46211f809d7c8cc146b8a1a62ea488ed5a7ee29 --- /dev/null +++ b/automatic-speech-recognition/train.py @@ -0,0 +1,107 @@ +import argparse +import collections +import warnings + +import numpy as np +import torch + +import hw_asr.loss as module_loss +import hw_asr.metric as module_metric +import hw_asr.model as module_arch +from hw_asr.trainer import Trainer +from hw_asr.utils import prepare_device +from hw_asr.utils.object_loading import get_dataloaders +from hw_asr.utils.parse_config import ConfigParser + +warnings.filterwarnings("ignore", category=UserWarning) + +# fix random seeds for reproducibility +SEED = 123 +torch.manual_seed(SEED) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False +np.random.seed(SEED) + + +def main(config): + logger = config.get_logger("train") + + # text_encoder + text_encoder = config.get_text_encoder() + + # setup data_loader instances + dataloaders = get_dataloaders(config, text_encoder) + + # build model architecture, then print to console + model = config.init_obj(config["arch"], module_arch, n_class=len(text_encoder)) + logger.info(model) + + # prepare for (multi-device) GPU training + device, device_ids = prepare_device(config["n_gpu"]) + model = model.to(device) + if len(device_ids) > 1: + model = torch.nn.DataParallel(model, device_ids=device_ids) + + # get function handles of loss and metrics + loss_module = config.init_obj(config["loss"], module_loss).to(device) + metrics = [ + config.init_obj(metric_dict, module_metric, text_encoder=text_encoder) + for metric_dict in config["metrics"] + ] + + # build optimizer, learning rate scheduler. delete every line containing lr_scheduler for + # disabling scheduler + trainable_params = filter(lambda p: p.requires_grad, model.parameters()) + optimizer = config.init_obj(config["optimizer"], torch.optim, trainable_params) + lr_scheduler = config.init_obj(config["lr_scheduler"], torch.optim.lr_scheduler, optimizer) + + trainer = Trainer( + model, + loss_module, + metrics, + optimizer, + text_encoder=text_encoder, + config=config, + device=device, + dataloaders=dataloaders, + lr_scheduler=lr_scheduler, + len_epoch=config["trainer"].get("len_epoch", None) + ) + + trainer.train() + + +if __name__ == "__main__": + args = argparse.ArgumentParser(description="PyTorch Template") + args.add_argument( + "-c", + "--config", + default=None, + type=str, + help="config file path (default: None)", + ) + args.add_argument( + "-r", + "--resume", + default=None, + type=str, + help="path to latest checkpoint (default: None)", + ) + args.add_argument( + "-d", + "--device", + default=None, + type=str, + help="indices of GPUs to enable (default: all)", + ) + + # custom cli options to modify configuration from default values given in json file. + CustomArgs = collections.namedtuple("CustomArgs", "flags type target") + options = [ + CustomArgs(["--lr", "--learning_rate"], type=float, target="optimizer;args;lr"), + CustomArgs( + ["--bs", "--batch_size"], type=int, target="data_loader;args;batch_size" + ), + ] + config = ConfigParser.from_args(args, options) + main(config)
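
The config-driven wiring above (parse_config.py's `init_obj` plus train.py's `CustomArgs` targets such as `"optimizer;args;lr"`) comes down to two small mechanisms: looking a class up by its `"type"` name inside a module and building it with the JSON `"args"`, and patching a nested config key through a `";"`-separated path. The following is a minimal, self-contained sketch of both, under assumptions: `init_obj` and `set_by_path` are simplified stand-ins for the repo's `ConfigParser.init_obj` and `_set_by_path`, the config fragment mirrors the optimizer block of the test config, and `torch.nn.Linear` is only a placeholder for the real model.

import torch

# Config fragment in the same shape as hw_asr/configs/*.json.
config = {
    "optimizer": {"type": "AdamW", "args": {"lr": 3e-4, "weight_decay": 1e-5}},
}

def init_obj(obj_dict, default_module, *args, **kwargs):
    # Resolve the class named by "type" inside `default_module` and build it
    # from the JSON "args" merged with call-site kwargs (simplified version
    # of ConfigParser.init_obj above).
    module_args = dict(obj_dict["args"])
    module_args.update(kwargs)
    return getattr(default_module, obj_dict["type"])(*args, **module_args)

def set_by_path(tree, keys, value):
    # "optimizer;args;lr" -> tree["optimizer"]["args"]["lr"] = value,
    # the same ";"-separated convention the CustomArgs targets rely on.
    parts = keys.split(";")
    node = tree
    for k in parts[:-1]:
        node = node[k]
    node[parts[-1]] = value

# Passing `--lr 1e-4` on the command line would translate to:
set_by_path(config, "optimizer;args;lr", 1e-4)

model = torch.nn.Linear(10, 10)  # placeholder model, not the real DeepSpeech2
optimizer = init_obj(config["optimizer"], torch.optim, model.parameters())
print(type(optimizer).__name__, optimizer.defaults["lr"])  # AdamW 0.0001

The same lookup-by-name pattern is what lets the JSON configs swap architectures, losses, metrics, datasets, and schedulers without touching train.py or test.py: only the `"type"` string and its `"args"` change.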