diff --git a/automatic-speech-recognition/Dockerfile b/automatic-speech-recognition/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..f2d36ccf9701bda2c40a81fae8ae064c45402b13 --- /dev/null +++ b/automatic-speech-recognition/Dockerfile @@ -0,0 +1,15 @@ +FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-devel +WORKDIR /repos/asr_project_template + +# Install requirements for torchaudio +RUN pip install sox && conda install torchaudio==0.11.0 -c pytorch && conda install -c conda-forge librosa + +# Install requirements +COPY requirements.txt ./ +RUN pip install -r requirements.txt + +# Copy the contents of the repository +COPY . . + +# Expose port +EXPOSE 3000 \ No newline at end of file diff --git a/automatic-speech-recognition/LICENSE b/automatic-speech-recognition/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..bb62e4c42ecc1094e674db1a0c98696bb7e48ef4 --- /dev/null +++ b/automatic-speech-recognition/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Daniil Ivanov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/automatic-speech-recognition/README.md b/automatic-speech-recognition/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2dca3a3410855cf9e04f1c3b0dcadfc24a4adeb0 --- /dev/null +++ b/automatic-speech-recognition/README.md @@ -0,0 +1,21 @@ +# ASR project barebones + +## Installation guide + +1. `pip install -r ./requirements.txt` +2. Download `3-gram.arpa.gz` and `librispeech-vocab.txt` from http://www.openslr.org/11/ +3. Run `python hw_asr/text_encoder/fix_vocab.py` and `python hw_asr/text_encoder/lower_model.py` to prepare the vocab and the model for use +4. If you want to test my model, download it from https://drive.google.com/file/d/1QrSsx56V5YNjGHUBWy6CIRVbNbjKWUpJ/view?usp=share_link , name it `checkpoint.pth`, and place it in the `default_test_model/` directory + +## Train + +1. `python train.py --config hw_asr/configs/config2.json` + +## Test + +1. `python test.py -c default_test_config.json -r default_test_model/checkpoint.pth` + +## Wandb report + +1. 
You can check my wandb report (only on Russian) and wandb project from the https://wandb.ai/tgritsaev/asr_project/reports/DLA-HW-1--Vmlldzo1NzY3NjA5?accessToken=kotkj5oyzomf2d2g1f40mczdnpirwvuw1f538zx9k491g1cfh3wg9iwhsb65o054 + diff --git a/automatic-speech-recognition/checkpoint.pth b/automatic-speech-recognition/checkpoint.pth new file mode 100644 index 0000000000000000000000000000000000000000..daf31cbefb1e62f163627551c41883c4982ab1d9 --- /dev/null +++ b/automatic-speech-recognition/checkpoint.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f1bebf1c95bb69c3130757b652c9fa4a975302b142a0b3d779d36ba404905ac +size 333205079 diff --git a/automatic-speech-recognition/default_test_config.json b/automatic-speech-recognition/default_test_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fd875a1e4926c8567a262c4148a9594db008134d --- /dev/null +++ b/automatic-speech-recognition/default_test_config.json @@ -0,0 +1,188 @@ +{ + "name": "default_test_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0.6, + "wave": [ + { + "type": "AddColoredNoise", + "args": { + "p": 1, + "sample_rate": 16000 + } + }, + { + "type": "Gain", + "args": { + "p": 0.8, + "sample_rate": 16000 + } + }, + { + "type": "HighPassFilter", + "args": { + "p": 0, + "sample_rate": 16000 + } + }, + { + "type": "LowPassFilter", + "args": { + "p": 0, + "sample_rate": 16000 + } + }, + { + "type": "PitchShift", + "args": { + "p": 0.8, + "min_transpose_semitones": -2, + "max_transpose_semitones": 2, + "sample_rate": 16000 + } + }, + { + "type": "PolarityInversion", + "args": { + "p": 0.8, + "sample_rate": 16000 + } + }, + { + "type": "Shift", + "args": { + "p": 0.8, + "sample_rate": 16000 + } + } + ], + "spectrogram": [ + { + "type": "TimeMasking", + "args": { + "time_mask_param": 80, + "p": 0.05 + } + }, + { + "type": "FrequencyMasking", + "args": { + "freq_mask_param": 80 + } + } + ] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 6, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "test": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 0.0003, + "weight_decay": 1e-05 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 4, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 4, + "name": "CER (beam search)" + } + }, + { + "type": "LanguageModelWERMetric", + "args": { + "name": "WER (LM)" + } + }, + { + "type": "LanguageModelCERMetric", + "args": { + "name": "CER (LM)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 0.0003, + "pct_start": 0.1 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + 
"save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} \ No newline at end of file diff --git a/automatic-speech-recognition/default_test_model/config.json b/automatic-speech-recognition/default_test_model/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e612026c1295aff58f8af408041e8178a8d7893b --- /dev/null +++ b/automatic-speech-recognition/default_test_model/config.json @@ -0,0 +1,242 @@ +{ + "name": "default_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0.6, + "wave": [ + { + "type": "AddColoredNoise", + "args": { + "p": 1, + "sample_rate": 16000 + } + }, + { + "type": "Gain", + "args": { + "p": 0.8, + "sample_rate": 16000 + } + }, + { + "type": "HighPassFilter", + "args": { + "p": 0, + "sample_rate": 16000 + } + }, + { + "type": "LowPassFilter", + "args": { + "p": 0, + "sample_rate": 16000 + } + }, + { + "type": "PitchShift", + "args": { + "p": 0.8, + "min_transpose_semitones": -2, + "max_transpose_semitones": 2, + "sample_rate": 16000 + } + }, + { + "type": "PolarityInversion", + "args": { + "p": 0.8, + "sample_rate": 16000 + } + }, + { + "type": "Shift", + "args": { + "p": 0.8, + "sample_rate": 16000 + } + } + ], + "spectrogram": [ + { + "type": "TimeMasking", + "args": { + "time_mask_param": 80, + "p": 0.05 + } + }, + { + "type": "FrequencyMasking", + "args": { + "freq_mask_param": 80 + } + } + ] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 6, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "train": { + "batch_size": 128, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-100", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-360", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-other-500", + "max_audio_length": 40.0, + "max_text_length": 400 + } + } + ] + }, + "val": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean" + } + } + ] + }, + "test-other": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + }, + "test-clean": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-clean" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 0.0003, + "weight_decay": 1e-05 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 4, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 4, + "name": "CER (beam search)" + } + }, + { + "type": 
"LanguageModelWERMetric", + "args": { + "name": "WER (LM)" + } + }, + { + "type": "LanguageModelCERMetric", + "args": { + "name": "CER (LM)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 0.0003, + "pct_start": 0.1 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/__init__.py b/automatic-speech-recognition/hw_asr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ca1e3113f0f8798d4b118037aee5326ef566aa5 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cafeb295d339a2b7caa7ffe80b365cf5335278f9 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__init__.py b/automatic-speech-recognition/hw_asr/augmentations/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b2633e40c374f42c7dd9abd385a2d51705b9aae --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/__init__.py @@ -0,0 +1,36 @@ +from collections.abc import Callable +from typing import List + +import hw_asr.augmentations.spectrogram_augmentations +import hw_asr.augmentations.wave_augmentations +from hw_asr.augmentations.random_choice import RandomChoice +from hw_asr.augmentations.sequential_random_apply import SequentialRandomApply +# from hw_asr.augmentations.sequential import SequentialAugmentation +# from hw_asr.augmentations.random_apply import RandomApply +from hw_asr.utils.parse_config import ConfigParser + + +def from_configs(configs: ConfigParser): + wave_augs = [] + if "augmentations" in configs.config and "wave" in configs.config["augmentations"]: + for aug_dict in configs.config["augmentations"]["wave"]: + wave_augs.append( + configs.init_obj(aug_dict, hw_asr.augmentations.wave_augmentations) + ) + + spec_augs = [] + if "augmentations" in configs.config and "spectrogram" in configs.config["augmentations"]: + for aug_dict in configs.config["augmentations"]["spectrogram"]: + spec_augs.append( + configs.init_obj(aug_dict, hw_asr.augmentations.spectrogram_augmentations) + ) + return _to_function(RandomChoice, wave_augs, configs.config["augmentations"]["random_apply_p"]), _to_function(SequentialRandomApply, spec_augs, configs.config["augmentations"]["random_apply_p"]) + + +def _to_function(random_type, augs_list: List[Callable], p: float): + if len(augs_list) == 0: + return None + elif len(augs_list) == 1: + return augs_list[0] + else: + return random_type(augs_list, p) diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-310.pyc 
b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e048fea1094265d8f745c5da034deacf07790a21 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c39bc9c81aa2a387d268dc520ffbda88d2f4b556 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cec98a3d5c51b722ca59bb176d5e2710a0815587 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..433fc8be1385f2385e5bb9ba597d8aeeb57e0f0e Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a4b3f20c5522ef5948529c2d32dbdebe209394d Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e93d7f7a13c35970041cee925d2f3c8cf005da0 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51312b70f24eb8e9ab260f403ddfa4d86bb1e579 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2fa1af459f593cdb6f33250dd04127f3a05cab79 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d92848f8e23af02631a05da4e4a6c8dc7134dc8f Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f95b354dbffddda7d240c8d02999a2d4cc9a63bb Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential_random_apply.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential_random_apply.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b9cc0898e03570248d616145595c89041cd0714 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential_random_apply.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/base.py b/automatic-speech-recognition/hw_asr/augmentations/base.py new file mode 100644 index 0000000000000000000000000000000000000000..026818f5dc83d8377dab75e64ed584ca24822563 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/base.py @@ -0,0 +1,6 @@ +from torch import Tensor + + +class AugmentationBase: + def __call__(self, data: Tensor) -> Tensor: + raise NotImplementedError() diff --git a/automatic-speech-recognition/hw_asr/augmentations/random_apply.py b/automatic-speech-recognition/hw_asr/augmentations/random_apply.py new file mode 100644 index 0000000000000000000000000000000000000000..985a63017a2665d28150bb83b5549877f52092cc --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/random_apply.py @@ -0,0 +1,16 @@ +import random +from typing import Callable +from torch import Tensor + + +class RandomApply: + def __init__(self, augmentation: Callable, p: float): + assert 0 <= p <= 1 + self.augmentation = augmentation + self.p = p + + def __call__(self, data: Tensor) -> Tensor: + if random.random() < self.p: + return self.augmentation(data) + else: + return data diff --git a/automatic-speech-recognition/hw_asr/augmentations/random_choice.py b/automatic-speech-recognition/hw_asr/augmentations/random_choice.py new file mode 100644 index 0000000000000000000000000000000000000000..043ff53d6c5cd5d6bc42378078dd4e992833773d --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/random_choice.py @@ -0,0 +1,17 @@ +from typing import List, Callable +from torch import Tensor +import random +from hw_asr.augmentations.base import AugmentationBase + + +class RandomChoice(AugmentationBase): + def __init__(self, augmentation_list: List[Callable], p: float): + self.augmentation_list = augmentation_list + self.p = p + + def __call__(self, data: Tensor) -> Tensor: + x = data + if random.random() < self.p: + augmentation = random.choice(self.augmentation_list) + x = augmentation(x) + return x \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/augmentations/sequential.py b/automatic-speech-recognition/hw_asr/augmentations/sequential.py new file mode 100644 index 0000000000000000000000000000000000000000..b967a3b5390a7ab6cdb1e4fd9653cee96148a5f6 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/sequential.py @@ -0,0 +1,16 @@ +from typing import List, Callable + +from torch 
import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class SequentialAugmentation(AugmentationBase): + def __init__(self, augmentation_list: List[Callable]): + self.augmentation_list = augmentation_list + + def __call__(self, data: Tensor) -> Tensor: + x = data + for augmentation in self.augmentation_list: + x = augmentation(x) + return x \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/augmentations/sequential_random_apply.py b/automatic-speech-recognition/hw_asr/augmentations/sequential_random_apply.py new file mode 100644 index 0000000000000000000000000000000000000000..2dafca7bc00d33e5afd51e7a22530f674583d6e6 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/sequential_random_apply.py @@ -0,0 +1,17 @@ +from typing import List, Callable +from torch import Tensor +import random +from hw_asr.augmentations.base import AugmentationBase + + +class SequentialRandomApply(AugmentationBase): + def __init__(self, augmentation_list: List[Callable], p: float = 0.5): + self.augmentation_list = augmentation_list + self.p = p + + def __call__(self, data: Tensor) -> Tensor: + x = data + for augmentation in self.augmentation_list: + if random.random() < self.p: + x = augmentation(x) + return x \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/FrequencyMasking.py b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/FrequencyMasking.py new file mode 100644 index 0000000000000000000000000000000000000000..6a7f963e1fce42d769f0848af52782fdd7182cee --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/FrequencyMasking.py @@ -0,0 +1,11 @@ +from torch import Tensor +from hw_asr.augmentations.base import AugmentationBase +from torchaudio import transforms + + +class FrequencyMasking(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = transforms.FrequencyMasking(*args, **kwargs) + + def __call__(self, spectogram: Tensor): + return self._aug(spectogram).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/TimeMasking.py b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/TimeMasking.py new file mode 100644 index 0000000000000000000000000000000000000000..fc39a795698619269be7414221b630e8cd1d1f47 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/TimeMasking.py @@ -0,0 +1,11 @@ +from torch import Tensor +from hw_asr.augmentations.base import AugmentationBase +from torchaudio import transforms + + +class TimeMasking(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = transforms.TimeMasking(*args, **kwargs) + + def __call__(self, spectogram: Tensor): + return self._aug(spectogram).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__init__.py b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5f80941b9437c9cc495eb496f2c5a6580aa82e2e --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__init__.py @@ -0,0 +1,7 @@ +from hw_asr.augmentations.spectrogram_augmentations.TimeMasking import TimeMasking +from hw_asr.augmentations.spectrogram_augmentations.FrequencyMasking import FrequencyMasking + +__all__ = [ + "TimeMasking", + "FrequencyMasking" +] diff --git 
a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/FrequencyMasking.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/FrequencyMasking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0cc3dddf41cd6c59b2dbe69861ae6a4a5f5a9b95 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/FrequencyMasking.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/TimeMasking.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/TimeMasking.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76ef77a3f88d71c472fac33248e17ee455e75853 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/TimeMasking.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d378ded94e862a2ed63a5e6e878210ecbcb61220 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..440c4cd03868c2210656a5ca815d6f0508adf612 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/AddColoredNoise.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/AddColoredNoise.py new file mode 100644 index 0000000000000000000000000000000000000000..ff654a501f5d4cf5d66b3a8cde5c6698ab41741c --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/AddColoredNoise.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class AddColoredNoise(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.AddColoredNoise(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Gain.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Gain.py new file mode 100644 index 0000000000000000000000000000000000000000..88f3f6dbb3a817e0b404904419c4a6d8a3b36b26 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Gain.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class Gain(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.Gain(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff 
--git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/HighPassFilter.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/HighPassFilter.py new file mode 100644 index 0000000000000000000000000000000000000000..f5dde3a26748384cd5f441aada4276ab96ca8828 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/HighPassFilter.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class HighPassFilter(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.HighPassFilter(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/LowPassFilter.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/LowPassFilter.py new file mode 100644 index 0000000000000000000000000000000000000000..7c5f58e514d3f675ab0c3532983c651dfa330de9 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/LowPassFilter.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class LowPassFilter(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.LowPassFilter(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Padding.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Padding.py new file mode 100644 index 0000000000000000000000000000000000000000..4008d20f71c33d143f4baec310c30017e26bfed8 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Padding.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class Padding(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.Padding(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PitchShift.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PitchShift.py new file mode 100644 index 0000000000000000000000000000000000000000..d3cb67e5de1a4294f8e13ddcc5cda6e09465222d --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PitchShift.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class PitchShift(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.PitchShift(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PolarityInversion.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PolarityInversion.py new file mode 100644 index 0000000000000000000000000000000000000000..4d2788346731e47b07286e45ad09f454dee68b2a --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PolarityInversion.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import 
Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class PolarityInversion(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.PolarityInversion(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Shift.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Shift.py new file mode 100644 index 0000000000000000000000000000000000000000..441b0187e569628a4136dbc8434d35c5028ddafb --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Shift.py @@ -0,0 +1,13 @@ +import torch_audiomentations +from torch import Tensor + +from hw_asr.augmentations.base import AugmentationBase + + +class Shift(AugmentationBase): + def __init__(self, *args, **kwargs): + self._aug = torch_audiomentations.Shift(*args, **kwargs) + + def __call__(self, data: Tensor): + x = data.unsqueeze(1) + return self._aug(x).squeeze(1) diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__init__.py b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..746022b0d0066dc3365e007ecabc11394cc96fea --- /dev/null +++ b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__init__.py @@ -0,0 +1,19 @@ +from hw_asr.augmentations.wave_augmentations.AddColoredNoise import AddColoredNoise +from hw_asr.augmentations.wave_augmentations.Gain import Gain +from hw_asr.augmentations.wave_augmentations.HighPassFilter import HighPassFilter +from hw_asr.augmentations.wave_augmentations.LowPassFilter import LowPassFilter +# from hw_asr.augmentations.wave_augmentations.Padding import Padding +from hw_asr.augmentations.wave_augmentations.PitchShift import PitchShift +from hw_asr.augmentations.wave_augmentations.PolarityInversion import PolarityInversion +from hw_asr.augmentations.wave_augmentations.Shift import Shift + +__all__ = [ + "AddColoredNoise", + "Gain", + "HighPassFilter" + "LowPassFilter", + # "Padding", + "PitchShift", + "PolarityInversion", + "Shift" +] diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5abf2fea3297af819e28219eee23c7ea2523172 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a7f27d9b25f976858f42a165d93159ed9b5e980 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eacf781c8875384d6705e28caf69dced4c1e3851 
Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f573057cde5c0e871f124da520b973ea342f9b Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cacf98fa7f2e359c5e8dd5ff0aeb5e0dea12ed4b Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..694cea18a30ed20a4c03eecab5dfe0c9fa017462 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6dc950d6ba96dcf65207c4a52db7d1fe6f2fa2be Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca73b85fcd8f67117072873212d58f2d38623c54 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PitchShift.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PitchShift.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c9e1a440d0212ea5727fc76bf5bfd43e1c3eed5 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PitchShift.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PitchShift.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PitchShift.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a78ba0726524138caba8d386fbcb9c289c420b21 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PitchShift.cpython-311.pyc 
differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PolarityInversion.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PolarityInversion.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..532ce7a9d1db241d58551cfd8a264cabeb37e2fc Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PolarityInversion.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PolarityInversion.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PolarityInversion.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..856c09224efc085a52c5b7fba5f03eec8fa55fd5 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/PolarityInversion.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Shift.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Shift.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b12f5a231a91589d106a9975a078d2d9e69f2e32 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Shift.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Shift.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Shift.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa26edeffe3a25565f267e982725d833111526b2 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Shift.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea0479ecf98a0349b79e573e6e5942c58098a94a Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26e317d069404da3e0227a6fbb6ef4fddd1c21f9 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__init__.py b/automatic-speech-recognition/hw_asr/base/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d9d437ad774e9499e8246af6f3c1c76418f08f2c --- /dev/null +++ b/automatic-speech-recognition/hw_asr/base/__init__.py @@ -0,0 +1,2 @@ +from .base_model import * +from .base_trainer import * diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..7cc6d0a6618b95ee8453115bf03a0978c6f0bbbe Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b83fc5c17c5ec9e038940080c7164961347c604 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_dataset.cpython-310.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf2fd749340c6ee9f6055ea6a61458a3323a90f0 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_dataset.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_dataset.cpython-311.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2590b3a1f5fa9229702508e986aada96270edbcb Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_dataset.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_metric.cpython-310.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_metric.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1efeabd3851524fa7ddc3adc7372612c1c49a0f Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_metric.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_metric.cpython-311.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_metric.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1faa58ee85ee1bc46f9474a12dee9316a3b88c83 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_metric.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_model.cpython-310.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc77c11ae48002f45618f3393daf264a75870582 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_model.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_model.cpython-311.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..511cc81c3eb6a3f4d90529db1e2ce9abf2c0780a Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_model.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_text_encoder.cpython-310.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_text_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f267f0f66a2ad5e509efdcefd42c8e473db368f Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_text_encoder.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_text_encoder.cpython-311.pyc 
b/automatic-speech-recognition/hw_asr/base/__pycache__/base_text_encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..388a00a9a3a4405d8c9e3015f57781d23815d0a9 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_text_encoder.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_trainer.cpython-310.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_trainer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..abd1abd943d4cb76cd9e71048a191a815a8fe8c5 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_trainer.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/__pycache__/base_trainer.cpython-311.pyc b/automatic-speech-recognition/hw_asr/base/__pycache__/base_trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f51c655392f4209de6c6ac4b8687672b104902eb Binary files /dev/null and b/automatic-speech-recognition/hw_asr/base/__pycache__/base_trainer.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/base/base_dataset.py b/automatic-speech-recognition/hw_asr/base/base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2413c0139af9de3afe495d816c10d5fd5b6f6842 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/base/base_dataset.py @@ -0,0 +1,145 @@ +import logging +import random +from typing import List + +import numpy as np +import torch +import torchaudio +from torch import Tensor +from torch.utils.data import Dataset + +from hw_asr.base.base_text_encoder import BaseTextEncoder +from hw_asr.utils.parse_config import ConfigParser + +logger = logging.getLogger(__name__) + + +class BaseDataset(Dataset): + def __init__( + self, + index, + text_encoder: BaseTextEncoder, + config_parser: ConfigParser, + wave_augs=None, + spec_augs=None, + limit=None, + max_audio_length=None, + max_text_length=None, + ): + self.text_encoder = text_encoder + self.config_parser = config_parser + self.wave_augs = wave_augs + self.spec_augs = spec_augs + self.log_spec = config_parser["preprocessing"]["log_spec"] + + self._assert_index_is_valid(index) + index = self._filter_records_from_dataset(index, max_audio_length, max_text_length, limit) + # it's a good idea to sort index by audio length + # It would be easier to write length-based batch samplers later + index = self._sort_index(index) + self._index: List[dict] = index + + def __getitem__(self, ind): + data_dict = self._index[ind] + audio_path = data_dict["path"] + audio_wave = self.load_audio(audio_path) + audio_wave, audio_spec = self.process_wave(audio_wave) + return { + "audio": audio_wave, + "spectrogram": audio_spec, + "duration": audio_wave.size(1) / self.config_parser["preprocessing"]["sr"], + "text": data_dict["text"], + "text_encoded": self.text_encoder.encode(data_dict["text"]), + "audio_path": audio_path, + } + + @staticmethod + def _sort_index(index): + return sorted(index, key=lambda x: x["audio_len"]) + + def __len__(self): + return len(self._index) + + def load_audio(self, path): + audio_tensor, sr = torchaudio.load(path) + audio_tensor = audio_tensor[0:1, :] # remove all channels but the first + target_sr = self.config_parser["preprocessing"]["sr"] + if sr != target_sr: + audio_tensor = torchaudio.functional.resample(audio_tensor, sr, target_sr) + return audio_tensor + + def process_wave(self, audio_tensor_wave: Tensor): + with torch.no_grad(): + 
if self.wave_augs is not None: + audio_tensor_wave = self.wave_augs(audio_tensor_wave) + wave2spec = self.config_parser.init_obj( + self.config_parser["preprocessing"]["spectrogram"], + torchaudio.transforms, + ) + audio_tensor_spec = wave2spec(audio_tensor_wave) + if self.spec_augs is not None: + audio_tensor_spec = self.spec_augs(audio_tensor_spec) + if self.log_spec: + audio_tensor_spec = torch.log(audio_tensor_spec + 1e-5) + return audio_tensor_wave, audio_tensor_spec + + @staticmethod + def _filter_records_from_dataset( + index: list, max_audio_length, max_text_length, limit + ) -> list: + initial_size = len(index) + if max_audio_length is not None: + exceeds_audio_length = np.array([el["audio_len"] for el in index]) >= max_audio_length + _total = exceeds_audio_length.sum() + logger.info( + f"{_total} ({_total / initial_size:.1%}) records are longer then " + f"{max_audio_length} seconds. Excluding them." + ) + else: + exceeds_audio_length = False + + initial_size = len(index) + if max_text_length is not None: + exceeds_text_length = ( + np.array( + [len(BaseTextEncoder.normalize_text(el["text"])) for el in index] + ) + >= max_text_length + ) + _total = exceeds_text_length.sum() + logger.info( + f"{_total} ({_total / initial_size:.1%}) records are longer then " + f"{max_text_length} characters. Excluding them." + ) + else: + exceeds_text_length = False + + records_to_filter = exceeds_text_length | exceeds_audio_length + + if records_to_filter is not False and records_to_filter.any(): + _total = records_to_filter.sum() + index = [el for el, exclude in zip(index, records_to_filter) if not exclude] + logger.info( + f"Filtered {_total}({_total / initial_size:.1%}) records from dataset" + ) + + if limit is not None: + random.seed(42) # best seed for deep learning + random.shuffle(index) + index = index[:limit] + return index + + @staticmethod + def _assert_index_is_valid(index): + for entry in index: + assert "audio_len" in entry, ( + "Each dataset item should include field 'audio_len'" + " - duration of audio (in seconds)." + ) + assert "path" in entry, ( + "Each dataset item should include field 'path'" " - path to audio file." + ) + assert "text" in entry, ( + "Each dataset item should include field 'text'" + " - text transcription of the audio." + ) diff --git a/automatic-speech-recognition/hw_asr/base/base_metric.py b/automatic-speech-recognition/hw_asr/base/base_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..8db32942c9c8755e9405dab54f79d8f99763e518 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/base/base_metric.py @@ -0,0 +1,6 @@ +class BaseMetric: + def __init__(self, name=None, *args, **kwargs): + self.name = name if name is not None else type(self).__name__ + + def __call__(self, **batch): + raise NotImplementedError() diff --git a/automatic-speech-recognition/hw_asr/base/base_model.py b/automatic-speech-recognition/hw_asr/base/base_model.py new file mode 100644 index 0000000000000000000000000000000000000000..cfde50c8ba874baca4e66c9eb7b9b37f10f59abe --- /dev/null +++ b/automatic-speech-recognition/hw_asr/base/base_model.py @@ -0,0 +1,41 @@ +from abc import abstractmethod +from typing import Union + +import numpy as np +import torch.nn as nn +from torch import Tensor + + +class BaseModel(nn.Module): + """ + Base class for all models + """ + + def __init__(self, n_feats, n_class, **batch): + super().__init__() + + @abstractmethod + def forward(self, **batch) -> Union[Tensor, dict]: + """ + Forward pass logic. 
+ Can return a torch.Tensor (it will be interpreted as logits) or a dict. + + :return: Model output + """ + raise NotImplementedError() + + def __str__(self): + """ + Model prints with number of trainable parameters + """ + model_parameters = filter(lambda p: p.requires_grad, self.parameters()) + params = sum([np.prod(p.size()) for p in model_parameters]) + return super().__str__() + "\nTrainable parameters: {}".format(params) + + def transform_input_lengths(self, input_lengths): + """ + Input length transformation function. + For example: if your NN transforms spectrogram of time-length `N` into an + output with time-length `N / 2`, then this function should return `input_lengths // 2` + """ + raise NotImplementedError() diff --git a/automatic-speech-recognition/hw_asr/base/base_text_encoder.py b/automatic-speech-recognition/hw_asr/base/base_text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..412f8cdac2d37089710712727deda33d3ed04f69 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/base/base_text_encoder.py @@ -0,0 +1,25 @@ +import re +from typing import List, Union + +import numpy as np +from torch import Tensor + + +class BaseTextEncoder: + def encode(self, text) -> Tensor: + raise NotImplementedError() + + def decode(self, vector: Union[Tensor, np.ndarray, List[int]]): + raise NotImplementedError() + + def __len__(self): + raise NotImplementedError() + + def __getitem__(self, item: int) -> str: + raise NotImplementedError() + + @staticmethod + def normalize_text(text: str): + text = text.lower() + text = re.sub(r"[^a-z ]", "", text) + return text diff --git a/automatic-speech-recognition/hw_asr/base/base_trainer.py b/automatic-speech-recognition/hw_asr/base/base_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..dc4cf2d8a727a56c139c50b6e24d227242a16716 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/base/base_trainer.py @@ -0,0 +1,205 @@ +from abc import abstractmethod + +import torch +from numpy import inf + +from hw_asr.base import BaseModel +from hw_asr.logger import get_visualizer + + +class BaseTrainer: + """ + Base class for all trainers + """ + + def __init__(self, model: BaseModel, criterion, metrics, optimizer, config, device): + self.device = device + self.config = config + self.logger = config.get_logger("trainer", config["trainer"]["verbosity"]) + + self.model = model + self.criterion = criterion + self.metrics = metrics + self.optimizer = optimizer + + # for interrupt saving + self._last_epoch = 0 + + cfg_trainer = config["trainer"] + self.epochs = cfg_trainer["epochs"] + self.save_period = cfg_trainer["save_period"] + self.monitor = cfg_trainer.get("monitor", "off") + + # configuration to monitor model performance and save best + if self.monitor == "off": + self.mnt_mode = "off" + self.mnt_best = 0 + else: + self.mnt_mode, self.mnt_metric = self.monitor.split() + assert self.mnt_mode in ["min", "max"] + + self.mnt_best = inf if self.mnt_mode == "min" else -inf + self.early_stop = cfg_trainer.get("early_stop", inf) + if self.early_stop <= 0: + self.early_stop = inf + + self.start_epoch = 1 + + self.checkpoint_dir = config.save_dir + + # setup visualization writer instance + self.writer = get_visualizer( + config, self.logger, cfg_trainer["visualize"] + ) + + if config.resume is not None: + self._load_model(config.resume) + + @abstractmethod + def _train_epoch(self, epoch): + """ + Training logic for an epoch + + :param epoch: Current epoch number + """ + raise NotImplementedError() + + def 
train(self): + try: + self._train_process() + except KeyboardInterrupt as e: + self.logger.info("Saving model on keyboard interrupt") + self._save_checkpoint(self._last_epoch, save_best=False) + raise e + + def _train_process(self): + """ + Full training logic + """ + not_improved_count = 0 + for epoch in range(self.start_epoch, self.epochs + 1): + self._last_epoch = epoch + result = self._train_epoch(epoch) + + # save logged informations into log dict + log = {"epoch": epoch} + log.update(result) + + # print logged informations to the screen + for key, value in log.items(): + self.logger.info(" {:15s}: {}".format(str(key), value)) + + # evaluate model performance according to configured metric, + # save best checkpoint as model_best + best = False + if self.mnt_mode != "off": + try: + # check whether model performance improved or not, + # according to specified metric(mnt_metric) + if self.mnt_mode == "min": + improved = log[self.mnt_metric] <= self.mnt_best + elif self.mnt_mode == "max": + improved = log[self.mnt_metric] >= self.mnt_best + else: + improved = False + except KeyError: + self.logger.warning( + "Warning: Metric '{}' is not found. " + "Model performance monitoring is disabled.".format( + self.mnt_metric + ) + ) + self.mnt_mode = "off" + improved = False + + if improved: + self.mnt_best = log[self.mnt_metric] + not_improved_count = 0 + best = True + else: + not_improved_count += 1 + + if not_improved_count > self.early_stop: + self.logger.info( + "Validation performance didn't improve for {} epochs. " + "Training stops.".format(self.early_stop) + ) + break + + if epoch % self.save_period == 0 or best: + self._save_checkpoint(epoch, save_best=best, only_best=True) + + def _save_checkpoint(self, epoch, save_best=False, only_best=False): + """ + Saving checkpoints + + :param epoch: current epoch number + :param save_best: if True, rename the saved checkpoint to 'model_best.pth' + """ + arch = type(self.model).__name__ + state = { + "arch": arch, + "epoch": epoch, + "state_dict": self.model.state_dict(), + "optimizer": self.optimizer.state_dict(), + "monitor_best": self.mnt_best, + "config": self.config, + } + filename = str(self.checkpoint_dir / "checkpoint-epoch{}.pth".format(epoch)) + if not (only_best and save_best): + torch.save(state, filename) + self.logger.info("Saving checkpoint: {} ...".format(filename)) + if save_best: + best_path = str(self.checkpoint_dir / "model_best.pth") + torch.save(state, best_path) + self.logger.info("Saving current best: model_best.pth ...") + + def _load_model(self, resume_path): + """ + Resume from saved checkpoints + + :param resume_path: Checkpoint path to be resumed + """ + resume_path = str(resume_path) + self.logger.info("Loading model: {} ...".format(resume_path)) + checkpoint = torch.load(resume_path, self.device) + self.model.load_state_dict(checkpoint["state_dict"]) + + self.logger.info("Model loaded") + + + def _resume_checkpoint(self, resume_path): + """ + Resume from saved checkpoints + + :param resume_path: Checkpoint path to be resumed + """ + resume_path = str(resume_path) + self.logger.info("Loading checkpoint: {} ...".format(resume_path)) + checkpoint = torch.load(resume_path, self.device) + self.start_epoch = checkpoint["epoch"] + 1 + self.mnt_best = checkpoint["monitor_best"] + + # load architecture params from checkpoint. + if checkpoint["config"]["arch"] != self.config["arch"]: + self.logger.warning( + "Warning: Architecture configuration given in config file is different from that " + "of checkpoint. 
This may yield an exception while state_dict is being loaded." + ) + self.model.load_state_dict(checkpoint["state_dict"]) + + # load optimizer state from checkpoint only when optimizer type is not changed. + if ( + checkpoint["config"]["optimizer"] != self.config["optimizer"] or + checkpoint["config"]["lr_scheduler"] != self.config["lr_scheduler"] + ): + self.logger.warning( + "Warning: Optimizer or lr_scheduler given in config file is different " + "from that of checkpoint. Optimizer parameters not being resumed." + ) + else: + self.optimizer.load_state_dict(checkpoint["optimizer"]) + + self.logger.info( + "Checkpoint loaded. Resume training from epoch {}".format(self.start_epoch) + ) diff --git a/automatic-speech-recognition/hw_asr/batch_sampler/__init__.py b/automatic-speech-recognition/hw_asr/batch_sampler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3d69f87c4705d5f47a59728374b1c745ed1203ba --- /dev/null +++ b/automatic-speech-recognition/hw_asr/batch_sampler/__init__.py @@ -0,0 +1,5 @@ +from hw_asr.batch_sampler.group_sort_batch_sampler import GroupLengthBatchSampler + +__all__ = [ + "GroupLengthBatchSampler" +] diff --git a/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5489de66564ec3f47489b6dc03aec09f670aa3f4 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..866c6c3e573a34c53169ee657ad41bda2c3ff9b9 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/group_sort_batch_sampler.cpython-310.pyc b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/group_sort_batch_sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df56ba062aa68a32c4d83a95073575f66ba73eba Binary files /dev/null and b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/group_sort_batch_sampler.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/group_sort_batch_sampler.cpython-311.pyc b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/group_sort_batch_sampler.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18f791f450914ce571802475392d8eb608f56689 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/batch_sampler/__pycache__/group_sort_batch_sampler.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/batch_sampler/group_sort_batch_sampler.py b/automatic-speech-recognition/hw_asr/batch_sampler/group_sort_batch_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..3b230af74e81f4c88638ef8e5b1096d517dbab36 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/batch_sampler/group_sort_batch_sampler.py @@ -0,0 +1,14 @@ +from torch.utils.data import Sampler + + +class GroupLengthBatchSampler(Sampler): + def __init__(self, data_source, batch_size, batches_per_group=20): + super().__init__(data_source) + # TODO: your code 
here (optional) + raise NotImplementedError() + + def __iter__(self): + raise NotImplementedError() + + def __len__(self): + raise NotImplementedError() diff --git a/automatic-speech-recognition/hw_asr/collate_fn/__init__.py b/automatic-speech-recognition/hw_asr/collate_fn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a5a841043471e1d71b90aae86ecb4c570b5ed5b Binary files /dev/null and b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..efcda68a7faae0a26c8a25e705f5e4722ed0380c Binary files /dev/null and b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/collate.cpython-310.pyc b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/collate.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6b8d6c932af7bbec80e14f58687890a36d0e420 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/collate.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/collate.cpython-311.pyc b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/collate.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49ac11b9328aef3caf8a8de77cd151140234a7f7 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/collate_fn/__pycache__/collate.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/collate_fn/collate.py b/automatic-speech-recognition/hw_asr/collate_fn/collate.py new file mode 100644 index 0000000000000000000000000000000000000000..17a56b545e34aabfe02b07e8e5efc5dcd4664422 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/collate_fn/collate.py @@ -0,0 +1,46 @@ +import torch +import logging +from typing import List + +logger = logging.getLogger(__name__) + + +def collate_fn(dataset_items: List[dict]): + """ + Collate and pad fields in dataset items + """ + # TODO: your code here + feature_length_dim = dataset_items[0]["spectrogram"].shape[1] + time_dim = max(dataset_items, key=lambda item: item["spectrogram"].shape[2])["spectrogram"].shape[2] + spectrogram = torch.zeros((len(dataset_items), feature_length_dim, time_dim)) + spectrogram_length = [] + + text_length_dim = max(dataset_items, key=lambda item: item["text_encoded"].shape[1])["text_encoded"].shape[1] + text_encoded = torch.zeros((len(dataset_items), text_length_dim)) + text_encoded_length = [] + text = [] + + audio_path = [] + audio = [] + for i, item in enumerate(dataset_items): + cur_time_dim = item["spectrogram"].shape[2] + spectrogram[i] = torch.cat([item["spectrogram"][0], torch.zeros((feature_length_dim, time_dim - cur_time_dim))], axis=1) + spectrogram_length.append(cur_time_dim) + + cur_text_length_dim = item["text_encoded"].shape[1] + text_encoded[i] = torch.cat([item["text_encoded"][0], torch.zeros(text_length_dim - cur_text_length_dim)]) + 
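# keep the true (unpadded) target length so the CTC loss can ignore the padding + 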
text_encoded_length.append(cur_text_length_dim) + text.append(item["text"]) + + audio_path.append(item["audio_path"]) + audio.append(item["audio"]) + + return { + "spectrogram": spectrogram, + "spectrogram_length": torch.Tensor(spectrogram_length).to(torch.int32), + "text_encoded": text_encoded, + "text_encoded_length": torch.Tensor(text_encoded_length).to(torch.int32), + "text": text, + "audio_path": audio_path, + "audio": audio, + } diff --git a/automatic-speech-recognition/hw_asr/configs/config.json b/automatic-speech-recognition/hw_asr/configs/config.json new file mode 100644 index 0000000000000000000000000000000000000000..47416448a705ad259521dba01323a9fd64691355 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/configs/config.json @@ -0,0 +1,184 @@ +{ + "name": "default_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0.6, + "wave": [ + {"type": "AddColoredNoise", "args": {"p": 1, "sample_rate": 16000}}, + {"type": "Gain", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "HighPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "LowPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "PitchShift", "args": {"p": 0.8, "min_transpose_semitones": -2, "max_transpose_semitones": 2, "sample_rate": 16000}}, + {"type": "PolarityInversion", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "Shift", "args": {"p": 0.8, "sample_rate": 16000}} + ], + "spectrogram": [] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 5, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "train": { + "batch_size": 128, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-100", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-360", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-other-500", + "max_audio_length": 40.0, + "max_text_length": 400 + } + } + ] + }, + "val": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean" + } + } + ] + }, + "test-other": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + }, + "test-clean": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-clean" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 5e-4, + "weight_decay": 1e-3 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 4, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 4, + "name": "CER (beam search)" + } + }, + { + "type": "LanguageModelWERMetric", + "args": { + "name": "WER (LM)" + } + }, + { + "type": 
"LanguageModelCERMetric", + "args": { + "name": "CER (LM)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 5e-4, + "pct_start": 0.1 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/configs/config1.json b/automatic-speech-recognition/hw_asr/configs/config1.json new file mode 100644 index 0000000000000000000000000000000000000000..47416448a705ad259521dba01323a9fd64691355 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/configs/config1.json @@ -0,0 +1,184 @@ +{ + "name": "default_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0.6, + "wave": [ + {"type": "AddColoredNoise", "args": {"p": 1, "sample_rate": 16000}}, + {"type": "Gain", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "HighPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "LowPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "PitchShift", "args": {"p": 0.8, "min_transpose_semitones": -2, "max_transpose_semitones": 2, "sample_rate": 16000}}, + {"type": "PolarityInversion", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "Shift", "args": {"p": 0.8, "sample_rate": 16000}} + ], + "spectrogram": [] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 5, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "train": { + "batch_size": 128, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-100", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-360", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-other-500", + "max_audio_length": 40.0, + "max_text_length": 400 + } + } + ] + }, + "val": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean" + } + } + ] + }, + "test-other": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + }, + "test-clean": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-clean" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 5e-4, + "weight_decay": 1e-3 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 4, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 4, + "name": "CER (beam search)" + } + }, + { + "type": "LanguageModelWERMetric", + 
"args": { + "name": "WER (LM)" + } + }, + { + "type": "LanguageModelCERMetric", + "args": { + "name": "CER (LM)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 5e-4, + "pct_start": 0.1 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/configs/config2.json b/automatic-speech-recognition/hw_asr/configs/config2.json new file mode 100644 index 0000000000000000000000000000000000000000..166d5dfe9997d27fe055bff049f8621267f5b7dd --- /dev/null +++ b/automatic-speech-recognition/hw_asr/configs/config2.json @@ -0,0 +1,189 @@ +{ + "name": "default_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0.6, + "wave": [ + {"type": "AddColoredNoise", "args": {"p": 1, "sample_rate": 16000}}, + {"type": "Gain", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "HighPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "LowPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "PitchShift", "args": {"p": 0.8, "min_transpose_semitones": -2, "max_transpose_semitones": 2, "sample_rate": 16000}}, + {"type": "PolarityInversion", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "Shift", "args": {"p": 0.8, "sample_rate": 16000}} + ], + "spectrogram": [ + {"type": "TimeMasking", "args": {"time_mask_param": 80, "p": 0.05}}, + {"type": "TimeMasking", "args": {"time_mask_param": 80, "p": 0.05}}, + {"type": "TimeMasking", "args": {"time_mask_param": 80, "p": 0.05}}, + {"type": "FrequencyMasking", "args": {"freq_mask_param": 80}} + ] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 6, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "train": { + "batch_size": 128, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-100", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-360", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-other-500", + "max_audio_length": 40.0, + "max_text_length": 400 + } + } + ] + }, + "val": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean" + } + } + ] + }, + "test-other": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + }, + "test-clean": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-clean" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 3e-4, + "weight_decay": 1e-5 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + 
} + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 4, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 4, + "name": "CER (beam search)" + } + }, + { + "type": "LanguageModelWERMetric", + "args": { + "name": "WER (LM)" + } + }, + { + "type": "LanguageModelCERMetric", + "args": { + "name": "CER (LM)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 3e-4, + "pct_start": 0.1 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/configs/config_clean.json b/automatic-speech-recognition/hw_asr/configs/config_clean.json new file mode 100644 index 0000000000000000000000000000000000000000..47416448a705ad259521dba01323a9fd64691355 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/configs/config_clean.json @@ -0,0 +1,184 @@ +{ + "name": "default_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0.6, + "wave": [ + {"type": "AddColoredNoise", "args": {"p": 1, "sample_rate": 16000}}, + {"type": "Gain", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "HighPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "LowPassFilter", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "PitchShift", "args": {"p": 0.8, "min_transpose_semitones": -2, "max_transpose_semitones": 2, "sample_rate": 16000}}, + {"type": "PolarityInversion", "args": {"p": 0.8, "sample_rate": 16000}}, + {"type": "Shift", "args": {"p": 0.8, "sample_rate": 16000}} + ], + "spectrogram": [] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 5, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "train": { + "batch_size": 128, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-100", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-clean-360", + "max_audio_length": 40.0, + "max_text_length": 400 + } + }, + { + "type": "LibrispeechDataset", + "args": { + "part": "train-other-500", + "max_audio_length": 40.0, + "max_text_length": 400 + } + } + ] + }, + "val": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean" + } + } + ] + }, + "test-other": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + }, + "test-clean": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-clean" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 5e-4, + "weight_decay": 1e-3 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + 
"type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 4, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 4, + "name": "CER (beam search)" + } + }, + { + "type": "LanguageModelWERMetric", + "args": { + "name": "WER (LM)" + } + }, + { + "type": "LanguageModelCERMetric", + "args": { + "name": "CER (LM)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 5e-4, + "pct_start": 0.1 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/configs/finetune.json b/automatic-speech-recognition/hw_asr/configs/finetune.json new file mode 100644 index 0000000000000000000000000000000000000000..e95bad1543fc464fc8fa89eb62ebc63cdd3d519a --- /dev/null +++ b/automatic-speech-recognition/hw_asr/configs/finetune.json @@ -0,0 +1,122 @@ +{ + "name": "default_config", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 256 + } + }, + "log_spec": true + }, + "augmentations": { + "random_apply_p": 0, + "wave": [], + "spectrogram": [] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 256, + "n_rnn_layers": 6, + "rnn_hidden_size": 512, + "rnn_dropout": 0.2 + } + }, + "data": { + "train": { + "batch_size": 128, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "train-other-500", + "max_audio_length": 40.0, + "max_text_length": 400 + } + } + ] + }, + "val": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean" + } + } + ] + }, + "test-other": { + "batch_size": 64, + "num_workers": 4, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "test-other" + } + } + ] + } + }, + "optimizer": { + "type": "AdamW", + "args": { + "lr": 6e-5, + "weight_decay": 1e-5 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 1000, + "epochs": 10, + "anneal_strategy": "cos", + "max_lr": 6e-5, + "pct_start": 0.2 + } + }, + "trainer": { + "epochs": 10, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 1000, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/configs/one_batch_test_baseline.json b/automatic-speech-recognition/hw_asr/configs/one_batch_test_baseline.json new file mode 100644 index 0000000000000000000000000000000000000000..c1d5502d2f43360646306f8faf9e2271f171ba97 --- /dev/null +++ 
b/automatic-speech-recognition/hw_asr/configs/one_batch_test_baseline.json @@ -0,0 +1,102 @@ +{ + "name": "one_batch_test", + "n_gpu": 1, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + } + }, + "log_spec": true + }, + "augmentations": { + "wave": [], + "spectrogram": [] + }, + "arch": { + "type": "BaselineModel", + "args": { + "n_feats": 128, + "fc_hidden": 512 + } + }, + "data": { + "train": { + "batch_size": 10, + "num_workers": 0, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean", + "max_audio_length": 20.0, + "max_text_length": 200, + "limit": 10 + } + } + ] + }, + "val": { + "batch_size": 10, + "num_workers": 0, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean", + "max_audio_length": 20.0, + "max_text_length": 200, + "limit": 10 + } + } + ] + } + }, + "optimizer": { + "type": "SGD", + "args": { + "lr": 1e-2 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 100, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 1e-2, + "pct_start": 0.2 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project_check", + "len_epoch": 100, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/configs/one_batch_test_deepspeech2.json b/automatic-speech-recognition/hw_asr/configs/one_batch_test_deepspeech2.json new file mode 100644 index 0000000000000000000000000000000000000000..e9757e654a9b23286fc3f301859e18e73431b629 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/configs/one_batch_test_deepspeech2.json @@ -0,0 +1,133 @@ +{ + "name": "one_batch_test", + "n_gpu": 1, + "text_encoder": { + "type": "CTCCharTextEncoder", + "args": { + "kenlm_model_path": "hw_asr/text_encoder/3-gram.arpa", + "unigrams_path": "hw_asr/text_encoder/librispeech-vocab.txt" + } + }, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + "n_mels": 512 + } + }, + "log_spec": true + }, + "augmentations": { + "wave": [ + {"type": "AddColoredNoise", "args": {"p": 0.3, "sample_rate": 16000}}, + {"type": "Gain", "args": {"p": 0.4, "sample_rate": 16000}}, + {"type": "HighPassFilter", "args": {"p": 0.3, "sample_rate": 16000}}, + {"type": "LowPassFilter", "args": {"p": 0.3, "sample_rate": 16000}}, + {"type": "PitchShift", "args": {"p": 0.3, "sample_rate": 16000}}, + {"type": "PolarityInversion", "args": {"p": 0, "sample_rate": 16000}}, + {"type": "Shift", "args": {"p": 0.2, "sample_rate": 16000}} + ], + "spectrogram": [] + }, + "arch": { + "type": "DeepSpeech2Model", + "args": { + "n_feats": 512, + "n_rnn_layers": 1, + "rnn_hidden_size": 256 + } + }, + "data": { + "train": { + "batch_size": 10, + "num_workers": 0, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean", + "max_audio_length": 20.0, + "max_text_length": 200, + "limit": 10 + } + } + ] + }, + "val": { + "batch_size": 10, + "num_workers": 0, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean", + "max_audio_length": 20.0, + "max_text_length": 200, + "limit": 10 + } + } + ] + } + }, + "optimizer": { + "type": "SGD", 
+ "args": { + "lr": 1e-2 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + }, + { + "type": "BeamSearchWERMetric", + "args": { + "beam_size": 2, + "name": "WER (beam search)" + } + }, + { + "type": "BeamSearchCERMetric", + "args": { + "beam_size": 2, + "name": "CER (beam search)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 100, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 1e-2, + "pct_start": 0.2 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project_check", + "len_epoch": 100, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/datasets/__init__.py b/automatic-speech-recognition/hw_asr/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5ca1615b8c16bc4eaca881ed7e78ca0ed356fc6d --- /dev/null +++ b/automatic-speech-recognition/hw_asr/datasets/__init__.py @@ -0,0 +1,13 @@ +from hw_asr.datasets.custom_audio_dataset import CustomAudioDataset +from hw_asr.datasets.custom_dir_audio_dataset import CustomDirAudioDataset +from hw_asr.datasets.librispeech_dataset import LibrispeechDataset +from hw_asr.datasets.ljspeech_dataset import LJspeechDataset +from hw_asr.datasets.common_voice import CommonVoiceDataset + +__all__ = [ + "LibrispeechDataset", + "CustomDirAudioDataset", + "CustomAudioDataset", + "LJspeechDataset", + "CommonVoiceDataset" +] diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13a3597ba6f1585bda059a8e6fdd6a3eca8987e6 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51bb1449d7ef07d06dc61c7bd70baa27f6a9b327 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/common_voice.cpython-310.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/common_voice.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f6cecbad86b3231fb332d08e1d424f79577eded Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/common_voice.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/common_voice.cpython-311.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/common_voice.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..664916dd2c7b9b06d740913d59b21dde0b8b5b7f Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/common_voice.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_audio_dataset.cpython-310.pyc 
b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_audio_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba296c016e5b91a2715eb57aaf1022095ad9be38 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_audio_dataset.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_audio_dataset.cpython-311.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_audio_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90fa083bca1abd0e782e41a5608a00b5b630139a Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_audio_dataset.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_dir_audio_dataset.cpython-310.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_dir_audio_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe184ea7cbbb222cceec074c92b428dad3185903 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_dir_audio_dataset.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_dir_audio_dataset.cpython-311.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_dir_audio_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1919b37b6eea6f68978d885abe497317d1680d52 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/custom_dir_audio_dataset.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/librispeech_dataset.cpython-310.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/librispeech_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc6fcea417299a09694842271701a5e48d31fbd7 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/librispeech_dataset.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/librispeech_dataset.cpython-311.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/librispeech_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4edba4fb8ea5477a419c52ecee78181015cf8c0 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/librispeech_dataset.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/ljspeech_dataset.cpython-310.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/ljspeech_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..420383191298c1f31fe7f40264420c3f3ebf8436 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/ljspeech_dataset.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/__pycache__/ljspeech_dataset.cpython-311.pyc b/automatic-speech-recognition/hw_asr/datasets/__pycache__/ljspeech_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27895d59d639b4dccf50b9f61ff595592d9b3bd5 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/datasets/__pycache__/ljspeech_dataset.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/datasets/common_voice.py 
b/automatic-speech-recognition/hw_asr/datasets/common_voice.py new file mode 100644 index 0000000000000000000000000000000000000000..da5bb92231cfc0a72ff14287bae742b9a648b4df --- /dev/null +++ b/automatic-speech-recognition/hw_asr/datasets/common_voice.py @@ -0,0 +1,45 @@ +import logging +from pathlib import Path +import json + +import torchaudio +from datasets import load_dataset +import re +from tqdm import tqdm + +from hw_asr.base.base_dataset import BaseDataset +from hw_asr.utils import ROOT_PATH + +logger = logging.getLogger(__name__) + + +class CommonVoiceDataset(BaseDataset): + def __init__(self, split, *args, **kwargs): + self._data_dir = ROOT_PATH / "dataset_common_voice" + self._regex = re.compile("[^a-z ]") + self._dataset = load_dataset("common_voice", "en", cache_dir=self._data_dir, split=split) + index = self._get_or_load_index(split) + super().__init__(index, *args, **kwargs) + + def _get_or_load_index(self, split): + index_path = self._data_dir / f"{split}_index.json" + if index_path.exists(): + with index_path.open() as f: + index = json.load(f) + else: + index = [] + for entry in tqdm(self._dataset): + assert "path" in entry + assert Path(entry["path"]).exists(), f"Path {entry['path']} doesn't exist" + entry["path"] = str(Path(entry["path"]).absolute().resolve()) + entry["text"] = self._regex.sub("", entry.get("sentence", "").lower()) + t_info = torchaudio.info(entry["path"]) + entry["audio_len"] = t_info.num_frames / t_info.sample_rate + index.append({ + "path": entry["path"], + "text": entry["text"], + "audio_len": entry["audio_len"], + }) + with index_path.open("w") as f: + json.dump(index, f, indent=2) + return index diff --git a/automatic-speech-recognition/hw_asr/datasets/custom_audio_dataset.py b/automatic-speech-recognition/hw_asr/datasets/custom_audio_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..80b582c87eb74f0dd762a7f55d18c7aacd692d60 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/datasets/custom_audio_dataset.py @@ -0,0 +1,22 @@ +import logging +from pathlib import Path + +import torchaudio + +from hw_asr.base.base_dataset import BaseDataset + +logger = logging.getLogger(__name__) + + +class CustomAudioDataset(BaseDataset): + def __init__(self, data, *args, **kwargs): + index = data + for entry in data: + assert "path" in entry + assert Path(entry["path"]).exists(), f"Path {entry['path']} doesn't exist" + entry["path"] = str(Path(entry["path"]).absolute().resolve()) + entry["text"] = entry.get("text", "") + t_info = torchaudio.info(entry["path"]) + entry["audio_len"] = t_info.num_frames / t_info.sample_rate + + super().__init__(index, *args, **kwargs) diff --git a/automatic-speech-recognition/hw_asr/datasets/custom_dir_audio_dataset.py b/automatic-speech-recognition/hw_asr/datasets/custom_dir_audio_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..084efa07289c1c6e574d92e331c59e6e6aee02d0 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/datasets/custom_dir_audio_dataset.py @@ -0,0 +1,23 @@ +import logging +from pathlib import Path + +from hw_asr.datasets.custom_audio_dataset import CustomAudioDataset + +logger = logging.getLogger(__name__) + + +class CustomDirAudioDataset(CustomAudioDataset): + def __init__(self, audio_dir, transcription_dir=None, *args, **kwargs): + data = [] + for path in Path(audio_dir).iterdir(): + entry = {} + if path.suffix in [".mp3", ".wav", ".flac", ".m4a"]: + entry["path"] = str(path) + if transcription_dir and Path(transcription_dir).exists(): + transc_path 
= Path(transcription_dir) / (path.stem + '.txt') + if transc_path.exists(): + with transc_path.open() as f: + entry["text"] = f.read().strip() + if len(entry) > 0: + data.append(entry) + super().__init__(data, *args, **kwargs) diff --git a/automatic-speech-recognition/hw_asr/datasets/librispeech_dataset.py b/automatic-speech-recognition/hw_asr/datasets/librispeech_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..ee0bc051a07e681aa495f33706273592ce703e77 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/datasets/librispeech_dataset.py @@ -0,0 +1,93 @@ +import json +import logging +import os +import shutil +from pathlib import Path + +import torchaudio +from speechbrain.utils.data_utils import download_file +from tqdm import tqdm + +from hw_asr.base.base_dataset import BaseDataset +from hw_asr.utils import ROOT_PATH + +logger = logging.getLogger(__name__) + +URL_LINKS = { + "dev-clean": "https://www.openslr.org/resources/12/dev-clean.tar.gz", + "dev-other": "https://www.openslr.org/resources/12/dev-other.tar.gz", + "test-clean": "https://www.openslr.org/resources/12/test-clean.tar.gz", + "test-other": "https://www.openslr.org/resources/12/test-other.tar.gz", + "train-clean-100": "https://www.openslr.org/resources/12/train-clean-100.tar.gz", + "train-clean-360": "https://www.openslr.org/resources/12/train-clean-360.tar.gz", + "train-other-500": "https://www.openslr.org/resources/12/train-other-500.tar.gz", +} + + +class LibrispeechDataset(BaseDataset): + def __init__(self, part, data_dir=None, *args, **kwargs): + assert part in URL_LINKS or part == 'train_all' + + if data_dir is None: + data_dir = ROOT_PATH / "data" / "datasets" / "librispeech" + data_dir.mkdir(exist_ok=True, parents=True) + self._data_dir = data_dir + if part == 'train_all': + index = sum([self._get_or_load_index(part) + for part in URL_LINKS if 'train' in part], []) + else: + index = self._get_or_load_index(part) + + super().__init__(index, *args, **kwargs) + + def _load_part(self, part): + arch_path = self._data_dir / f"{part}.tar.gz" + print(f"Loading part {part}") + download_file(URL_LINKS[part], arch_path) + shutil.unpack_archive(arch_path, self._data_dir) + for fpath in (self._data_dir / "LibriSpeech").iterdir(): + shutil.move(str(fpath), str(self._data_dir / fpath.name)) + os.remove(str(arch_path)) + shutil.rmtree(str(self._data_dir / "LibriSpeech")) + + def _get_or_load_index(self, part): + index_path = self._data_dir / f"{part}_index.json" + if index_path.exists(): + with index_path.open() as f: + index = json.load(f) + else: + index = self._create_index(part) + with index_path.open("w") as f: + json.dump(index, f, indent=2) + return index + + def _create_index(self, part): + index = [] + split_dir = self._data_dir / part + if not split_dir.exists(): + self._load_part(part) + + flac_dirs = set() + for dirpath, dirnames, filenames in os.walk(str(split_dir)): + if any([f.endswith(".flac") for f in filenames]): + flac_dirs.add(dirpath) + for flac_dir in tqdm( + list(flac_dirs), desc=f"Preparing librispeech folders: {part}" + ): + flac_dir = Path(flac_dir) + trans_path = list(flac_dir.glob("*.trans.txt"))[0] + with trans_path.open() as f: + for line in f: + f_id = line.split()[0] + f_text = " ".join(line.split()[1:]).strip() + flac_path = flac_dir / f"{f_id}.flac" + t_info = torchaudio.info(str(flac_path)) + length = t_info.num_frames / t_info.sample_rate + index.append( + { + "path": str(flac_path.absolute().resolve()), + "text": f_text.lower(), + "audio_len": length, + } + ) + 
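# each index entry stores the absolute flac path, the lowercased transcription and the clip duration in seconds + 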
return index diff --git a/automatic-speech-recognition/hw_asr/datasets/ljspeech_dataset.py b/automatic-speech-recognition/hw_asr/datasets/ljspeech_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..adf9430a17e1e81a5e14cbf63f948a6257813b9f --- /dev/null +++ b/automatic-speech-recognition/hw_asr/datasets/ljspeech_dataset.py @@ -0,0 +1,96 @@ +import json +import logging +import os +import shutil +from curses.ascii import isascii +from pathlib import Path + +import torchaudio +from hw_asr.base.base_dataset import BaseDataset +from hw_asr.utils import ROOT_PATH +from speechbrain.utils.data_utils import download_file +from tqdm import tqdm + +logger = logging.getLogger(__name__) + +URL_LINKS = { + "dataset": "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", +} + + +class LJspeechDataset(BaseDataset): + def __init__(self, part, data_dir=None, *args, **kwargs): + if data_dir is None: + data_dir = ROOT_PATH / "data" / "datasets" / "ljspeech" + data_dir.mkdir(exist_ok=True, parents=True) + self._data_dir = data_dir + index = self._get_or_load_index(part) + + super().__init__(index, *args, **kwargs) + + def _load_dataset(self): + arch_path = self._data_dir / "LJSpeech-1.1.tar.bz2" + print(f"Loading LJSpeech") + download_file(URL_LINKS["dataset"], arch_path) + shutil.unpack_archive(arch_path, self._data_dir) + for fpath in (self._data_dir / "LJSpeech-1.1").iterdir(): + shutil.move(str(fpath), str(self._data_dir / fpath.name)) + os.remove(str(arch_path)) + shutil.rmtree(str(self._data_dir / "LJSpeech-1.1")) + + files = [file_name for file_name in (self._data_dir / "wavs").iterdir()] + train_length = int(0.85 * len(files)) # hand split, test ~ 15% + (self._data_dir / "train").mkdir(exist_ok=True, parents=True) + (self._data_dir / "test").mkdir(exist_ok=True, parents=True) + for i, fpath in enumerate((self._data_dir / "wavs").iterdir()): + if i < train_length: + shutil.move(str(fpath), str(self._data_dir / "train" / fpath.name)) + else: + shutil.move(str(fpath), str(self._data_dir / "test" / fpath.name)) + shutil.rmtree(str(self._data_dir / "wavs")) + + + def _get_or_load_index(self, part): + index_path = self._data_dir / f"{part}_index.json" + if index_path.exists(): + with index_path.open() as f: + index = json.load(f) + else: + index = self._create_index(part) + with index_path.open("w") as f: + json.dump(index, f, indent=2) + return index + + def _create_index(self, part): + index = [] + split_dir = self._data_dir / part + if not split_dir.exists(): + self._load_dataset() + + wav_dirs = set() + for dirpath, dirnames, filenames in os.walk(str(split_dir)): + if any([f.endswith(".wav") for f in filenames]): + wav_dirs.add(dirpath) + for wav_dir in tqdm( + list(wav_dirs), desc=f"Preparing ljspeech folders: {part}" + ): + wav_dir = Path(wav_dir) + trans_path = list(self._data_dir.glob("*.csv"))[0] + with trans_path.open() as f: + for line in f: + w_id = line.split('|')[0] + w_text = " ".join(line.split('|')[1:]).strip() + wav_path = wav_dir / f"{w_id}.wav" + if not wav_path.exists(): # elem in another part + continue + t_info = torchaudio.info(str(wav_path)) + length = t_info.num_frames / t_info.sample_rate + if w_text.isascii(): + index.append( + { + "path": str(wav_path.absolute().resolve()), + "text": w_text.lower(), + "audio_len": length, + } + ) + return index diff --git a/automatic-speech-recognition/hw_asr/logger/__init__.py b/automatic-speech-recognition/hw_asr/logger/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..549cc5b8b1b79ac0e083dde39a2e9bc3d8ae97d4 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/__init__.py @@ -0,0 +1,2 @@ +from .logger import * +from .visualization import * diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb208e512ad7131ebe787418501770e6c245884c Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a05233729be1ab66ad96e9f42763d53d8ecaedda Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/logger.cpython-310.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/logger.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28d4e85d512960a484b7bf6f24d0d07bf6d49b6e Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/logger.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/logger.cpython-311.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/logger.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad9c8fb2ed7c9590174fc8a653385aec2ba38d55 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/logger.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/tensorboard.cpython-310.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/tensorboard.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0973e3b4403caa591bd4db6fb42dcdac2aca91b Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/tensorboard.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/tensorboard.cpython-311.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/tensorboard.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7d76b69359863049ae6dd96dbc872b877ad0122 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/tensorboard.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/utils.cpython-310.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58cdabae72d30b9cf7092f170cfff0bfdde100a9 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/utils.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/utils.cpython-311.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e788e0886a9b20c46dfa58c04f99c66c2f72bd0 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/utils.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/visualization.cpython-310.pyc 
b/automatic-speech-recognition/hw_asr/logger/__pycache__/visualization.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eba8207829c73a7f46182180c9cc2aa2d44b4ffa Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/visualization.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/visualization.cpython-311.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/visualization.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f61661bf61425c290796c127ac31ce9dbec722c Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/visualization.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/wandb.cpython-310.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/wandb.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f0969d440dda7f1e2b494c42db9036a1ec9891c Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/wandb.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/__pycache__/wandb.cpython-311.pyc b/automatic-speech-recognition/hw_asr/logger/__pycache__/wandb.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08d07b23511995a42b446ae4db99e5ed69d88c65 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/logger/__pycache__/wandb.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/logger/logger.py b/automatic-speech-recognition/hw_asr/logger/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..73759e3f6a7b0a1421768adbd5c66b3aaf2374e4 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/logger.py @@ -0,0 +1,29 @@ +import logging +import logging.config +from pathlib import Path + +from hw_asr.utils import read_json, ROOT_PATH + + +def setup_logging( + save_dir, log_config=None, default_level=logging.INFO +): + """ + Setup logging configuration + """ + if log_config is None: + log_config = str(ROOT_PATH / "hw_asr" / "logger" / "logger_config.json") + log_config = Path(log_config) + if log_config.is_file(): + config = read_json(log_config) + # modify logging paths based on run config + for _, handler in config["handlers"].items(): + if "filename" in handler: + handler["filename"] = str(save_dir / handler["filename"]) + + logging.config.dictConfig(config) + else: + print( + "Warning: logging configuration file is not found in {}.".format(log_config) + ) + logging.basicConfig(level=default_level) diff --git a/automatic-speech-recognition/hw_asr/logger/logger_config.json b/automatic-speech-recognition/hw_asr/logger/logger_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2bfebbf7b373025dfe2030d1a7933dbabb2c9a2f --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/logger_config.json @@ -0,0 +1,36 @@ +{ + "version": 1, + "disable_existing_loggers": false, + "formatters": { + "simple": { + "format": "%(message)s" + }, + "datetime": { + "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + } + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "level": "DEBUG", + "formatter": "simple", + "stream": "ext://sys.stdout" + }, + "info_file_handler": { + "class": "logging.handlers.RotatingFileHandler", + "level": "INFO", + "formatter": "datetime", + "filename": "info.log", + "maxBytes": 10485760, + "backupCount": 20, + "encoding": "utf8" 
+ } + }, + "root": { + "level": "INFO", + "handlers": [ + "console", + "info_file_handler" + ] + } +} \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/logger/tensorboard.py b/automatic-speech-recognition/hw_asr/logger/tensorboard.py new file mode 100644 index 0000000000000000000000000000000000000000..d8d52e0648ddfc580a074db1fd1ec600523242a5 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/tensorboard.py @@ -0,0 +1,88 @@ +import importlib +from datetime import datetime + + +class TensorboardWriter: + def __init__(self, log_dir, logger, enabled): + self.writer = None + self.selected_module = "" + + if enabled: + log_dir = str(log_dir) + + # Retrieve vizualization writer. + succeeded = False + for module in ["torch.utils.tensorboard", "tensorboardX"]: + try: + self.writer = importlib.import_module(module).SummaryWriter(log_dir) + succeeded = True + break + except ImportError: + succeeded = False + self.selected_module = module + + if not succeeded: + message = ( + "Warning: visualization (Tensorboard) is configured to use, but currently not " + "installed on this machine. Please install TensorboardX with " + "'pip install tensorboardx', upgrade PyTorch to version >= 1.1 to use " + "'torch.utils.tensorboard' or turn off the option in the 'config.json' file." + ) + logger.warning(message) + + self.step = 0 + self.mode = "" + + self.tb_writer_ftns = { + "add_scalar", + "add_scalars", + "add_image", + "add_images", + "add_audio", + "add_text", + "add_histogram", + "add_pr_curve", + "add_embedding", + } + self.tag_mode_exceptions = {"add_histogram", "add_embedding"} + self.timer = datetime.now() + + def set_step(self, step, mode="train"): + self.mode = mode + self.step = step + if step == 0: + self.timer = datetime.now() + else: + duration = datetime.now() - self.timer + self.add_scalar("steps_per_sec", 1 / duration.total_seconds()) + self.timer = datetime.now() + + def __getattr__(self, name): + """ + If visualization is configured to use: + return add_data() methods of tensorboard with additional information (step, tag) added. + Otherwise: + return a blank function handle that does nothing + """ + if name in self.tb_writer_ftns: + add_data = getattr(self.writer, name, None) + + def wrapper(tag, data, *args, **kwargs): + if add_data is not None: + # add mode(train/valid) tag + if name not in self.tag_mode_exceptions: + tag = "{}/{}".format(tag, self.mode) + add_data(tag, data, self.step, *args, **kwargs) + + return wrapper + else: + # default action for returning methods defined in this class, set_step() for instance. 
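+ # this fallback is reached only when normal attribute lookup has already failed, so it ends in an AttributeError naming the selected writer module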
+ try: + attr = object.__getattr__(name) + except AttributeError: + raise AttributeError( + "type object '{}' has no attribute '{}'".format( + self.selected_module, name + ) + ) + return attr diff --git a/automatic-speech-recognition/hw_asr/logger/utils.py b/automatic-speech-recognition/hw_asr/logger/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bee693ea21562490d20a9c6420e7529329354b1a --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/utils.py @@ -0,0 +1,13 @@ +import io + +import matplotlib.pyplot as plt + + +def plot_spectrogram_to_buf(spectrogram_tensor, name=None): + plt.figure(figsize=(20, 5)) + plt.imshow(spectrogram_tensor) + plt.title(name) + buf = io.BytesIO() + plt.savefig(buf, format='png') + buf.seek(0) + return buf diff --git a/automatic-speech-recognition/hw_asr/logger/visualization.py b/automatic-speech-recognition/hw_asr/logger/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..eab6df31f9f96c95681f6172f2e31a43fb3b410b --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/visualization.py @@ -0,0 +1,19 @@ +from enum import Enum + +from .tensorboard import TensorboardWriter +from .wandb import WanDBWriter + + +class VisualizerBackendType(str, Enum): + tensorboard = "tensorboard" + wandb = "wandb" + + +def get_visualizer(config, logger, backend: VisualizerBackendType): + if backend == VisualizerBackendType.tensorboard: + return TensorboardWriter(config.log_dir, logger, True) + + if backend == VisualizerBackendType.wandb: + return WanDBWriter(config, logger) + + return None diff --git a/automatic-speech-recognition/hw_asr/logger/wandb.py b/automatic-speech-recognition/hw_asr/logger/wandb.py new file mode 100644 index 0000000000000000000000000000000000000000..56894cd6236ddf6973edb84ba0fc3ad94247f318 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/logger/wandb.py @@ -0,0 +1,98 @@ +from datetime import datetime + +import numpy as np +import pandas as pd +import wandb + + +class WanDBWriter: + def __init__(self, config, logger): + self.writer = None + self.selected_module = "" + + try: + import wandb + wandb.login() + + if config['trainer'].get('wandb_project') is None: + raise ValueError("please specify project name for wandb") + + wandb.init( + project=config['trainer'].get('wandb_project'), + config=config.config + ) + self.wandb = wandb + + except ImportError: + logger.warning("For use wandb install it via \n\t pip install wandb") + + self.step = 0 + self.mode = "" + self.timer = datetime.now() + + def set_step(self, step, mode="train"): + self.mode = mode + self.step = step + if step == 0: + self.timer = datetime.now() + else: + duration = datetime.now() - self.timer + self.add_scalar("steps_per_sec", 1 / duration.total_seconds()) + self.timer = datetime.now() + + def _scalar_name(self, scalar_name): + return f"{scalar_name}_{self.mode}" + + def add_scalar(self, scalar_name, scalar): + self.wandb.log({ + self._scalar_name(scalar_name): scalar, + }, step=self.step) + + def add_scalars(self, tag, scalars): + self.wandb.log({ + **{f"{scalar_name}_{tag}_{self.mode}": scalar for scalar_name, scalar in + scalars.items()} + }, step=self.step) + + def add_image(self, scalar_name, image): + self.wandb.log({ + self._scalar_name(scalar_name): self.wandb.Image(image) + }, step=self.step) + + def add_audio(self, scalar_name, audio, sample_rate=None): + audio = audio.detach().cpu().numpy().T + self.wandb.log({ + self._scalar_name(scalar_name): self.wandb.Audio(audio, sample_rate=sample_rate) + 
}, step=self.step) + + def add_text(self, scalar_name, text): + self.wandb.log({ + self._scalar_name(scalar_name): self.wandb.Html(text) + }, step=self.step) + + def add_histogram(self, scalar_name, hist, bins=None): + hist = hist.detach().cpu().numpy() + np_hist = np.histogram(hist, bins=bins) + if np_hist[0].shape[0] > 512: + np_hist = np.histogram(hist, bins=512) + + hist = self.wandb.Histogram( + np_histogram=np_hist + ) + + self.wandb.log({ + self._scalar_name(scalar_name): hist + }, step=self.step) + + def add_table(self, table_name, table: pd.DataFrame): + self.wandb.log({self._scalar_name(table_name): wandb.Table(dataframe=table)}, + step=self.step) + + def add_images(self, scalar_name, images): + raise NotImplementedError() + + def add_pr_curve(self, scalar_name, scalar): + raise NotImplementedError() + + def add_embedding(self, scalar_name, scalar): + raise NotImplementedError() diff --git a/automatic-speech-recognition/hw_asr/loss/CTCLossWrapper.py b/automatic-speech-recognition/hw_asr/loss/CTCLossWrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..a57dd573cb3a72b1835d5868f7cda12f9d075d17 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/loss/CTCLossWrapper.py @@ -0,0 +1,20 @@ +import torch +from torch import Tensor +from torch.nn import CTCLoss + + +class CTCLossWrapper(CTCLoss): + + def __init__(self): + super().__init__(zero_infinity=True) + + def forward(self, log_probs, log_probs_length, text_encoded, text_encoded_length, + **batch) -> Tensor: + log_probs_t = torch.transpose(log_probs, 0, 1) + + return super().forward( + log_probs=log_probs_t, + targets=text_encoded, + input_lengths=log_probs_length, + target_lengths=text_encoded_length, + ) diff --git a/automatic-speech-recognition/hw_asr/loss/__init__.py b/automatic-speech-recognition/hw_asr/loss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8968fea952bb13d7e1b6177f40a71d363307681d --- /dev/null +++ b/automatic-speech-recognition/hw_asr/loss/__init__.py @@ -0,0 +1,5 @@ +from hw_asr.loss.CTCLossWrapper import CTCLossWrapper as CTCLoss + +__all__ = [ + "CTCLoss" +] diff --git a/automatic-speech-recognition/hw_asr/loss/__pycache__/CTCLossWrapper.cpython-310.pyc b/automatic-speech-recognition/hw_asr/loss/__pycache__/CTCLossWrapper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91ccce2b7c95f54fe4aa3b850ebc7d8a69e6af73 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/loss/__pycache__/CTCLossWrapper.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/loss/__pycache__/CTCLossWrapper.cpython-311.pyc b/automatic-speech-recognition/hw_asr/loss/__pycache__/CTCLossWrapper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3376b136391e850fc2b3b867a9baf8338ebf996f Binary files /dev/null and b/automatic-speech-recognition/hw_asr/loss/__pycache__/CTCLossWrapper.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/loss/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/loss/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a5521589d921df6e36c3475f2b6326e9ce0b093 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/loss/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/loss/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/loss/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..826e6f0d2b1d8977f745c179c5ad98ba89b808ab Binary files /dev/null and b/automatic-speech-recognition/hw_asr/loss/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__init__.py b/automatic-speech-recognition/hw_asr/metric/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..80b677c33c5e549c030dedabe582eadd70feac90 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/metric/__init__.py @@ -0,0 +1,11 @@ +from hw_asr.metric.cer_metric import ArgmaxCERMetric, BeamSearchCERMetric, LanguageModelCERMetric +from hw_asr.metric.wer_metric import ArgmaxWERMetric, BeamSearchWERMetric, LanguageModelWERMetric + +__all__ = [ + "ArgmaxCERMetric", + "BeamSearchCERMetric", + "LanguageModelCERMetric", + "ArgmaxWERMetric", + "BeamSearchWERMetric", + "LanguageModelWERMetric" +] diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c7c0bba4321305de6648145cdf953eabe997137 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d81781c2a6501997ee8fdb38f9260d225f0f0a0d Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/cer_metric.cpython-310.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/cer_metric.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b998cbef6ef1fe735cb4fb9f401b0ad3ebf91502 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/cer_metric.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/cer_metric.cpython-311.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/cer_metric.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43cdcc20cb50ada9a2d460dbce7afcda26c06395 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/cer_metric.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/utils.cpython-310.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..663cbcf684425cee893ac1c9ff3cb270c4ef6bd2 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/utils.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/utils.cpython-311.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fb94057757cfdcbba8512a3c68f82fe9616d002 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/utils.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/wer_metric.cpython-310.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/wer_metric.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d2de7f56bdc75ecc25e87e918781e4305cf509ef Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/wer_metric.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/__pycache__/wer_metric.cpython-311.pyc b/automatic-speech-recognition/hw_asr/metric/__pycache__/wer_metric.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f9901c140b702440645ef5ec03b11728e3ede18 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/metric/__pycache__/wer_metric.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/metric/cer_metric.py b/automatic-speech-recognition/hw_asr/metric/cer_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..54559a3dad1daf31d292ac75bf7791d522fbe078 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/metric/cer_metric.py @@ -0,0 +1,68 @@ +from typing import List + +import numpy as np +import torch +from torch import Tensor + +from hw_asr.base.base_metric import BaseMetric +from hw_asr.base.base_text_encoder import BaseTextEncoder +from hw_asr.metric.utils import calc_cer + + +class ArgmaxCERMetric(BaseMetric): + def __init__(self, text_encoder: BaseTextEncoder, *args, **kwargs): + super().__init__(*args, **kwargs) + self.text_encoder = text_encoder + + def __call__(self, log_probs: Tensor, log_probs_length: Tensor, text: List[str], **kwargs): + cers = [] + predictions = torch.argmax(log_probs.cpu(), dim=-1).numpy() + lengths = log_probs_length.detach().numpy() + for log_prob_vec, length, target_text in zip(predictions, lengths, text): + target_text = BaseTextEncoder.normalize_text(target_text) + if hasattr(self.text_encoder, "ctc_decode"): + pred_text = self.text_encoder.ctc_decode(log_prob_vec[:length]) + else: + pred_text = self.text_encoder.decode(log_prob_vec[:length]) + cers.append(calc_cer(target_text, pred_text)) + return sum(cers) / len(cers) + + +class BeamSearchCERMetric(BaseMetric): + def __init__(self, text_encoder: BaseTextEncoder, beam_size: int, *args, **kwargs): + super().__init__(*args, **kwargs) + self.text_encoder = text_encoder + self.beam_size = beam_size + + def __call__(self, log_probs: Tensor, log_probs_length: Tensor, text: List[str], **kwargs): + cers = [] + probs = np.exp(log_probs.detach().cpu().numpy()) + lengths = log_probs_length.detach().numpy() + for prob, length, target_text in zip(probs, lengths, text): + target_text = BaseTextEncoder.normalize_text(target_text) + if hasattr(self.text_encoder, "ctc_beam_search"): + pred_text = self.text_encoder.ctc_beam_search(prob[:length], self.beam_size) + else: + assert False + cers.append(calc_cer(target_text, pred_text)) + return sum(cers) / len(cers) + + +class LanguageModelCERMetric(BaseMetric): + def __init__(self, text_encoder: BaseTextEncoder, *args, **kwargs): + super().__init__(*args, **kwargs) + self.text_encoder = text_encoder + + def __call__(self, logits: Tensor, log_probs_length: Tensor, text: List[str], **kwargs): + cers = [] + logits = logits.detach().cpu().numpy() + lengths = log_probs_length.detach().numpy() + for logit, length, target_text in zip(logits, lengths, text): + target_text = BaseTextEncoder.normalize_text(target_text) + if hasattr(self.text_encoder, "ctc_lm_beam_search"): + pred_text = self.text_encoder.ctc_lm_beam_search(logit[:length]) + else: + assert False + cers.append(calc_cer(target_text, pred_text)) + return sum(cers) / len(cers) + diff --git 
a/automatic-speech-recognition/hw_asr/metric/utils.py b/automatic-speech-recognition/hw_asr/metric/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e38484975cef2424e8a19f1680cee029e945ae61 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/metric/utils.py @@ -0,0 +1,18 @@ +import editdistance +# Don't forget to support cases when target_text == '' + + +def calc_cer(target_text, predicted_text) -> float: + # TODO: your code here + return editdistance.eval(target_text, predicted_text) / len(target_text) + + +def calc_wer(target_text, predicted_text) -> float: + # TODO: your code here + if not target_text: + if predicted_text: + return 1 + return 0 + target_text_splitted = target_text.split(' ') + predicted_text_splitted = predicted_text.split(' ') + return editdistance.eval(target_text_splitted, predicted_text_splitted) / len(target_text_splitted) diff --git a/automatic-speech-recognition/hw_asr/metric/wer_metric.py b/automatic-speech-recognition/hw_asr/metric/wer_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..2bc03ce0b867e9342b835687fc0279f2a4d12aa3 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/metric/wer_metric.py @@ -0,0 +1,67 @@ +from typing import List + +import numpy as np +import torch +from torch import Tensor + +from hw_asr.base.base_metric import BaseMetric +from hw_asr.base.base_text_encoder import BaseTextEncoder +from hw_asr.metric.utils import calc_wer + + +class ArgmaxWERMetric(BaseMetric): + def __init__(self, text_encoder: BaseTextEncoder, *args, **kwargs): + super().__init__(*args, **kwargs) + self.text_encoder = text_encoder + + def __call__(self, log_probs: Tensor, log_probs_length: Tensor, text: List[str], **kwargs): + wers = [] + predictions = torch.argmax(log_probs.cpu(), dim=-1).numpy() + lengths = log_probs_length.detach().numpy() + for log_prob_vec, length, target_text in zip(predictions, lengths, text): + target_text = BaseTextEncoder.normalize_text(target_text) + if hasattr(self.text_encoder, "ctc_decode"): + pred_text = self.text_encoder.ctc_decode(log_prob_vec[:length]) + else: + pred_text = self.text_encoder.decode(log_prob_vec[:length]) + wers.append(calc_wer(target_text, pred_text)) + return sum(wers) / len(wers) + + +class BeamSearchWERMetric(BaseMetric): + def __init__(self, text_encoder: BaseTextEncoder, beam_size: int, *args, **kwargs): + super().__init__(*args, **kwargs) + self.text_encoder = text_encoder + self.beam_size = beam_size + + def __call__(self, log_probs: Tensor, log_probs_length: Tensor, text: List[str], **kwargs): + wers = [] + probs = np.exp(log_probs.detach().cpu().numpy()) + lengths = log_probs_length.detach().numpy() + for prob, length, target_text in zip(probs, lengths, text): + target_text = BaseTextEncoder.normalize_text(target_text) + if hasattr(self.text_encoder, "ctc_beam_search"): + pred_text = self.text_encoder.ctc_beam_search(prob[:length], self.beam_size) + else: + assert False + wers.append(calc_wer(target_text, pred_text)) + return sum(wers) / len(wers) + + +class LanguageModelWERMetric(BaseMetric): + def __init__(self, text_encoder: BaseTextEncoder, *args, **kwargs): + super().__init__(*args, **kwargs) + self.text_encoder = text_encoder + + def __call__(self, logits: Tensor, log_probs_length: Tensor, text: List[str], **kwargs): + wers = [] + logits = logits.detach().cpu().numpy() + lengths = log_probs_length.detach().numpy() + for logit, length, target_text in zip(logits, lengths, text): + target_text = BaseTextEncoder.normalize_text(target_text) + 
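+            # the LM-rescored decoding below assumes an encoder exposing ctc_lm_beam_search,
+            # e.g. a CTCCharTextEncoder constructed with a KenLM model path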
if hasattr(self.text_encoder, "ctc_lm_beam_search"): + pred_text = self.text_encoder.ctc_lm_beam_search(logit[:length]) + else: + assert False + wers.append(calc_wer(target_text, pred_text)) + return sum(wers) / len(wers) diff --git a/automatic-speech-recognition/hw_asr/model/__init__.py b/automatic-speech-recognition/hw_asr/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c4f0944e4bad49606bed457888ea8cc5001756e9 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/model/__init__.py @@ -0,0 +1,7 @@ +from hw_asr.model.baseline_model import BaselineModel +from hw_asr.model.deepspeech2_model import DeepSpeech2Model + +__all__ = [ + "BaselineModel", + "DeepSpeech2Model", +] diff --git a/automatic-speech-recognition/hw_asr/model/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/model/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b7e11462c91ca31d516b2d0713cbe93f7169df6 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/model/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/model/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/model/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32df7c0ed01c13feffa4418603a6e6332e44f112 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/model/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/model/__pycache__/baseline_model.cpython-310.pyc b/automatic-speech-recognition/hw_asr/model/__pycache__/baseline_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1b39436e727354077f6d7fe6be3513d1df086b5 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/model/__pycache__/baseline_model.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/model/__pycache__/baseline_model.cpython-311.pyc b/automatic-speech-recognition/hw_asr/model/__pycache__/baseline_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fb1054c2322a4b0d0f517525cb90ec0b252b824 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/model/__pycache__/baseline_model.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/model/__pycache__/deepspeech2_model.cpython-310.pyc b/automatic-speech-recognition/hw_asr/model/__pycache__/deepspeech2_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3938bc9b0d8cef8a17cb375aa0eaa4c1d8def54 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/model/__pycache__/deepspeech2_model.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/model/__pycache__/deepspeech2_model.cpython-311.pyc b/automatic-speech-recognition/hw_asr/model/__pycache__/deepspeech2_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5d84b7ca5b16309977560be17d96d06e70ede51 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/model/__pycache__/deepspeech2_model.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/model/baseline_model.py b/automatic-speech-recognition/hw_asr/model/baseline_model.py new file mode 100644 index 0000000000000000000000000000000000000000..1d29513ecaadb7aaefddbf89a1852fba3bbe2bdb --- /dev/null +++ b/automatic-speech-recognition/hw_asr/model/baseline_model.py @@ -0,0 +1,24 @@ +from torch 
import nn +from torch.nn import Sequential + +from hw_asr.base import BaseModel + + +class BaselineModel(BaseModel): + def __init__(self, n_feats, n_class, fc_hidden=512, **batch): + super().__init__(n_feats, n_class, **batch) + self.net = Sequential( + # people say it can aproximate any function... + nn.Linear(in_features=n_feats, out_features=fc_hidden), + nn.ReLU(), + nn.Linear(in_features=fc_hidden, out_features=fc_hidden), + nn.ReLU(), + nn.Linear(in_features=fc_hidden, out_features=n_class) + ) + + def forward(self, spectrogram, **batch): + print(self.net(spectrogram.transpose(1, 2)).shape) + return {"logits": self.net(spectrogram.transpose(1, 2))} + + def transform_input_lengths(self, input_lengths): + return input_lengths # we don't reduce time dimension here diff --git a/automatic-speech-recognition/hw_asr/model/deepspeech2_model.py b/automatic-speech-recognition/hw_asr/model/deepspeech2_model.py new file mode 100644 index 0000000000000000000000000000000000000000..605681fc8223df1e9c0dda5982174a02a3877efb --- /dev/null +++ b/automatic-speech-recognition/hw_asr/model/deepspeech2_model.py @@ -0,0 +1,84 @@ +import torch +from torch import nn + +from hw_asr.base import BaseModel + + +class RNNwBatchNorm(nn.Module): + def __init__(self, input_size, hidden_size, rnn_dropout): + super().__init__() + self.rnn = nn.GRU(input_size, hidden_size, dropout=rnn_dropout, batch_first=False, bidirectional=True) + self.norm = nn.BatchNorm1d(hidden_size) + + def forward(self, x, h=None): + # N x T x input_size + x, h = self.rnn(x, h) + # T x N x (2 * hidden_size) + x = x.view(x.shape[0], x.shape[1], 2, -1).sum(2) + # T x N x hidden_size + t_dim, n_dim = x.shape[0], x.shape[1] + x = x.view((t_dim * n_dim, -1)) + x = self.norm(x) + x = x.view((t_dim, n_dim, -1)).contiguous() + return x, h + + +# https://proceedings.mlr.press/v48/amodei16.pdf +class DeepSpeech2Model(BaseModel): + def __init__(self, n_feats, n_rnn_layers, rnn_hidden_size, rnn_dropout, n_class): + assert n_rnn_layers >= 1 + super().__init__(n_feats, n_class) + + self.conv = nn.Sequential( + nn.Conv2d(in_channels=1, out_channels=32, padding=(20, 5), kernel_size=(41, 11), stride=(2, 2)), + nn.BatchNorm2d(32), + nn.ReLU(), + + nn.Conv2d(in_channels=32, out_channels=32, padding=(10, 5), kernel_size=(21, 11), stride=(2, 2)), + nn.BatchNorm2d(32), + nn.ReLU(), + + nn.Conv2d(in_channels=32, out_channels=96, padding=(10, 5), kernel_size=(21, 11), stride=(2, 1)), + nn.BatchNorm2d(96), + nn.ReLU(), + ) + + rnn_input_size = (n_feats + 2 * 20 - 41) // 2 + 1 + rnn_input_size = (rnn_input_size + 2 * 10 - 21) // 2 + 1 + rnn_input_size = (rnn_input_size + 2 * 10 - 21) // 2 + 1 + rnn_input_size *= 96 + self.rnns = nn.Sequential( + RNNwBatchNorm(rnn_input_size, rnn_hidden_size, rnn_dropout), + *(RNNwBatchNorm(rnn_hidden_size, rnn_hidden_size, rnn_dropout) for _ in range(n_rnn_layers - 1)) + ) + + self.fc = nn.Linear(in_features=rnn_hidden_size, out_features=n_class) + self.softmax = nn.Softmax(dim=2) + + def forward(self, spectrogram, spectrogram_length, **batch): + # N x big_F x big_T + x = self.conv(spectrogram.unsqueeze(1)) + # N x C x F x T + x = x.view(x.shape[0], x.shape[1] * x.shape[2], x.shape[3]) + # N x (C * F) x T + x = x.transpose(1, 2).transpose(0, 1).contiguous() + # T x N x (C * F) + h = None + for rnn in self.rnns: + x, h = rnn(x, h) + # T x N x rnn_hidden_size + t_dim, n_dim = x.shape[0], x.shape[1] + x = x.view((t_dim * n_dim, -1)) + x = self.fc(x) + x = x.view((t_dim, n_dim, -1)).transpose(0, 1) + # N x T x n_class + return {"logits": x} 
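+    # transform_input_lengths below mirrors the time-axis arithmetic of the three convolutions
+    # above (kernel 11, padding 5, strides 2, 2, 1) and returns the same max-based downsampled
+    # length for every item in the batch. The analogous frequency-axis arithmetic produces
+    # rnn_input_size: e.g. with n_feats = 256 the frequency dimension shrinks 256 -> 128 -> 64 -> 32,
+    # so rnn_input_size = 32 * 96 = 3072.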
+ + def transform_input_lengths(self, input_lengths): + t_dim = input_lengths.max() + + t_dim = (t_dim + 2 * 5 - 11) // 2 + 1 + t_dim = (t_dim + 2 * 5 - 11) // 2 + 1 + t_dim = (t_dim + 2 * 5 - 11) + 1 + + return torch.zeros_like(input_lengths).fill_(t_dim) \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/tests/__init__.py b/automatic-speech-recognition/hw_asr/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/automatic-speech-recognition/hw_asr/tests/config.json b/automatic-speech-recognition/hw_asr/tests/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8491ebc8ea3448f5a59b5cd17d76df23eefb3ecf --- /dev/null +++ b/automatic-speech-recognition/hw_asr/tests/config.json @@ -0,0 +1,100 @@ +{ + "name": "test_config", + "n_gpu": 1, + "preprocessing": { + "sr": 16000, + "spectrogram": { + "type": "MelSpectrogram", + "args": { + } + }, + "log_spec": true + }, + "augmentations": { + "wave": [], + "spectrogram": [] + }, + "arch": { + "type": "BaselineModel", + "args": { + "n_feats": 128, + "fc_hidden": 512 + } + }, + "data": { + "train": { + "batch_size": 20, + "num_workers": 0, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean", + "max_audio_length": 20.0, + "max_text_length": 200 + } + } + ] + }, + "val": { + "batch_size": 20, + "num_workers": 0, + "datasets": [ + { + "type": "LibrispeechDataset", + "args": { + "part": "dev-clean", + "max_audio_length": 20.0, + "max_text_length": 200 + } + } + ] + } + }, + "optimizer": { + "type": "SGD", + "args": { + "lr": 3e-4 + } + }, + "loss": { + "type": "CTCLoss", + "args": {} + }, + "metrics": [ + { + "type": "ArgmaxWERMetric", + "args": { + "name": "WER (argmax)" + } + }, + { + "type": "ArgmaxCERMetric", + "args": { + "name": "CER (argmax)" + } + } + ], + "lr_scheduler": { + "type": "OneCycleLR", + "args": { + "steps_per_epoch": 100, + "epochs": 50, + "anneal_strategy": "cos", + "max_lr": 4e-3, + "pct_start": 0.2 + } + }, + "trainer": { + "epochs": 50, + "save_dir": "saved/", + "save_period": 5, + "verbosity": 2, + "monitor": "min val_loss", + "early_stop": 100, + "visualize": "wandb", + "wandb_project": "asr_project", + "len_epoch": 100, + "grad_norm_clip": 10 + } +} diff --git a/automatic-speech-recognition/hw_asr/tests/test_config.py b/automatic-speech-recognition/hw_asr/tests/test_config.py new file mode 100644 index 0000000000000000000000000000000000000000..502cac4036c5dd9568c44cfab4a5e0f89dc9510b --- /dev/null +++ b/automatic-speech-recognition/hw_asr/tests/test_config.py @@ -0,0 +1,12 @@ +import json +import unittest + +from hw_asr.tests.utils import clear_log_folder_after_use +from hw_asr.utils.parse_config import ConfigParser + + +class TestConfig(unittest.TestCase): + def test_create(self): + config_parser = ConfigParser.get_test_configs() + with clear_log_folder_after_use(config_parser): + json.dumps(config_parser.config, indent=2) diff --git a/automatic-speech-recognition/hw_asr/tests/test_dataloader.py b/automatic-speech-recognition/hw_asr/tests/test_dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..b71f5d18e3011389d2704d810226afd22b135543 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/tests/test_dataloader.py @@ -0,0 +1,55 @@ +import unittest + +from tqdm import tqdm + +from hw_asr.collate_fn.collate import collate_fn +from hw_asr.datasets import LibrispeechDataset +from hw_asr.tests.utils import 
clear_log_folder_after_use +from hw_asr.utils.object_loading import get_dataloaders +from hw_asr.utils.parse_config import ConfigParser + + +class TestDataloader(unittest.TestCase): + def test_collate_fn(self): + config_parser = ConfigParser.get_test_configs() + with clear_log_folder_after_use(config_parser): + ds = LibrispeechDataset( + "dev-clean", text_encoder=config_parser.get_text_encoder(), + config_parser=config_parser + ) + + batch_size = 3 + batch = collate_fn([ds[i] for i in range(batch_size)]) + + self.assertIn("spectrogram", batch) # torch.tensor + batch_size_dim, feature_length_dim, time_dim = batch["spectrogram"].shape + self.assertEqual(batch_size_dim, batch_size) + self.assertEqual(feature_length_dim, 128) + + self.assertIn("text_encoded", batch) # [int] torch.tensor + # joined and padded indexes representation of transcriptions + batch_size_dim, text_length_dim = batch["text_encoded"].shape + self.assertEqual(batch_size_dim, batch_size) + + self.assertIn("text_encoded_length", batch) # [int] torch.tensor + # contains lengths of each text entry + self.assertEqual(len(batch["text_encoded_length"].shape), 1) + batch_size_dim = batch["text_encoded_length"].shape[0] + self.assertEqual(batch_size_dim, batch_size) + + self.assertIn("text", batch) # List[str] + # simple list of initial normalized texts + batch_size_dim = len(batch["text"]) + self.assertEqual(batch_size_dim, batch_size) + + def test_dataloaders(self): + _TOTAL_ITERATIONS = 10 + config_parser = ConfigParser.get_test_configs() + with clear_log_folder_after_use(config_parser): + dataloaders = get_dataloaders(config_parser, config_parser.get_text_encoder()) + for part in ["train", "val"]: + dl = dataloaders[part] + for i, batch in tqdm(enumerate(iter(dl)), total=_TOTAL_ITERATIONS, + desc=f"Iterating over {part}"): + if i >= _TOTAL_ITERATIONS: + break diff --git a/automatic-speech-recognition/hw_asr/tests/test_datasets.py b/automatic-speech-recognition/hw_asr/tests/test_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..c1069274054723c840db33c407391e420828ec95 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/tests/test_datasets.py @@ -0,0 +1,100 @@ +import unittest + +import torch + +from hw_asr.datasets import LibrispeechDataset, CustomDirAudioDataset, CustomAudioDataset +from hw_asr.tests.utils import clear_log_folder_after_use +from hw_asr.text_encoder.ctc_char_text_encoder import CTCCharTextEncoder +from hw_asr.utils import ROOT_PATH +from hw_asr.utils.parse_config import ConfigParser + + +class TestDataset(unittest.TestCase): + def test_librispeech(self): + config_parser = ConfigParser.get_test_configs() + with clear_log_folder_after_use(config_parser): + ds = LibrispeechDataset( + "dev-clean", + text_encoder=config_parser.get_text_encoder(), + config_parser=config_parser, + max_text_length=140, + max_audio_length=13, + limit=10, + ) + self._assert_training_example_is_good(ds[0]) + + def test_custom_dir_dataset(self): + config_parser = ConfigParser.get_test_configs() + with clear_log_folder_after_use(config_parser): + audio_dir = str(ROOT_PATH / "test_data" / "audio") + transc_dir = str(ROOT_PATH / "test_data" / "transcriptions") + + ds = CustomDirAudioDataset( + audio_dir, + transc_dir, + text_encoder=config_parser.get_text_encoder(), + config_parser=config_parser, + limit=10, + max_audio_length=8, + max_text_length=130, + ) + self._assert_training_example_is_good(ds[0]) + + def test_custom_dataset(self): + config_parser = ConfigParser.get_test_configs() + with 
clear_log_folder_after_use(config_parser): + audio_path = ROOT_PATH / "test_data" / "audio" + transc_path = ROOT_PATH / "test_data" / "transcriptions" + with (transc_path / "84-121550-0000.txt").open() as f: + transcription = f.read().strip() + data = [ + { + "path": str(audio_path / "84-121550-0001.flac"), + }, + { + "path": str(audio_path / "84-121550-0000.flac"), + "text": transcription + } + ] + + ds = CustomAudioDataset( + data=data, + text_encoder=config_parser.get_text_encoder(), + config_parser=config_parser, + ) + self._assert_training_example_is_good(ds[0], contains_text=False) + self._assert_training_example_is_good(ds[1]) + + def _assert_training_example_is_good(self, training_example: dict, contains_text=True): + + for field, expected_type in [ + ("audio", torch.Tensor), + ("spectrogram", torch.Tensor), + ("duration", float), + ("audio_path", str), + ("text", str), + ("text_encoded", torch.Tensor) + ]: + self.assertIn(field, training_example, f"Error during checking field {field}") + self.assertIsInstance(training_example[field], expected_type, + f"Error during checking field {field}") + + # check waveform dimensions + batch_dim, audio_dim, = training_example["audio"].size() + self.assertEqual(batch_dim, 1) + self.assertGreater(audio_dim, 1) + + # check spectrogram dimensions + batch_dim, freq_dim, time_dim = training_example["spectrogram"].size() + self.assertEqual(batch_dim, 1) + self.assertEqual(freq_dim, 128) + self.assertGreater(time_dim, 1) + + # check text tensor dimensions + batch_dim, length_dim, = training_example["text_encoded"].size() + self.assertEqual(batch_dim, 1) + if contains_text: + self.assertGreater(length_dim, 1) + else: + self.assertEqual(length_dim, 0) + self.assertEqual(training_example["text"], "") diff --git a/automatic-speech-recognition/hw_asr/tests/test_text_encoder.py b/automatic-speech-recognition/hw_asr/tests/test_text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..fae65114d3163f60ba3564de2f964354ea31ad11 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/tests/test_text_encoder.py @@ -0,0 +1,22 @@ +import unittest + +from hw_asr.text_encoder.ctc_char_text_encoder import CTCCharTextEncoder + + +class TestTextEncoder(unittest.TestCase): + def test_ctc_decode(self): + text_encoder = CTCCharTextEncoder() + text = "i^^ ^w^i^sss^hhh^ i ^^^s^t^aaaar^teee^d " \ + "dddddd^oooo^in^g tttttttth^iiiis h^^^^^^^^w^ e^a^r^li^er" + true_text = "i wish i started doing this hw earlier" + inds = [text_encoder.char2ind[c] for c in text] + decoded_text = text_encoder.ctc_decode(inds) + self.assertIn(decoded_text, true_text) + + # def test_beam_search(self): + # # TODO: (optional) write tests for beam search + # text_encoder = CTCCharTextEncoder() + + # len(text_encoder.ind2char) + # probs + # text_encoder.ctc_beam_search() diff --git a/automatic-speech-recognition/hw_asr/tests/utils.py b/automatic-speech-recognition/hw_asr/tests/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..181b03ed8a1b999a84ababbb44573e165ebb8a1b --- /dev/null +++ b/automatic-speech-recognition/hw_asr/tests/utils.py @@ -0,0 +1,22 @@ +import platform +import shutil +from contextlib import contextmanager +from time import sleep + +from hw_asr.utils.parse_config import ConfigParser + + +@contextmanager +def clear_log_folder_after_use(config_parser: ConfigParser): + # this context manager deletes the log folders weather the body was executed succesfully or not + try: + yield config_parser + finally: + if platform.system() == 
"Windows": + # Running unittest on windows results in a delete lock on the log directories just skip + # this cleanup for windows and wait 1s to have a different experiment name. + # (if you know how to fix it, you are welcome to make pull request) + sleep(1) + else: + shutil.rmtree(config_parser.save_dir) + shutil.rmtree(config_parser.log_dir) diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__init__.py b/automatic-speech-recognition/hw_asr/text_encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4ffac14abbfc1939926914d6244e6e689a953cf3 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/text_encoder/__init__.py @@ -0,0 +1,7 @@ +from .char_text_encoder import CharTextEncoder +from .ctc_char_text_encoder import CTCCharTextEncoder + +__all__ = [ + "CharTextEncoder", + "CTCCharTextEncoder" +] diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5be326471283cf40d0cd9caa188fda5b7dcd8372 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..155dfc0394e51d8e8d82300274cd9c4179eb339d Binary files /dev/null and b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/char_text_encoder.cpython-310.pyc b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/char_text_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9bf0ce7e443a862db9acf39f6b37c0a8228b83ac Binary files /dev/null and b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/char_text_encoder.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/char_text_encoder.cpython-311.pyc b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/char_text_encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..daafd6df7d0a7d0bcad85358531cbc6480665abc Binary files /dev/null and b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/char_text_encoder.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/ctc_char_text_encoder.cpython-310.pyc b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/ctc_char_text_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0bf3d80535bff4e5639500722938f72ef3a26842 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/ctc_char_text_encoder.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/ctc_char_text_encoder.cpython-311.pyc b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/ctc_char_text_encoder.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63ec6327d8ea6fe2bdd3ddfd407a4dd95858fd7d Binary files /dev/null and b/automatic-speech-recognition/hw_asr/text_encoder/__pycache__/ctc_char_text_encoder.cpython-311.pyc differ diff --git 
a/automatic-speech-recognition/hw_asr/text_encoder/char_text_encoder.py b/automatic-speech-recognition/hw_asr/text_encoder/char_text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..b1f8ef0826cf2ce0e6c34fce0d1cabb88d0acbb6 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/text_encoder/char_text_encoder.py @@ -0,0 +1,51 @@ +import json +from pathlib import Path +from string import ascii_lowercase +from typing import List, Union + +import numpy as np +from torch import Tensor + +from hw_asr.base.base_text_encoder import BaseTextEncoder + + +class CharTextEncoder(BaseTextEncoder): + + def __init__(self, alphabet: List[str] = None): + if alphabet is None: + alphabet = list(ascii_lowercase + ' ') + self.alphabet = alphabet + self.ind2char = {k: v for k, v in enumerate(sorted(alphabet))} + self.char2ind = {v: k for k, v in self.ind2char.items()} + + def __len__(self): + return len(self.ind2char) + + def __getitem__(self, item: int): + assert type(item) is int + return self.ind2char[item] + + def encode(self, text) -> Tensor: + text = self.normalize_text(text) + try: + return Tensor([self.char2ind[char] for char in text]).unsqueeze(0) + except KeyError as e: + unknown_chars = set([char for char in text if char not in self.char2ind]) + raise Exception( + f"Can't encode text '{text}'. Unknown chars: '{' '.join(unknown_chars)}'") + + def decode(self, vector: Union[Tensor, np.ndarray, List[int]]): + return ''.join([self.ind2char[int(ind)] for ind in vector]).strip() + + def dump(self, file): + with Path(file).open('w') as f: + json.dump(self.ind2char, f) + + @classmethod + def from_file(cls, file): + with Path(file).open() as f: + ind2char = json.load(f) + a = cls([]) + a.ind2char = ind2char + a.char2ind = {v: k for k, v in ind2char} + return a diff --git a/automatic-speech-recognition/hw_asr/text_encoder/ctc_char_text_encoder.py b/automatic-speech-recognition/hw_asr/text_encoder/ctc_char_text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..90e23b5fcd5b425e8219e7a6d016289d65ce86f0 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/text_encoder/ctc_char_text_encoder.py @@ -0,0 +1,86 @@ +from typing import List, NamedTuple + +import torch +from pyctcdecode import build_ctcdecoder + + +from hw_asr.base.base_text_encoder import BaseTextEncoder +from .char_text_encoder import CharTextEncoder +from collections import defaultdict + + +class Hypothesis(NamedTuple): + text: str + prob: float + + +class CTCCharTextEncoder(CharTextEncoder): + EMPTY_TOK = "^" + + def __init__(self, alphabet: List[str] = None, kenlm_model_path: str = None, unigrams_path: str = None): + super().__init__(alphabet) + vocab = [self.EMPTY_TOK] + list(self.alphabet) + self.ind2char = dict(enumerate(vocab)) + self.char2ind = {v: k for k, v in self.ind2char.items()} + if kenlm_model_path is not None: + with open(unigrams_path) as f: + unigrams = [line.strip() for line in f.readlines()] + self.decoder = build_ctcdecoder(labels=[""] + self.alphabet, kenlm_model_path=kenlm_model_path, unigrams=unigrams) + + def ctc_decode(self, inds: List[int]) -> str: + # TODO: your code here + result = [] + last_char = self.EMPTY_TOK + for ind in inds: + cur_char = self.ind2char[ind] + if cur_char != self.EMPTY_TOK and last_char != cur_char: + result.append(cur_char) + last_char = cur_char + return ''.join(result) + + def ctc_beam_search(self, probs: torch.tensor, beam_size: int) -> str: + """ + Performs beam search and returns a list of pairs (hypothesis, hypothesis probability). 
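+        In this implementation only the text of the most probable hypothesis is returned.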
+ """ + assert len(probs.shape) == 2 + char_length, voc_size = probs.shape + assert voc_size == len(self.ind2char) + hypos: List[Hypothesis] = [] + # TODO: your code here + + def extend_and_merge(frame, state): + new_state = defaultdict(float) + for next_char_index, next_char_proba in enumerate(frame): + for (pref, last_char), pref_proba in state.items(): + next_char = self.ind2char[next_char_index] + if next_char == last_char: + new_pref = pref + else: + if next_char != self.EMPTY_TOK: + new_pref = pref + next_char + else: + new_pref = pref + last_char = next_char + new_state[(new_pref, last_char)] += pref_proba * next_char_proba + return new_state + + def truncate(state, beam_size): + state_list = list(state.items()) + state_list.sort(key=lambda x: -x[1]) + return dict(state_list[:beam_size]) + + state = {('', self.EMPTY_TOK): 1.0} + for frame in probs: + state = extend_and_merge(frame, state) + state = truncate(state, beam_size) + state_list = list(state.items()) + state_list.sort(key=lambda x: -x[1]) + + # for state in state_list: + # hypos.append(Hypothesis(state[0][0], state[1])) + + return state_list[0][0][0] + + def ctc_lm_beam_search(self, logits: torch.tensor) -> str: + assert self.decoder is not None + return self.decoder.decode(logits, beam_width=500).lower() \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/text_encoder/fix_vocab.py b/automatic-speech-recognition/hw_asr/text_encoder/fix_vocab.py new file mode 100644 index 0000000000000000000000000000000000000000..265381035a09cd61a20a9980805f04f7628aa5d7 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/text_encoder/fix_vocab.py @@ -0,0 +1,9 @@ +fin = open("hw_asr/text_encoder/librispeech-vocab.txt", "r") +fout = open("hw_asr/text_encoder/librispeech-fixed-vocab.txt", "w+") + +while line := fin.readline(): + line = line.lower().replace("'", "") + print(line, end="", file=fout) + +fin.close() +fout.close() diff --git a/automatic-speech-recognition/hw_asr/text_encoder/lower_model.py b/automatic-speech-recognition/hw_asr/text_encoder/lower_model.py new file mode 100644 index 0000000000000000000000000000000000000000..f90332627a79cdbcabae5515fae11a6f1790a724 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/text_encoder/lower_model.py @@ -0,0 +1,7 @@ +model_path = "3-gram.arpa" +lower_model_path = "lower_3-gram.arpa" + +with open(model_path, 'r') as f1: + with open(lower_model_path, "w") as f2: + for line in f1: + f2.write(line.lower()) \ No newline at end of file diff --git a/automatic-speech-recognition/hw_asr/trainer/__init__.py b/automatic-speech-recognition/hw_asr/trainer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5c0a8a4a91724515aee0aecd8217cfe16ee5ec80 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/trainer/__init__.py @@ -0,0 +1 @@ +from .trainer import * diff --git a/automatic-speech-recognition/hw_asr/trainer/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/trainer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7ab25f648aa074db07b92d5c059d85f0e049a26 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/trainer/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/trainer/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/trainer/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f634f50ab8078b682cbe615cf6d8852962779d8 Binary 
files /dev/null and b/automatic-speech-recognition/hw_asr/trainer/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/trainer/__pycache__/trainer.cpython-310.pyc b/automatic-speech-recognition/hw_asr/trainer/__pycache__/trainer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86f0ec4817d9ef593eebf8ba1cdd959c47bbb4bc Binary files /dev/null and b/automatic-speech-recognition/hw_asr/trainer/__pycache__/trainer.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/trainer/__pycache__/trainer.cpython-311.pyc b/automatic-speech-recognition/hw_asr/trainer/__pycache__/trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d535aec2f2056fee1505924742f4becb89364dd Binary files /dev/null and b/automatic-speech-recognition/hw_asr/trainer/__pycache__/trainer.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/trainer/trainer.py b/automatic-speech-recognition/hw_asr/trainer/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..f4b6b786bdd04ebf095764a3a5dfcdbd22f68fe8 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/trainer/trainer.py @@ -0,0 +1,274 @@ +import random +from pathlib import Path +from random import shuffle + +import PIL +import pandas as pd +import numpy as np +import torch +import torch.nn.functional as F +from torch.nn.utils import clip_grad_norm_ +from torchvision.transforms import ToTensor +from tqdm import tqdm + +from hw_asr.base import BaseTrainer +from hw_asr.base.base_text_encoder import BaseTextEncoder +from hw_asr.logger.utils import plot_spectrogram_to_buf +from hw_asr.metric.utils import calc_cer, calc_wer +from hw_asr.utils import inf_loop, MetricTracker + + +class Trainer(BaseTrainer): + """ + Trainer class + """ + + def __init__( + self, + model, + criterion, + metrics, + optimizer, + config, + device, + dataloaders, + text_encoder, + lr_scheduler=None, + len_epoch=None, + skip_oom=True, + ): + super().__init__(model, criterion, metrics, optimizer, config, device) + self.skip_oom = skip_oom + self.text_encoder = text_encoder + self.config = config + self.train_dataloader = dataloaders["train"] + if len_epoch is None: + # epoch-based training + self.len_epoch = len(self.train_dataloader) + else: + # iteration-based training + self.train_dataloader = inf_loop(self.train_dataloader) + self.len_epoch = len_epoch + self.evaluation_dataloaders = {k: v for k, v in dataloaders.items() if k != "train"} + self.lr_scheduler = lr_scheduler + self.log_step = 50 + + self.train_metrics = MetricTracker("loss", "grad norm", *[m.name for m in self.metrics], writer=self.writer) + self.evaluation_metrics = MetricTracker("loss", *[m.name for m in self.metrics], writer=self.writer) + + @staticmethod + def move_batch_to_device(batch, device: torch.device): + """ + Move all necessary tensors to the HPU + """ + for tensor_for_gpu in ["spectrogram", "text_encoded"]: + batch[tensor_for_gpu] = batch[tensor_for_gpu].to(device) + return batch + + def _clip_grad_norm(self): + if self.config["trainer"].get("grad_norm_clip", None) is not None: + clip_grad_norm_(self.model.parameters(), self.config["trainer"]["grad_norm_clip"]) + + def _train_epoch(self, epoch): + """ + Training logic for an epoch + + :param epoch: Integer, current training epoch. + :return: A log that contains average loss and metric in this epoch. 
+ """ + self.model.train() + self.train_metrics.reset() + self.writer.add_scalar("epoch", epoch) + for batch_idx, batch in enumerate(tqdm(self.train_dataloader, desc="train", total=self.len_epoch - 1)): + try: + batch = self.process_batch( + batch, + is_train=True, + metrics=self.train_metrics, + ) + except RuntimeError as e: + if "out of memory" in str(e) and self.skip_oom: + self.logger.warning("OOM on batch. Skipping batch.") + for p in self.model.parameters(): + if p.grad is not None: + del p.grad # free some memory + torch.cuda.empty_cache() + continue + else: + raise e + self.train_metrics.update("grad norm", self.get_grad_norm()) + if batch_idx % self.log_step == 0: + self.writer.set_step((epoch - 1) * self.len_epoch + batch_idx) + self.logger.debug("Train Epoch: {} {} Loss: {:.6f}".format(epoch, self._progress(batch_idx), batch["loss"].item())) + self.writer.add_scalar("learning rate", self.lr_scheduler.get_last_lr()[0]) + self._log_predictions(**batch) + self._log_spectrogram(batch["spectrogram"]) + self._log_scalars(self.train_metrics) + # we don't want to reset train metrics at the start of every epoch + # because we are interested in recent train metrics + last_train_metrics = self.train_metrics.result() + self.train_metrics.reset() + if batch_idx + 1 >= self.len_epoch: + break + log = last_train_metrics + + for part, dataloader in self.evaluation_dataloaders.items(): + val_log = self._evaluation_epoch(epoch, part, dataloader) + log.update(**{f"{part}_{name}": value for name, value in val_log.items()}) + + return log + + def process_batch(self, batch, is_train: bool, metrics: MetricTracker, part: str = None, epoch: int = None): + batch = self.move_batch_to_device(batch, self.device) + if is_train: + self.optimizer.zero_grad() + outputs = self.model(**batch) + if type(outputs) is dict: + batch.update(outputs) + else: + batch["logits"] = outputs + + batch["log_probs"] = F.log_softmax(batch["logits"], dim=-1) + batch["log_probs_length"] = self.model.transform_input_lengths(batch["spectrogram_length"]) + batch["loss"] = self.criterion(**batch) + if is_train: + batch["loss"].backward() + self._clip_grad_norm() + self.optimizer.step() + if self.lr_scheduler is not None: + self.lr_scheduler.step() + + metrics.update("loss", batch["loss"].item()) + for met in self.metrics: + is_not_test = is_train or ("val" in part) + is_test = not is_not_test + hard_to_calc_metric = "beam search" in met.name or "LM" in met.name + if hard_to_calc_metric and (is_not_test or (is_test and (epoch % 25) != 0)): + continue + metrics.update(met.name, met(**batch)) + return batch + + def _evaluation_epoch(self, epoch, part, dataloader): + """ + Validate after training an epoch + + :param epoch: Integer, current training epoch. 
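+        :param part: String, name of the evaluation partition (a key of the dataloaders dict, e.g. "val").
+        :param dataloader: DataLoader with the data of that partition.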
+ :return: A log that contains information about validation + """ + self.model.eval() + self.evaluation_metrics.reset() + with torch.no_grad(): + for batch_idx, batch in tqdm( + enumerate(dataloader), + desc=part, + total=len(dataloader), + ): + batch = self.process_batch(batch, is_train=False, metrics=self.evaluation_metrics, part=part, epoch=epoch) + self.writer.set_step(epoch * self.len_epoch, part) + self._log_predictions(**batch) + self._log_spectrogram(batch["spectrogram"]) + self._log_scalars(self.evaluation_metrics) + + # add histogram of model parameters to the tensorboard + # for name, p in self.model.named_parameters(): + # self.writer.add_histogram(name, p, bins="auto") + return self.evaluation_metrics.result() + + def _progress(self, batch_idx): + base = "[{}/{} ({:.0f}%)]" + if hasattr(self.train_dataloader, "n_samples"): + current = batch_idx * self.train_dataloader.batch_size + total = self.train_dataloader.n_samples + else: + current = batch_idx + total = self.len_epoch + return base.format(current, total, 100.0 * current / total) + + def _log_predictions( + self, + text, + logits, + log_probs, + log_probs_length, + audio_path, + audio, + examples_to_log=10, + *args, + **kwargs, + ): + # TODO: implement logging of beam search results + if self.writer is None: + return + + ids = np.random.choice(len(text), examples_to_log, replace=False) + text = [text[i] for i in ids] + logits = logits[ids] + log_probs = log_probs[ids] + log_probs_length = log_probs_length[ids] + audio_path = [audio_path[i] for i in ids] + audio = [audio[i] for i in ids] + + argmax_inds = log_probs.cpu().argmax(-1).numpy() + argmax_inds = [inds[: int(ind_len)] for inds, ind_len in zip(argmax_inds, log_probs_length.numpy())] + argmax_texts_raw = [self.text_encoder.decode(inds) for inds in argmax_inds] + argmax_texts = [self.text_encoder.ctc_decode(inds) for inds in argmax_inds] + + probs = np.exp(log_probs.detach().cpu().numpy()) + probs_length = log_probs_length.detach().cpu().numpy() + bs_preds = [self.text_encoder.ctc_beam_search(prob[:prob_length], 4) for prob, prob_length in zip(probs, probs_length)] + + logits = logits.detach().cpu().numpy() + lm_preds = [self.text_encoder.ctc_lm_beam_search(logit[:length]) for logit, length in zip(logits, probs_length)] + + tuples = list(zip(argmax_texts, bs_preds, lm_preds, text, argmax_texts_raw, audio_path, audio)) + rows = {} + for pred, bs_pred, lm_pred, target, raw_pred, audio_path, audio in tuples: + target = BaseTextEncoder.normalize_text(target) + wer = calc_wer(target, pred) * 100 + cer = calc_cer(target, pred) * 100 + + bs_wer = calc_wer(target, bs_pred) * 100 + bs_cer = calc_cer(target, bs_pred) * 100 + + lm_wer = calc_wer(target, lm_pred) * 100 + lm_cer = calc_cer(target, lm_pred) * 100 + + rows[Path(audio_path).name] = { + "orig_audio": self.writer.wandb.Audio(audio_path), # inaccurate, but no changes in the template + "augm_audio": self.writer.wandb.Audio(audio.squeeze().numpy(), sample_rate=16000), # inaccurate, but no changes in the template + "target": target, + "raw pred": raw_pred, + "pred": pred, + "bs pred": bs_pred, + "lm pred": lm_pred, + "wer": wer, + "cer": cer, + "bs wer": bs_wer, + "bs cer": bs_cer, + "lm wer": lm_wer, + "lm cer": lm_cer, + } + self.writer.add_table("predictions", pd.DataFrame.from_dict(rows, orient="index")) + + def _log_spectrogram(self, spectrogram_batch): + spectrogram = random.choice(spectrogram_batch.cpu()) + image = PIL.Image.open(plot_spectrogram_to_buf(spectrogram)) + self.writer.add_image("spectrogram", 
ToTensor()(image)) + + @torch.no_grad() + def get_grad_norm(self, norm_type=2): + parameters = self.model.parameters() + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = [p for p in parameters if p.grad is not None] + total_norm = torch.norm( + torch.stack([torch.norm(p.grad.detach(), norm_type).cpu() for p in parameters]), + norm_type, + ) + return total_norm.item() + + def _log_scalars(self, metric_tracker: MetricTracker): + if self.writer is None: + return + for metric_name in metric_tracker.keys(): + self.writer.add_scalar(f"{metric_name}", metric_tracker.avg(metric_name)) diff --git a/automatic-speech-recognition/hw_asr/utils/__init__.py b/automatic-speech-recognition/hw_asr/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..46d3a156a78c6ef994a0ba7e92a334a4a6b16b8b --- /dev/null +++ b/automatic-speech-recognition/hw_asr/utils/__init__.py @@ -0,0 +1 @@ +from .util import * diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/__init__.cpython-310.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d8ebcc423b144fd994dd785352edec675601bc7 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/__init__.cpython-311.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6be4d757ad564f9dcdf665ccef09849a1be1e8b Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/object_loading.cpython-310.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/object_loading.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb66d7f2a8fea3a2a8d2c541cce73dc539acf7fe Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/object_loading.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/object_loading.cpython-311.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/object_loading.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..014cb4af46bfbe4ce1441a78f09fb5ad45c7767c Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/object_loading.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/parse_config.cpython-310.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/parse_config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..282688690bd6b55c539316af10a6b23b918a3c64 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/parse_config.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/parse_config.cpython-311.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/parse_config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9c32fb1cd1d1717df562665566888553a6e1867 Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/parse_config.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/util.cpython-310.pyc 
b/automatic-speech-recognition/hw_asr/utils/__pycache__/util.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aeb0f4198abbe5473f3282ee9095be432fc4baae Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/util.cpython-310.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/__pycache__/util.cpython-311.pyc b/automatic-speech-recognition/hw_asr/utils/__pycache__/util.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8566bd2b51c23eac3f07533d45fec0a4396e8b6d Binary files /dev/null and b/automatic-speech-recognition/hw_asr/utils/__pycache__/util.cpython-311.pyc differ diff --git a/automatic-speech-recognition/hw_asr/utils/object_loading.py b/automatic-speech-recognition/hw_asr/utils/object_loading.py new file mode 100644 index 0000000000000000000000000000000000000000..0107c56f17cdd288a6139e1d6a1ae6bfa28cbca3 --- /dev/null +++ b/automatic-speech-recognition/hw_asr/utils/object_loading.py @@ -0,0 +1,63 @@ +from operator import xor + +from torch.utils.data import ConcatDataset, DataLoader + +import hw_asr.augmentations +import hw_asr.datasets +from hw_asr import batch_sampler as batch_sampler_module +from hw_asr.base.base_text_encoder import BaseTextEncoder +from hw_asr.collate_fn.collate import collate_fn +from hw_asr.utils.parse_config import ConfigParser + + +def get_dataloaders(configs: ConfigParser, text_encoder: BaseTextEncoder): + dataloaders = {} + for split, params in configs["data"].items(): + num_workers = params.get("num_workers", 1) + + # set train augmentations + if split == 'train': + wave_augs, spec_augs = hw_asr.augmentations.from_configs(configs) + drop_last = True + else: + wave_augs, spec_augs = None, None + drop_last = False + + # create and join datasets + datasets = [] + for ds in params["datasets"]: + datasets.append(configs.init_obj( + ds, hw_asr.datasets, text_encoder=text_encoder, config_parser=configs, + wave_augs=wave_augs, spec_augs=spec_augs)) + assert len(datasets) + if len(datasets) > 1: + dataset = ConcatDataset(datasets) + else: + dataset = datasets[0] + + # select batch size or batch sampler + assert xor("batch_size" in params, "batch_sampler" in params), \ + "You must provide batch_size or batch_sampler for each split" + if "batch_size" in params: + bs = params["batch_size"] + shuffle = True + batch_sampler = None + elif "batch_sampler" in params: + batch_sampler = configs.init_obj(params["batch_sampler"], batch_sampler_module, + data_source=dataset) + bs, shuffle = 1, False + else: + raise Exception() + + # Fun fact. 
An hour of debugging was wasted to write this line + assert bs <= len(dataset), \ + f"Batch size ({bs}) shouldn't be larger than dataset length ({len(dataset)})" + + # create dataloader + dataloader = DataLoader( + dataset, batch_size=bs, collate_fn=collate_fn, + shuffle=shuffle, num_workers=num_workers, + batch_sampler=batch_sampler, drop_last=drop_last + ) + dataloaders[split] = dataloader + return dataloaders diff --git a/automatic-speech-recognition/hw_asr/utils/parse_config.py b/automatic-speech-recognition/hw_asr/utils/parse_config.py new file mode 100644 index 0000000000000000000000000000000000000000..51e8f2d4f2137d79a96e395b6db601887414d31b --- /dev/null +++ b/automatic-speech-recognition/hw_asr/utils/parse_config.py @@ -0,0 +1,203 @@ +import importlib +import json +import logging +import os +from datetime import datetime +from functools import reduce, partial +from operator import getitem +from pathlib import Path + +from hw_asr import text_encoder as text_encoder_module +from hw_asr.base.base_text_encoder import BaseTextEncoder +from hw_asr.logger import setup_logging +from hw_asr.text_encoder import CTCCharTextEncoder +from hw_asr.utils import read_json, write_json, ROOT_PATH + + +class ConfigParser: + def __init__(self, config, resume=None, modification=None, run_id=None): + """ + class to parse configuration json file. Handles hyperparameters for training, + initializations of modules, checkpoint saving and logging module. + :param config: Dict containing configurations, hyperparameters for training. + contents of `config.json` file for example. + :param resume: String, path to the checkpoint being loaded. + :param modification: Dict {keychain: value}, specifying position values to be replaced + from config dict. + :param run_id: Unique Identifier for training processes. + Used to save checkpoints and training log. Timestamp is being used as default + """ + # load config file and apply modification + self._config = _update_config(config, modification) + self.resume = resume + self._text_encoder = None + + # set save_dir where trained model and log will be saved. + save_dir = Path(self.config["trainer"]["save_dir"]) + + exper_name = self.config["name"] + if run_id is None: # use timestamp as default run-id + run_id = datetime.now().strftime(r"%m%d_%H%M%S") + self._save_dir = str(save_dir / "models" / exper_name / run_id) + self._log_dir = str(save_dir / "log" / exper_name / run_id) + + # make directory for saving checkpoints and log. + exist_ok = run_id == "" + self.save_dir.mkdir(parents=True, exist_ok=exist_ok) + self.log_dir.mkdir(parents=True, exist_ok=exist_ok) + + # save updated config file to the checkpoint dir + write_json(self.config, self.save_dir / "config.json") + + # configure logging module + setup_logging(self.log_dir) + self.log_levels = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG} + + @classmethod + def from_args(cls, args, options=""): + """ + Initialize this class from some cli arguments. Used in train, test. + """ + for opt in options: + args.add_argument(*opt.flags, default=None, type=opt.type) + if not isinstance(args, tuple): + args = args.parse_args() + + if args.device is not None: + os.environ["CUDA_VISIBLE_DEVICES"] = args.device + if args.resume is not None: + resume = Path(args.resume) + cfg_fname = resume.parent / "config.json" + else: + msg_no_cfg = "Configuration file need to be specified. " \ + "Add '-c config.json', for example." 
+ assert args.config is not None, msg_no_cfg + resume = None + cfg_fname = Path(args.config) + + config = read_json(cfg_fname) + if args.config and resume: + # update new config for fine-tuning + config.update(read_json(args.config)) + + # parse custom cli options into dictionary + modification = { + opt.target: getattr(args, _get_opt_name(opt.flags)) for opt in options + } + return cls(config, resume, modification) + + @staticmethod + def init_obj(obj_dict, default_module, *args, **kwargs): + """ + Finds a function handle with the name given as 'type' in config, and returns the + instance initialized with corresponding arguments given. + + `object = config.init_obj(config['param'], module, a, b=1)` + is equivalent to + `object = module.name(a, b=1)` + """ + if "module" in obj_dict: + default_module = importlib.import_module(obj_dict["module"]) + + module_name = obj_dict["type"] + module_args = dict(obj_dict["args"]) + assert all( + [k not in module_args for k in kwargs] + ), "Overwriting kwargs given in config file is not allowed" + module_args.update(kwargs) + return getattr(default_module, module_name)(*args, **module_args) + + def init_ftn(self, name, module, *args, **kwargs): + """ + Finds a function handle with the name given as 'type' in config, and returns the + function with given arguments fixed with functools.partial. + + `function = config.init_ftn('name', module, a, b=1)` + is equivalent to + `function = lambda *args, **kwargs: module.name(a, *args, b=1, **kwargs)`. + """ + module_name = self[name]["type"] + module_args = dict(self[name]["args"]) + assert all( + [k not in module_args for k in kwargs] + ), "Overwriting kwargs given in config file is not allowed" + module_args.update(kwargs) + return partial(getattr(module, module_name), *args, **module_args) + + def __getitem__(self, name): + """Access items like ordinary dict.""" + return self.config[name] + + def get_logger(self, name, verbosity=2): + msg_verbosity = "verbosity option {} is invalid. 
Valid options are {}.".format( + verbosity, self.log_levels.keys() + ) + assert verbosity in self.log_levels, msg_verbosity + logger = logging.getLogger(name) + logger.setLevel(self.log_levels[verbosity]) + return logger + + def get_text_encoder(self) -> BaseTextEncoder: + if self._text_encoder is None: + if "text_encoder" not in self._config: + self._text_encoder = CTCCharTextEncoder() + elif self._config["text_encoder"] == "CTCCharTextEncoder": + self._text_encoder = CTCCharTextEncoder(self._config["text_encoder"]["args"]) + else: + self._text_encoder = self.init_obj(self["text_encoder"], + default_module=text_encoder_module) + return self._text_encoder + + # setting read-only attributes + @property + def config(self): + return self._config + + @property + def save_dir(self): + return Path(self._save_dir) + + @property + def log_dir(self): + return Path(self._log_dir) + + @classmethod + def get_default_configs(cls): + config_path = ROOT_PATH / "hw_asr" / "config.json" + with config_path.open() as f: + return cls(json.load(f)) + + @classmethod + def get_test_configs(cls): + config_path = ROOT_PATH / "hw_asr" / "tests" / "config.json" + with config_path.open() as f: + return cls(json.load(f)) + + +# helper functions to update config dict with custom cli options +def _update_config(config, modification): + if modification is None: + return config + + for k, v in modification.items(): + if v is not None: + _set_by_path(config, k, v) + return config + + +def _get_opt_name(flags): + for flg in flags: + if flg.startswith("--"): + return flg.replace("--", "") + return flags[0].replace("--", "") + + +def _set_by_path(tree, keys, value): + """Set a value in a nested object in tree by sequence of keys.""" + keys = keys.split(";") + _get_by_path(tree, keys[:-1])[keys[-1]] = value + + +def _get_by_path(tree, keys): + """Access a nested object in tree by sequence of keys.""" + return reduce(getitem, keys, tree) diff --git a/automatic-speech-recognition/hw_asr/utils/util.py b/automatic-speech-recognition/hw_asr/utils/util.py new file mode 100644 index 0000000000000000000000000000000000000000..9f50d31451d689fef13dacbd33892ad02794851c --- /dev/null +++ b/automatic-speech-recognition/hw_asr/utils/util.py @@ -0,0 +1,82 @@ +import json +from collections import OrderedDict +from itertools import repeat +from pathlib import Path + +import pandas as pd +import torch + +ROOT_PATH = Path(__file__).absolute().resolve().parent.parent.parent + + +def ensure_dir(dirname): + dirname = Path(dirname) + if not dirname.is_dir(): + dirname.mkdir(parents=True, exist_ok=False) + + +def read_json(fname): + fname = Path(fname) + with fname.open("rt") as handle: + return json.load(handle, object_hook=OrderedDict) + + +def write_json(content, fname): + fname = Path(fname) + with fname.open("wt") as handle: + json.dump(content, handle, indent=4, sort_keys=False) + + +def inf_loop(data_loader): + """wrapper function for endless data loader.""" + for loader in repeat(data_loader): + yield from loader + + +def prepare_device(n_gpu_use): + """ + setup GPU device if available. get gpu device indices which are used for DataParallel + """ + n_gpu = torch.cuda.device_count() + if n_gpu_use > 0 and n_gpu == 0: + print( + "Warning: There's no GPU available on this machine," + "training will be performed on CPU." + ) + n_gpu_use = 0 + if n_gpu_use > n_gpu: + print( + f"Warning: The number of GPU's configured to use is {n_gpu_use}, but only {n_gpu} are " + "available on this machine." 
+ ) + n_gpu_use = n_gpu + device = torch.device("cuda:0" if n_gpu_use > 0 else "cpu") + list_ids = list(range(n_gpu_use)) + return device, list_ids + + +class MetricTracker: + def __init__(self, *keys, writer=None): + self.writer = writer + self._data = pd.DataFrame(index=keys, columns=["total", "counts", "average"]) + self.reset() + + def reset(self): + for col in self._data.columns: + self._data[col].values[:] = 0 + + def update(self, key, value, n=1): + # if self.writer is not None: + # self.writer.add_scalar(key, value) + self._data.total[key] += value * n + self._data.counts[key] += n + self._data.average[key] = self._data.total[key] / self._data.counts[key] + + def avg(self, key): + return self._data.average[key] + + def result(self): + return dict(self._data.average) + + def keys(self): + return self._data.total.keys() diff --git a/automatic-speech-recognition/requirements.txt b/automatic-speech-recognition/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad39fc449461551a1d1cdea08775d71c2266d9ea --- /dev/null +++ b/automatic-speech-recognition/requirements.txt @@ -0,0 +1,17 @@ +torch==2.1.0 +torchvision==0.16.0 +numpy +tqdm +tensorboard +matplotlib +pandas + +speechbrain==0.5.15 +datasets +torch_audiomentations +editdistance +wandb +pyctcdecode +torchaudio==2.1.0 +pillow +kenlm \ No newline at end of file diff --git a/automatic-speech-recognition/test.py b/automatic-speech-recognition/test.py new file mode 100644 index 0000000000000000000000000000000000000000..550eaec2dfee8d30c299707326746d61c7195f61 --- /dev/null +++ b/automatic-speech-recognition/test.py @@ -0,0 +1,202 @@ +import argparse +import json +import os +from pathlib import Path + +import torch +from tqdm import tqdm + +import hw_asr.model as module_model +from hw_asr.trainer import Trainer +from hw_asr.utils import ROOT_PATH +from hw_asr.utils.object_loading import get_dataloaders +from hw_asr.utils.parse_config import ConfigParser +from hw_asr.metric.utils import calc_wer + +DEFAULT_CHECKPOINT_PATH = ROOT_PATH / "default_test_model" / "checkpoint.pth" + + +def main(config, out_file): + logger = config.get_logger("test") + + # define cpu or gpu if possible + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # text_encoder + text_encoder = config.get_text_encoder() + + # setup data_loader instances + dataloaders = get_dataloaders(config, text_encoder) + + # build model architecture + model = config.init_obj(config["arch"], module_model, n_class=len(text_encoder)) + logger.info(model) + + logger.info("Loading checkpoint: {} ...".format(config.resume)) + checkpoint = torch.load(config.resume, map_location=device) + state_dict = checkpoint["state_dict"] + if config["n_gpu"] > 1: + model = torch.nn.DataParallel(model) + model.load_state_dict(state_dict) + + # prepare model for testing + model = model.to(device) + model.eval() + + results = [] + + argmax_wer_sum = 0 + beam_search_wer_sum = 0 + lm_wer_sum = 0 + + with torch.no_grad(): + for batch_num, batch in enumerate(tqdm(dataloaders["test"])): + batch = Trainer.move_batch_to_device(batch, device) + output = model(**batch) + if type(output) is dict: + batch.update(output) + else: + batch["logits"] = output + batch["log_probs"] = torch.log_softmax(batch["logits"], dim=-1) + batch["log_probs_length"] = model.transform_input_lengths(batch["spectrogram_length"]) + batch["probs"] = batch["log_probs"].exp().cpu() + batch["argmax"] = batch["probs"].argmax(-1) + for i in range(len(batch["text"])): + length = 
int(batch["log_probs_length"][i]) + ground_truth = batch["text"][i] + + argmax = batch["argmax"][i][:length].cpu().numpy() + text_argmax = text_encoder.ctc_decode(argmax) + + probs = batch["probs"][i][:length].detach().cpu().numpy() + text_beam_search = text_encoder.ctc_beam_search(probs, beam_size=4) + + logits = batch["logits"][i][:length].detach().cpu().numpy() + text_lm = text_encoder.ctc_lm_beam_search(logits) + + argmax_wer = calc_wer(ground_truth, text_argmax) * 100 + beam_search_wer = calc_wer(ground_truth, text_beam_search) * 100 + lm_wer = calc_wer(ground_truth, text_lm) * 100 + + argmax_wer_sum += argmax_wer + beam_search_wer_sum += beam_search_wer + lm_wer_sum += lm_wer + + results.append( + { + "ground_truth": ground_truth, + "pred_text_argmax": text_argmax, + "pred_text_beam_search": text_beam_search, + "pred_text_lm": text_lm, + "argmax_wer": argmax_wer, + "beam_search_wer": beam_search_wer, + "lm_wer": lm_wer, + } + ) + + n = len(results) + logger.info("argmax_wer_mean:") + logger.info(argmax_wer_sum / n) + logger.info("beam_search_wer_mean:") + logger.info(beam_search_wer_sum / n) + logger.info("lm_wer_mean:") + logger.info(lm_wer_sum / n) + + with Path(out_file).open("w") as f: + json.dump(results, f, indent=2) + + +if __name__ == "__main__": + args = argparse.ArgumentParser(description="PyTorch Template") + args.add_argument( + "-c", + "--config", + default=None, + type=str, + help="config file path (default: None)", + ) + args.add_argument( + "-r", + "--resume", + default=str(DEFAULT_CHECKPOINT_PATH.absolute().resolve()), + type=str, + help="path to latest checkpoint (default: None)", + ) + args.add_argument( + "-d", + "--device", + default=None, + type=str, + help="indices of GPUs to enable (default: all)", + ) + args.add_argument( + "-o", + "--output", + default="output.json", + type=str, + help="File to write results (.json)", + ) + args.add_argument( + "-t", + "--test-data-folder", + default=None, + type=str, + help="Path to dataset", + ) + args.add_argument( + "-b", + "--batch-size", + default=20, + type=int, + help="Test dataset batch size", + ) + args.add_argument( + "-j", + "--jobs", + default=1, + type=int, + help="Number of workers for test dataloader", + ) + + args = args.parse_args() + + # set GPUs + if args.device is not None: + os.environ["CUDA_VISIBLE_DEVICES"] = args.device + + # first, we need to obtain config with model parameters + # we assume it is located with checkpoint in the same folder + model_config = Path(args.resume).parent / "config.json" + with model_config.open() as f: + config = ConfigParser(json.load(f), resume=args.resume) + + # update with addition configs from `args.config` if provided + if args.config is not None: + with Path(args.config).open() as f: + config.config.update(json.load(f)) + + # if `--test-data-folder` was provided, set it as a default test set + if args.test_data_folder is not None: + test_data_folder = Path(args.test_data_folder).absolute().resolve() + assert test_data_folder.exists() + config.config["data"] = { + "test": { + "batch_size": args.batch_size, + "num_workers": args.jobs, + "datasets": [ + { + "type": "CustomDirAudioDataset", + "args": { + "audio_dir": str(test_data_folder / "audio"), + "transcription_dir": str(test_data_folder / "transcriptions"), + }, + } + ], + } + } + + assert config.config.get("data", {}).get("test", None) is not None + config["data"]["test"]["batch_size"] = args.batch_size + config["data"]["test"]["n_jobs"] = args.jobs + + main(config, args.output) diff --git 
a/automatic-speech-recognition/train.py b/automatic-speech-recognition/train.py new file mode 100644 index 0000000000000000000000000000000000000000..b46211f809d7c8cc146b8a1a62ea488ed5a7ee29 --- /dev/null +++ b/automatic-speech-recognition/train.py @@ -0,0 +1,107 @@ +import argparse +import collections +import warnings + +import numpy as np +import torch + +import hw_asr.loss as module_loss +import hw_asr.metric as module_metric +import hw_asr.model as module_arch +from hw_asr.trainer import Trainer +from hw_asr.utils import prepare_device +from hw_asr.utils.object_loading import get_dataloaders +from hw_asr.utils.parse_config import ConfigParser + +warnings.filterwarnings("ignore", category=UserWarning) + +# fix random seeds for reproducibility +SEED = 123 +torch.manual_seed(SEED) +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False +np.random.seed(SEED) + + +def main(config): + logger = config.get_logger("train") + + # text_encoder + text_encoder = config.get_text_encoder() + + # setup data_loader instances + dataloaders = get_dataloaders(config, text_encoder) + + # build model architecture, then print to console + model = config.init_obj(config["arch"], module_arch, n_class=len(text_encoder)) + logger.info(model) + + # prepare for (multi-device) GPU training + device, device_ids = prepare_device(config["n_gpu"]) + model = model.to(device) + if len(device_ids) > 1: + model = torch.nn.DataParallel(model, device_ids=device_ids) + + # get function handles of loss and metrics + loss_module = config.init_obj(config["loss"], module_loss).to(device) + metrics = [ + config.init_obj(metric_dict, module_metric, text_encoder=text_encoder) + for metric_dict in config["metrics"] + ] + + # build optimizer, learning rate scheduler. delete every line containing lr_scheduler for + # disabling scheduler + trainable_params = filter(lambda p: p.requires_grad, model.parameters()) + optimizer = config.init_obj(config["optimizer"], torch.optim, trainable_params) + lr_scheduler = config.init_obj(config["lr_scheduler"], torch.optim.lr_scheduler, optimizer) + + trainer = Trainer( + model, + loss_module, + metrics, + optimizer, + text_encoder=text_encoder, + config=config, + device=device, + dataloaders=dataloaders, + lr_scheduler=lr_scheduler, + len_epoch=config["trainer"].get("len_epoch", None) + ) + + trainer.train() + + +if __name__ == "__main__": + args = argparse.ArgumentParser(description="PyTorch Template") + args.add_argument( + "-c", + "--config", + default=None, + type=str, + help="config file path (default: None)", + ) + args.add_argument( + "-r", + "--resume", + default=None, + type=str, + help="path to latest checkpoint (default: None)", + ) + args.add_argument( + "-d", + "--device", + default=None, + type=str, + help="indices of GPUs to enable (default: all)", + ) + + # custom cli options to modify configuration from default values given in json file. + CustomArgs = collections.namedtuple("CustomArgs", "flags type target") + options = [ + CustomArgs(["--lr", "--learning_rate"], type=float, target="optimizer;args;lr"), + CustomArgs( + ["--bs", "--batch_size"], type=int, target="data_loader;args;batch_size" + ), + ] + config = ConfigParser.from_args(args, options) + main(config)
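
The config-driven wiring above (parse_config.py's `init_obj` plus train.py's `CustomArgs` targets such as `"optimizer;args;lr"`) comes down to two small mechanisms: looking a class up by its `"type"` name inside a module and building it with the JSON `"args"`, and patching a nested config key through a `";"`-separated path. The following is a minimal, self-contained sketch of both, under assumptions: `init_obj` and `set_by_path` are simplified stand-ins for the repo's `ConfigParser.init_obj` and `_set_by_path`, the config fragment mirrors the optimizer block of the test config, and `torch.nn.Linear` is only a placeholder for the real model.

import torch

# Config fragment in the same shape as hw_asr/configs/*.json.
config = {
    "optimizer": {"type": "AdamW", "args": {"lr": 3e-4, "weight_decay": 1e-5}},
}

def init_obj(obj_dict, default_module, *args, **kwargs):
    # Resolve the class named by "type" inside `default_module` and build it
    # from the JSON "args" merged with call-site kwargs (simplified version
    # of ConfigParser.init_obj above).
    module_args = dict(obj_dict["args"])
    module_args.update(kwargs)
    return getattr(default_module, obj_dict["type"])(*args, **module_args)

def set_by_path(tree, keys, value):
    # "optimizer;args;lr" -> tree["optimizer"]["args"]["lr"] = value,
    # the same ";"-separated convention the CustomArgs targets rely on.
    parts = keys.split(";")
    node = tree
    for k in parts[:-1]:
        node = node[k]
    node[parts[-1]] = value

# Passing `--lr 1e-4` on the command line would translate to:
set_by_path(config, "optimizer;args;lr", 1e-4)

model = torch.nn.Linear(10, 10)  # placeholder model, not the real DeepSpeech2
optimizer = init_obj(config["optimizer"], torch.optim, model.parameters())
print(type(optimizer).__name__, optimizer.defaults["lr"])  # AdamW 0.0001

The same lookup-by-name pattern is what lets the JSON configs swap architectures, losses, metrics, datasets, and schedulers without touching train.py or test.py: only the `"type"` string and its `"args"` change.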