tgritsaev commited on
Commit
affcd23
1 Parent(s): 574d505

Upload 198 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. automatic-speech-recognition/Dockerfile +15 -0
  2. automatic-speech-recognition/LICENSE +21 -0
  3. automatic-speech-recognition/README.md +21 -0
  4. automatic-speech-recognition/checkpoint.pth +3 -0
  5. automatic-speech-recognition/default_test_config.json +188 -0
  6. automatic-speech-recognition/default_test_model/config.json +242 -0
  7. automatic-speech-recognition/hw_asr/__init__.py +0 -0
  8. automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-310.pyc +0 -0
  9. automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-311.pyc +0 -0
  10. automatic-speech-recognition/hw_asr/augmentations/__init__.py +36 -0
  11. automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-310.pyc +0 -0
  12. automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-311.pyc +0 -0
  13. automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-310.pyc +0 -0
  14. automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-311.pyc +0 -0
  15. automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-310.pyc +0 -0
  16. automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-311.pyc +0 -0
  17. automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-310.pyc +0 -0
  18. automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-311.pyc +0 -0
  19. automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-310.pyc +0 -0
  20. automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-311.pyc +0 -0
  21. automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential_random_apply.cpython-311.pyc +0 -0
  22. automatic-speech-recognition/hw_asr/augmentations/base.py +6 -0
  23. automatic-speech-recognition/hw_asr/augmentations/random_apply.py +16 -0
  24. automatic-speech-recognition/hw_asr/augmentations/random_choice.py +17 -0
  25. automatic-speech-recognition/hw_asr/augmentations/sequential.py +16 -0
  26. automatic-speech-recognition/hw_asr/augmentations/sequential_random_apply.py +17 -0
  27. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/FrequencyMasking.py +11 -0
  28. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/TimeMasking.py +11 -0
  29. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__init__.py +7 -0
  30. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/FrequencyMasking.cpython-311.pyc +0 -0
  31. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/TimeMasking.cpython-311.pyc +0 -0
  32. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-310.pyc +0 -0
  33. automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-311.pyc +0 -0
  34. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/AddColoredNoise.py +13 -0
  35. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Gain.py +13 -0
  36. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/HighPassFilter.py +13 -0
  37. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/LowPassFilter.py +13 -0
  38. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Padding.py +13 -0
  39. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PitchShift.py +13 -0
  40. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PolarityInversion.py +13 -0
  41. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Shift.py +13 -0
  42. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__init__.py +19 -0
  43. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-310.pyc +0 -0
  44. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-311.pyc +0 -0
  45. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-310.pyc +0 -0
  46. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-311.pyc +0 -0
  47. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-310.pyc +0 -0
  48. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-311.pyc +0 -0
  49. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-310.pyc +0 -0
  50. automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-311.pyc +0 -0
automatic-speech-recognition/Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CUDA-enabled PyTorch base image matching the pinned torchaudio version below.
FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-devel
WORKDIR /repos/asr_project_template

# Install requirements for torchaudio
# (sox is the audio backend; librosa is used for feature extraction)
RUN pip install sox && conda install torchaudio==0.11.0 -c pytorch && conda install -c conda-forge librosa

# Install requirements
# requirements.txt is copied before the rest of the repo so this layer is
# cached and re-run only when the dependency list itself changes.
COPY requirements.txt ./
RUN pip install -r requirements.txt

# Copy the contents of repository
COPY . .

# Expose port
EXPOSE 3000
automatic-speech-recognition/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Daniil Ivanov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
automatic-speech-recognition/README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ASR project barebones
2
+
3
+ ## Installation guide
4
+
5
+ 1. `pip install -r ./requirements.txt`
6
+ 2. Download from http://www.openslr.org/11/ `3-gram.arpa.gz` and `librispeech-vocab.txt`
7
+ 3. `python hw_asr/text_encoder/fix_vocab.py` and `python hw_asr/text_encoder/lower_model.py` to prepare the vocab and model for use
8
+ 4. If you want to test my model, download it from https://drive.google.com/file/d/1QrSsx56V5YNjGHUBWy6CIRVbNbjKWUpJ/view?usp=share_link , name it `checkpoint.pth` and place it in the directory `default_test_model/`
9
+
10
+ ## Train
11
+
12
+ 1. `python train.py --config hw_asr/configs/config2.json`
13
+
14
+ ## Test
15
+
16
+ 1. `python test.py -c default_test_config.json -r default_test_model/checkpoint.pth`
17
+
18
+ ## Wandb report
19
+
20
+ 1. You can check my wandb report (in Russian only) and the wandb project at https://wandb.ai/tgritsaev/asr_project/reports/DLA-HW-1--Vmlldzo1NzY3NjA5?accessToken=kotkj5oyzomf2d2g1f40mczdnpirwvuw1f538zx9k491g1cfh3wg9iwhsb65o054
21
+
automatic-speech-recognition/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f1bebf1c95bb69c3130757b652c9fa4a975302b142a0b3d779d36ba404905ac
3
+ size 333205079
automatic-speech-recognition/default_test_config.json ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "default_test_config",
3
+ "n_gpu": 1,
4
+ "text_encoder": {
5
+ "type": "CTCCharTextEncoder",
6
+ "args": {
7
+ "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa",
8
+ "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt"
9
+ }
10
+ },
11
+ "preprocessing": {
12
+ "sr": 16000,
13
+ "spectrogram": {
14
+ "type": "MelSpectrogram",
15
+ "args": {
16
+ "n_mels": 256
17
+ }
18
+ },
19
+ "log_spec": true
20
+ },
21
+ "augmentations": {
22
+ "random_apply_p": 0.6,
23
+ "wave": [
24
+ {
25
+ "type": "AddColoredNoise",
26
+ "args": {
27
+ "p": 1,
28
+ "sample_rate": 16000
29
+ }
30
+ },
31
+ {
32
+ "type": "Gain",
33
+ "args": {
34
+ "p": 0.8,
35
+ "sample_rate": 16000
36
+ }
37
+ },
38
+ {
39
+ "type": "HighPassFilter",
40
+ "args": {
41
+ "p": 0,
42
+ "sample_rate": 16000
43
+ }
44
+ },
45
+ {
46
+ "type": "LowPassFilter",
47
+ "args": {
48
+ "p": 0,
49
+ "sample_rate": 16000
50
+ }
51
+ },
52
+ {
53
+ "type": "PitchShift",
54
+ "args": {
55
+ "p": 0.8,
56
+ "min_transpose_semitones": -2,
57
+ "max_transpose_semitones": 2,
58
+ "sample_rate": 16000
59
+ }
60
+ },
61
+ {
62
+ "type": "PolarityInversion",
63
+ "args": {
64
+ "p": 0.8,
65
+ "sample_rate": 16000
66
+ }
67
+ },
68
+ {
69
+ "type": "Shift",
70
+ "args": {
71
+ "p": 0.8,
72
+ "sample_rate": 16000
73
+ }
74
+ }
75
+ ],
76
+ "spectrogram": [
77
+ {
78
+ "type": "TimeMasking",
79
+ "args": {
80
+ "time_mask_param": 80,
81
+ "p": 0.05
82
+ }
83
+ },
84
+ {
85
+ "type": "FrequencyMasking",
86
+ "args": {
87
+ "freq_mask_param": 80
88
+ }
89
+ }
90
+ ]
91
+ },
92
+ "arch": {
93
+ "type": "DeepSpeech2Model",
94
+ "args": {
95
+ "n_feats": 256,
96
+ "n_rnn_layers": 6,
97
+ "rnn_hidden_size": 512,
98
+ "rnn_dropout": 0.2
99
+ }
100
+ },
101
+ "data": {
102
+ "test": {
103
+ "batch_size": 64,
104
+ "num_workers": 4,
105
+ "datasets": [
106
+ {
107
+ "type": "LibrispeechDataset",
108
+ "args": {
109
+ "part": "test-other"
110
+ }
111
+ }
112
+ ]
113
+ }
114
+ },
115
+ "optimizer": {
116
+ "type": "AdamW",
117
+ "args": {
118
+ "lr": 0.0003,
119
+ "weight_decay": 1e-05
120
+ }
121
+ },
122
+ "loss": {
123
+ "type": "CTCLoss",
124
+ "args": {}
125
+ },
126
+ "metrics": [
127
+ {
128
+ "type": "ArgmaxWERMetric",
129
+ "args": {
130
+ "name": "WER (argmax)"
131
+ }
132
+ },
133
+ {
134
+ "type": "ArgmaxCERMetric",
135
+ "args": {
136
+ "name": "CER (argmax)"
137
+ }
138
+ },
139
+ {
140
+ "type": "BeamSearchWERMetric",
141
+ "args": {
142
+ "beam_size": 4,
143
+ "name": "WER (beam search)"
144
+ }
145
+ },
146
+ {
147
+ "type": "BeamSearchCERMetric",
148
+ "args": {
149
+ "beam_size": 4,
150
+ "name": "CER (beam search)"
151
+ }
152
+ },
153
+ {
154
+ "type": "LanguageModelWERMetric",
155
+ "args": {
156
+ "name": "WER (LM)"
157
+ }
158
+ },
159
+ {
160
+ "type": "LanguageModelCERMetric",
161
+ "args": {
162
+ "name": "CER (LM)"
163
+ }
164
+ }
165
+ ],
166
+ "lr_scheduler": {
167
+ "type": "OneCycleLR",
168
+ "args": {
169
+ "steps_per_epoch": 1000,
170
+ "epochs": 50,
171
+ "anneal_strategy": "cos",
172
+ "max_lr": 0.0003,
173
+ "pct_start": 0.1
174
+ }
175
+ },
176
+ "trainer": {
177
+ "epochs": 50,
178
+ "save_dir": "saved/",
179
+ "save_period": 5,
180
+ "verbosity": 2,
181
+ "monitor": "min val_loss",
182
+ "early_stop": 100,
183
+ "visualize": "wandb",
184
+ "wandb_project": "asr_project",
185
+ "len_epoch": 1000,
186
+ "grad_norm_clip": 10
187
+ }
188
+ }
automatic-speech-recognition/default_test_model/config.json ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "default_config",
3
+ "n_gpu": 1,
4
+ "text_encoder": {
5
+ "type": "CTCCharTextEncoder",
6
+ "args": {
7
+ "kenlm_model_path": "hw_asr/text_encoder/lower_3-gram.arpa",
8
+ "unigrams_path": "hw_asr/text_encoder/librispeech-fixed-vocab.txt"
9
+ }
10
+ },
11
+ "preprocessing": {
12
+ "sr": 16000,
13
+ "spectrogram": {
14
+ "type": "MelSpectrogram",
15
+ "args": {
16
+ "n_mels": 256
17
+ }
18
+ },
19
+ "log_spec": true
20
+ },
21
+ "augmentations": {
22
+ "random_apply_p": 0.6,
23
+ "wave": [
24
+ {
25
+ "type": "AddColoredNoise",
26
+ "args": {
27
+ "p": 1,
28
+ "sample_rate": 16000
29
+ }
30
+ },
31
+ {
32
+ "type": "Gain",
33
+ "args": {
34
+ "p": 0.8,
35
+ "sample_rate": 16000
36
+ }
37
+ },
38
+ {
39
+ "type": "HighPassFilter",
40
+ "args": {
41
+ "p": 0,
42
+ "sample_rate": 16000
43
+ }
44
+ },
45
+ {
46
+ "type": "LowPassFilter",
47
+ "args": {
48
+ "p": 0,
49
+ "sample_rate": 16000
50
+ }
51
+ },
52
+ {
53
+ "type": "PitchShift",
54
+ "args": {
55
+ "p": 0.8,
56
+ "min_transpose_semitones": -2,
57
+ "max_transpose_semitones": 2,
58
+ "sample_rate": 16000
59
+ }
60
+ },
61
+ {
62
+ "type": "PolarityInversion",
63
+ "args": {
64
+ "p": 0.8,
65
+ "sample_rate": 16000
66
+ }
67
+ },
68
+ {
69
+ "type": "Shift",
70
+ "args": {
71
+ "p": 0.8,
72
+ "sample_rate": 16000
73
+ }
74
+ }
75
+ ],
76
+ "spectrogram": [
77
+ {
78
+ "type": "TimeMasking",
79
+ "args": {
80
+ "time_mask_param": 80,
81
+ "p": 0.05
82
+ }
83
+ },
84
+ {
85
+ "type": "FrequencyMasking",
86
+ "args": {
87
+ "freq_mask_param": 80
88
+ }
89
+ }
90
+ ]
91
+ },
92
+ "arch": {
93
+ "type": "DeepSpeech2Model",
94
+ "args": {
95
+ "n_feats": 256,
96
+ "n_rnn_layers": 6,
97
+ "rnn_hidden_size": 512,
98
+ "rnn_dropout": 0.2
99
+ }
100
+ },
101
+ "data": {
102
+ "train": {
103
+ "batch_size": 128,
104
+ "num_workers": 4,
105
+ "datasets": [
106
+ {
107
+ "type": "LibrispeechDataset",
108
+ "args": {
109
+ "part": "train-clean-100",
110
+ "max_audio_length": 40.0,
111
+ "max_text_length": 400
112
+ }
113
+ },
114
+ {
115
+ "type": "LibrispeechDataset",
116
+ "args": {
117
+ "part": "train-clean-360",
118
+ "max_audio_length": 40.0,
119
+ "max_text_length": 400
120
+ }
121
+ },
122
+ {
123
+ "type": "LibrispeechDataset",
124
+ "args": {
125
+ "part": "train-other-500",
126
+ "max_audio_length": 40.0,
127
+ "max_text_length": 400
128
+ }
129
+ }
130
+ ]
131
+ },
132
+ "val": {
133
+ "batch_size": 64,
134
+ "num_workers": 4,
135
+ "datasets": [
136
+ {
137
+ "type": "LibrispeechDataset",
138
+ "args": {
139
+ "part": "dev-clean"
140
+ }
141
+ }
142
+ ]
143
+ },
144
+ "test-other": {
145
+ "batch_size": 64,
146
+ "num_workers": 4,
147
+ "datasets": [
148
+ {
149
+ "type": "LibrispeechDataset",
150
+ "args": {
151
+ "part": "test-other"
152
+ }
153
+ }
154
+ ]
155
+ },
156
+ "test-clean": {
157
+ "batch_size": 64,
158
+ "num_workers": 4,
159
+ "datasets": [
160
+ {
161
+ "type": "LibrispeechDataset",
162
+ "args": {
163
+ "part": "test-clean"
164
+ }
165
+ }
166
+ ]
167
+ }
168
+ },
169
+ "optimizer": {
170
+ "type": "AdamW",
171
+ "args": {
172
+ "lr": 0.0003,
173
+ "weight_decay": 1e-05
174
+ }
175
+ },
176
+ "loss": {
177
+ "type": "CTCLoss",
178
+ "args": {}
179
+ },
180
+ "metrics": [
181
+ {
182
+ "type": "ArgmaxWERMetric",
183
+ "args": {
184
+ "name": "WER (argmax)"
185
+ }
186
+ },
187
+ {
188
+ "type": "ArgmaxCERMetric",
189
+ "args": {
190
+ "name": "CER (argmax)"
191
+ }
192
+ },
193
+ {
194
+ "type": "BeamSearchWERMetric",
195
+ "args": {
196
+ "beam_size": 4,
197
+ "name": "WER (beam search)"
198
+ }
199
+ },
200
+ {
201
+ "type": "BeamSearchCERMetric",
202
+ "args": {
203
+ "beam_size": 4,
204
+ "name": "CER (beam search)"
205
+ }
206
+ },
207
+ {
208
+ "type": "LanguageModelWERMetric",
209
+ "args": {
210
+ "name": "WER (LM)"
211
+ }
212
+ },
213
+ {
214
+ "type": "LanguageModelCERMetric",
215
+ "args": {
216
+ "name": "CER (LM)"
217
+ }
218
+ }
219
+ ],
220
+ "lr_scheduler": {
221
+ "type": "OneCycleLR",
222
+ "args": {
223
+ "steps_per_epoch": 1000,
224
+ "epochs": 50,
225
+ "anneal_strategy": "cos",
226
+ "max_lr": 0.0003,
227
+ "pct_start": 0.1
228
+ }
229
+ },
230
+ "trainer": {
231
+ "epochs": 50,
232
+ "save_dir": "saved/",
233
+ "save_period": 5,
234
+ "verbosity": 2,
235
+ "monitor": "min val_loss",
236
+ "early_stop": 100,
237
+ "visualize": "wandb",
238
+ "wandb_project": "asr_project",
239
+ "len_epoch": 1000,
240
+ "grad_norm_clip": 10
241
+ }
242
+ }
automatic-speech-recognition/hw_asr/__init__.py ADDED
File without changes
automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (150 Bytes). View file
 
automatic-speech-recognition/hw_asr/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (166 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/__init__.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from collections.abc import Callable
from typing import List

import hw_asr.augmentations.spectrogram_augmentations
import hw_asr.augmentations.wave_augmentations
from hw_asr.augmentations.random_choice import RandomChoice
from hw_asr.augmentations.sequential_random_apply import SequentialRandomApply
from hw_asr.utils.parse_config import ConfigParser


def from_configs(configs: ConfigParser):
    """Build the (wave, spectrogram) augmentation callables from a config.

    Reads the optional "augmentations" section of the config. Wave
    augmentations are combined with RandomChoice (apply one of them with
    probability ``random_apply_p``); spectrogram augmentations are combined
    with SequentialRandomApply (each applied independently with probability
    ``random_apply_p``).

    Returns a tuple ``(wave_aug, spec_aug)``; either element is None when no
    augmentations of that kind are configured.
    """
    # Fix: the original accessed configs.config["augmentations"]["random_apply_p"]
    # unconditionally, raising KeyError for configs without an "augmentations"
    # section even though the list-building code above it was guarded.
    aug_config = (
        configs.config["augmentations"] if "augmentations" in configs.config else {}
    )

    wave_augs = []
    if "wave" in aug_config:
        for aug_dict in aug_config["wave"]:
            wave_augs.append(
                configs.init_obj(aug_dict, hw_asr.augmentations.wave_augmentations)
            )

    spec_augs = []
    if "spectrogram" in aug_config:
        for aug_dict in aug_config["spectrogram"]:
            spec_augs.append(
                configs.init_obj(aug_dict, hw_asr.augmentations.spectrogram_augmentations)
            )

    # Default to 0.0 (never apply) when the probability is omitted; with empty
    # augmentation lists the value is unused anyway because _to_function
    # returns None.
    p = aug_config["random_apply_p"] if "random_apply_p" in aug_config else 0.0
    return (
        _to_function(RandomChoice, wave_augs, p),
        _to_function(SequentialRandomApply, spec_augs, p),
    )


def _to_function(random_type, augs_list: List[Callable], p: float):
    """Collapse a list of augmentations into a single callable.

    Returns None for an empty list, the augmentation itself for a single
    element, and ``random_type(augs_list, p)`` otherwise.
    """
    if not augs_list:
        return None
    if len(augs_list) == 1:
        return augs_list[0]
    return random_type(augs_list, p)
automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.35 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.36 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-310.pyc ADDED
Binary file (541 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/base.cpython-311.pyc ADDED
Binary file (710 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-310.pyc ADDED
Binary file (848 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_apply.cpython-311.pyc ADDED
Binary file (1.22 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-310.pyc ADDED
Binary file (954 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/random_choice.cpython-311.pyc ADDED
Binary file (1.36 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-310.pyc ADDED
Binary file (895 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential.cpython-311.pyc ADDED
Binary file (1.19 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/__pycache__/sequential_random_apply.cpython-311.pyc ADDED
Binary file (1.38 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/base.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
from torch import Tensor


class AugmentationBase:
    """Interface for audio/spectrogram augmentations: a callable Tensor -> Tensor.

    Subclasses must override __call__.
    """

    def __call__(self, data: Tensor) -> Tensor:
        raise NotImplementedError()
automatic-speech-recognition/hw_asr/augmentations/random_apply.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import random
from typing import Callable
from torch import Tensor


class RandomApply:
    """Apply a wrapped augmentation with probability ``p``; otherwise pass through."""

    def __init__(self, augmentation: Callable, p: float):
        # p must be a valid probability.
        assert 0 <= p <= 1
        self.augmentation = augmentation
        self.p = p

    def __call__(self, data: Tensor) -> Tensor:
        apply_now = random.random() < self.p
        return self.augmentation(data) if apply_now else data
automatic-speech-recognition/hw_asr/augmentations/random_choice.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List, Callable
from torch import Tensor
import random
from hw_asr.augmentations.base import AugmentationBase


class RandomChoice(AugmentationBase):
    """With probability ``p``, apply one augmentation drawn uniformly from the list."""

    def __init__(self, augmentation_list: List[Callable], p: float):
        self.augmentation_list = augmentation_list
        self.p = p

    def __call__(self, data: Tensor) -> Tensor:
        if random.random() >= self.p:
            return data
        chosen = random.choice(self.augmentation_list)
        return chosen(data)
automatic-speech-recognition/hw_asr/augmentations/sequential.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List, Callable

from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class SequentialAugmentation(AugmentationBase):
    """Apply every augmentation in the list, in order, feeding each the previous output."""

    def __init__(self, augmentation_list: List[Callable]):
        self.augmentation_list = augmentation_list

    def __call__(self, data: Tensor) -> Tensor:
        result = data
        for aug in self.augmentation_list:
            result = aug(result)
        return result
automatic-speech-recognition/hw_asr/augmentations/sequential_random_apply.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List, Callable
from torch import Tensor
import random
from hw_asr.augmentations.base import AugmentationBase


class SequentialRandomApply(AugmentationBase):
    """Walk the augmentation list in order, applying each one independently with probability ``p``."""

    def __init__(self, augmentation_list: List[Callable], p: float = 0.5):
        self.augmentation_list = augmentation_list
        self.p = p

    def __call__(self, data: Tensor) -> Tensor:
        result = data
        for aug in self.augmentation_list:
            # One independent Bernoulli draw per augmentation.
            if random.random() < self.p:
                result = aug(result)
        return result
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/FrequencyMasking.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
from torch import Tensor
from hw_asr.augmentations.base import AugmentationBase
from torchaudio import transforms


class FrequencyMasking(AugmentationBase):
    """SpecAugment frequency masking, delegating to torchaudio.transforms.FrequencyMasking.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = transforms.FrequencyMasking(*args, **kwargs)

    def __call__(self, spectogram: Tensor):
        # NOTE(review): .squeeze(1) mirrors the wave augmentations; confirm the
        # incoming spectrogram really carries a singleton dim 1 here.
        return self._aug(spectogram).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/TimeMasking.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
from torch import Tensor
from hw_asr.augmentations.base import AugmentationBase
from torchaudio import transforms


class TimeMasking(AugmentationBase):
    """SpecAugment time masking, delegating to torchaudio.transforms.TimeMasking.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = transforms.TimeMasking(*args, **kwargs)

    def __call__(self, spectogram: Tensor):
        # NOTE(review): .squeeze(1) mirrors the wave augmentations; confirm the
        # incoming spectrogram really carries a singleton dim 1 here.
        return self._aug(spectogram).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Re-export spectrogram-level augmentations so config "type" names resolve
# against this package (see hw_asr.augmentations.from_configs).
from hw_asr.augmentations.spectrogram_augmentations.TimeMasking import TimeMasking
from hw_asr.augmentations.spectrogram_augmentations.FrequencyMasking import FrequencyMasking

__all__ = [
    "TimeMasking",
    "FrequencyMasking"
]
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/FrequencyMasking.cpython-311.pyc ADDED
Binary file (1.19 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/TimeMasking.cpython-311.pyc ADDED
Binary file (1.17 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (190 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/spectrogram_augmentations/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (474 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/AddColoredNoise.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class AddColoredNoise(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.AddColoredNoise.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.AddColoredNoise(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Gain.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class Gain(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.Gain.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.Gain(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/HighPassFilter.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class HighPassFilter(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.HighPassFilter.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.HighPassFilter(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/LowPassFilter.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class LowPassFilter(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.LowPassFilter.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.LowPassFilter(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Padding.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class Padding(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.Padding.

    All constructor arguments are forwarded to the underlying transform.
    Currently disabled in wave_augmentations/__init__.py.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.Padding(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PitchShift.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class PitchShift(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.PitchShift.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.PitchShift(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/PolarityInversion.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class PolarityInversion(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.PolarityInversion.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.PolarityInversion(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/Shift.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch_audiomentations
from torch import Tensor

from hw_asr.augmentations.base import AugmentationBase


class Shift(AugmentationBase):
    """Waveform augmentation delegating to torch_audiomentations.Shift.

    All constructor arguments are forwarded to the underlying transform.
    """

    def __init__(self, *args, **kwargs):
        self._aug = torch_audiomentations.Shift(*args, **kwargs)

    def __call__(self, data: Tensor):
        # torch_audiomentations expects a channel dim: insert it, augment,
        # then drop it again.
        return self._aug(data.unsqueeze(1)).squeeze(1)
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Re-export wave-level augmentations so config "type" names resolve against
# this package (see hw_asr.augmentations.from_configs).
from hw_asr.augmentations.wave_augmentations.AddColoredNoise import AddColoredNoise
from hw_asr.augmentations.wave_augmentations.Gain import Gain
from hw_asr.augmentations.wave_augmentations.HighPassFilter import HighPassFilter
from hw_asr.augmentations.wave_augmentations.LowPassFilter import LowPassFilter
# from hw_asr.augmentations.wave_augmentations.Padding import Padding
from hw_asr.augmentations.wave_augmentations.PitchShift import PitchShift
from hw_asr.augmentations.wave_augmentations.PolarityInversion import PolarityInversion
from hw_asr.augmentations.wave_augmentations.Shift import Shift

__all__ = [
    "AddColoredNoise",
    "Gain",
    # Fix: a missing comma here previously concatenated the two strings into
    # "HighPassFilterLowPassFilter", silently dropping both names from __all__.
    "HighPassFilter",
    "LowPassFilter",
    # "Padding",
    "PitchShift",
    "PolarityInversion",
    "Shift",
]
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-310.pyc ADDED
Binary file (863 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/AddColoredNoise.cpython-311.pyc ADDED
Binary file (1.23 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-310.pyc ADDED
Binary file (819 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/Gain.cpython-311.pyc ADDED
Binary file (1.19 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-310.pyc ADDED
Binary file (859 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/HighPassFilter.cpython-311.pyc ADDED
Binary file (1.23 kB). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-310.pyc ADDED
Binary file (855 Bytes). View file
 
automatic-speech-recognition/hw_asr/augmentations/wave_augmentations/__pycache__/LowPassFilter.cpython-311.pyc ADDED
Binary file (1.22 kB). View file